diff --git a/.gitignore b/.gitignore index 1a87853fd635faaaab57453deb82fd1dc2410712..b7b760524ac57fc0a49b62498405c340cec3bb55 100644 --- a/.gitignore +++ b/.gitignore @@ -12,6 +12,9 @@ .vs/ .vscode/ +lcov-report/ +gcovr-report/ + build*/ out/ tmp/ @@ -19,24 +22,34 @@ tmp/ models/* models-mnt -/main -/quantize -/quantize-stats -/result -/perplexity -/embedding -/train-text-from-scratch -/convert-llama2c-to-ggml -/simple -/benchmark-matmult -/vdot -/server /Pipfile +/baby-llama +/beam-search +/benchmark-matmult +/convert-llama2c-to-ggml /embd-input-test +/embedding /gguf /gguf-llama-simple /libllama.so /llama-bench +/main +/metal +/perplexity +/q8dot +/quantize +/quantize-stats +/result +/save-load-state +/server +/simple +/batched +/export-lora +/finetune +/speculative +/parallel +/train-text-from-scratch +/vdot build-info.h arm_neon.h compile_commands.json diff --git a/CMakeLists.txt b/CMakeLists.txt index 6efba527da8cd0397a0f9c7679400dac84bf0514..969ed2f2c9356394c482443a8382eff0469d32c6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -197,37 +197,38 @@ endif() if (LLAMA_ALL_WARNINGS) if (NOT MSVC) - set(c_flags - -Wall - -Wextra - -Wpedantic - -Wcast-qual - -Wdouble-promotion - -Wshadow - -Wstrict-prototypes - -Wpointer-arith - -Wmissing-prototypes - -Werror=implicit-int - -Wno-unused-function - ) - set(cxx_flags - -Wall - -Wextra - -Wpedantic - -Wcast-qual - -Wmissing-declarations - -Wno-unused-function - -Wno-multichar - ) - if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") - # g++ only - set(cxx_flags ${cxx_flags} -Wno-format-truncation -Wno-array-bounds) + set(warning_flags -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function) + set(c_flags -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int + -Werror=implicit-function-declaration) + set(cxx_flags -Wmissing-declarations -Wmissing-noreturn) + + if (CMAKE_C_COMPILER_ID MATCHES "Clang") + set(warning_flags ${warning_flags} -Wunreachable-code-break -Wunreachable-code-return) + set(cxx_flags ${cxx_flags} -Wmissing-prototypes -Wextra-semi) + + if ( + (CMAKE_C_COMPILER_ID STREQUAL "Clang" AND CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL 3.8.0) OR + (CMAKE_C_COMPILER_ID STREQUAL "AppleClang" AND CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL 7.3.0) + ) + set(c_flags ${c_flags} -Wdouble-promotion) + endif() + elseif (CMAKE_C_COMPILER_ID STREQUAL "GNU") + set(c_flags ${c_flags} -Wdouble-promotion) + set(cxx_flags ${cxx_flags} -Wno-array-bounds) + + if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 7.1.0) + set(cxx_flags ${cxx_flags} -Wno-format-truncation) + endif() + if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 8.1.0) + set(cxx_flags ${cxx_flags} -Wextra-semi) + endif() endif() else() # todo : msvc endif() add_compile_options( + ${warning_flags} "$<$:${c_flags}>" "$<$:${cxx_flags}>" ) diff --git a/Dockerfile b/Dockerfile index 19a2fa619ddee48c698a05fdbda9d2df91a83ea3..f5c10fe618715a91cf3d2b130deced39ce92856e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -4,8 +4,8 @@ COPY . . RUN apt update \ && apt install build-essential wget libopenblas-dev make -y \ && make LLAMA_OPENBLAS=1 \ - && wget https://huggingface.co/TheBloke/Pygmalion-2-7B-GGUF/resolve/main/pygmalion-2-7b.Q4_0.gguf \ + && wget https://huggingface.co/notstoic/pygmalion-13b-ggml/resolve/main/pygmalion-13b-ggml-q4_0.bin \ && apt remove build-essential wget make -y \ - && rm -fr *.bat convert-* ci docs examples otherarchs tests + && rm -fr *.bat convert-* ci docs examples otherarchs tests - ENTRYPOINT ["python", "koboldcpp.py", "pygmalion-2-7b.Q4_0.gguf", "--port", "7860"] \ No newline at end of file + ENTRYPOINT ["python", "koboldcpp.py", "pygmalion-13b-ggml-q4_0.bin", "--port", "7860", "--smartcontext", "--stream"] \ No newline at end of file diff --git a/Makefile b/Makefile index 440ff611279f4213ae3f555563a423e980ee42c6..695a943eafb3708e79a8d0f516f7c9e05878db75 100644 --- a/Makefile +++ b/Makefile @@ -39,10 +39,15 @@ endif # # keep standard at C11 and C++11 -CFLAGS = -I. -I./include -I./include/CL -I./otherarch -I./otherarch/tools -Ofast -DNDEBUG -std=c11 -fPIC -DGGML_USE_K_QUANTS -DLOG_DISABLE_LOGS -D_GNU_SOURCE -CXXFLAGS = -I. -I./common -I./include -I./include/CL -I./otherarch -I./otherarch/tools -Ofast -DNDEBUG -std=c++11 -fPIC -DGGML_USE_K_QUANTS -DLOG_DISABLE_LOGS -D_GNU_SOURCE +CFLAGS = -I. -I./include -I./include/CL -I./otherarch -I./otherarch/tools -Ofast -DNDEBUG -std=c11 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE +CXXFLAGS = -I. -I./common -I./include -I./include/CL -I./otherarch -I./otherarch/tools -Ofast -DNDEBUG -std=c++11 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE LDFLAGS = +ifndef LLAMA_NO_K_QUANTS +CFLAGS += -DGGML_USE_K_QUANTS +CXXFLAGS += -DGGML_USE_K_QUANTS +endif + # these are used on windows, to build some libraries with extra old device compatibility SIMPLECFLAGS = FULLCFLAGS = @@ -285,19 +290,17 @@ ifeq ($(OS),Windows_NT) endif else DEFAULT_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o $@.so $(LDFLAGS) - FAILSAFE_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o $@.so $(LDFLAGS) + ifdef LLAMA_OPENBLAS OPENBLAS_BUILD = $(CXX) $(CXXFLAGS) $^ $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS) - NOAVX2_BUILD = $(CXX) $(CXXFLAGS) $^ $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS) endif ifdef LLAMA_CLBLAST ifeq ($(UNAME_S),Darwin) - CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ -lclblast -framework OpenCL $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS) + CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ -lclblast -framework OpenCL $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS) else - CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ -lclblast -lOpenCL $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS) + CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ -lclblast -lOpenCL $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS) endif endif - ifdef LLAMA_CUBLAS CUBLAS_BUILD = $(CXX) $(CXXFLAGS) $(CUBLAS_FLAGS) $^ -shared -o $@.so $(CUBLASLD_FLAGS) $(LDFLAGS) endif @@ -351,12 +354,20 @@ ggml_cublas.o: ggml.c ggml.h ggml-cuda.h k_quants.h $(CC) $(CFLAGS) $(FULLCFLAGS) $(CUBLAS_FLAGS) $(HIPFLAGS) -c $< -o $@ #quants K +KQ1 = +KQ2 = +KQ3 = +ifndef LLAMA_NO_K_QUANTS +KQ1 = k_quants.o +KQ2 = k_quants_noavx2.o +KQ3 = k_quants_failsafe.o k_quants.o: k_quants.c k_quants.h ggml.h ggml-cuda.h $(CC) $(CFLAGS) $(FULLCFLAGS) -c $< -o $@ k_quants_noavx2.o: k_quants.c k_quants.h ggml.h ggml-cuda.h $(CC) $(CFLAGS) $(SIMPLECFLAGS) -c $< -o $@ k_quants_failsafe.o: k_quants.c k_quants.h ggml.h ggml-cuda.h $(CC) $(CFLAGS) $(NONECFLAGS) -c $< -o $@ +endif # LLAMA_NO_K_QUANTS #there's no intrinsics or special gpu ops used here, so we can have a universal object ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h @@ -416,7 +427,7 @@ gpttype_adapter_cublas.o: $(GPTTYPE_ADAPTER) clean: rm -vf *.o main quantize_llama quantize_gpt2 quantize_gptj quantize_neox quantize_mpt quantize-stats perplexity embedding benchmark-matmult save-load-state gguf gguf.exe main.exe quantize_llama.exe quantize_gptj.exe quantize_gpt2.exe quantize_neox.exe quantize_mpt.exe koboldcpp_default.dll koboldcpp_openblas.dll koboldcpp_failsafe.dll koboldcpp_noavx2.dll koboldcpp_clblast.dll koboldcpp_cublas.dll koboldcpp_hipblas.dll koboldcpp_default.so koboldcpp_openblas.so koboldcpp_failsafe.so koboldcpp_noavx2.so koboldcpp_clblast.so koboldcpp_cublas.so koboldcpp_hipblas.so -main: examples/main/main.cpp build-info.h ggml.o k_quants.o ggml-alloc.o llama.o common.o console.o grammar-parser.o $(OBJS) +main: examples/main/main.cpp build-info.h ggml.o $(KQ1) ggml-alloc.o llama.o common.o console.o grammar-parser.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) @echo @echo '==== Run ./main -h for help. ====' @@ -425,31 +436,69 @@ main: examples/main/main.cpp build-info.h ggml.o k_quants.o ggml-alloc.o llama.o gguf: examples/gguf/gguf.cpp build-info.h ggml.o llama.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + #generated libraries -koboldcpp_default: ggml.o ggml_v2.o ggml_v1.o expose.o common.o gpttype_adapter.o k_quants.o ggml-alloc.o grammar-parser.o $(OBJS) +koboldcpp_default: ggml.o ggml_v2.o ggml_v1.o expose.o common.o gpttype_adapter.o $(KQ1) ggml-alloc.o grammar-parser.o $(OBJS) $(DEFAULT_BUILD) -koboldcpp_openblas: ggml_openblas.o ggml_v2_openblas.o ggml_v1.o expose.o common.o gpttype_adapter.o k_quants.o ggml-alloc.o grammar-parser.o $(OBJS) + +ifdef OPENBLAS_BUILD +koboldcpp_openblas: ggml_openblas.o ggml_v2_openblas.o ggml_v1.o expose.o common.o gpttype_adapter.o $(KQ1) ggml-alloc.o grammar-parser.o $(OBJS) $(OPENBLAS_BUILD) -koboldcpp_failsafe: ggml_failsafe.o ggml_v2_failsafe.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_failsafe.o k_quants_failsafe.o ggml-alloc.o grammar-parser.o $(OBJS) +else +koboldcpp_openblas: + $(DONOTHING) +endif + +ifdef FAILSAFE_BUILD +koboldcpp_failsafe: ggml_failsafe.o ggml_v2_failsafe.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_failsafe.o $(KQ3) ggml-alloc.o grammar-parser.o $(OBJS) $(FAILSAFE_BUILD) -koboldcpp_noavx2: ggml_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_failsafe.o k_quants_noavx2.o ggml-alloc.o grammar-parser.o $(OBJS) +else +koboldcpp_failsafe: + $(DONOTHING) +endif + +ifdef NOAVX2_BUILD +koboldcpp_noavx2: ggml_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_failsafe.o $(KQ2) ggml-alloc.o grammar-parser.o $(OBJS) $(NOAVX2_BUILD) -koboldcpp_clblast: ggml_clblast.o ggml_v2_clblast.o ggml_v1.o expose.o common.o gpttype_adapter_clblast.o ggml-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o k_quants.o ggml-alloc.o grammar-parser.o $(OBJS) +else +koboldcpp_noavx2: + $(DONOTHING) +endif + +ifdef CLBLAST_BUILD +koboldcpp_clblast: ggml_clblast.o ggml_v2_clblast.o ggml_v1.o expose.o common.o gpttype_adapter_clblast.o ggml-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o $(KQ1) ggml-alloc.o grammar-parser.o $(OBJS) $(CLBLAST_BUILD) -koboldcpp_cublas: ggml_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o common.o gpttype_adapter_cublas.o k_quants.o ggml-alloc.o grammar-parser.o $(CUBLAS_OBJS) $(OBJS) +else +koboldcpp_clblast: + $(DONOTHING) +endif + +ifdef CUBLAS_BUILD +koboldcpp_cublas: ggml_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o common.o gpttype_adapter_cublas.o $(KQ1) ggml-alloc.o grammar-parser.o $(CUBLAS_OBJS) $(OBJS) $(CUBLAS_BUILD) -koboldcpp_hipblas: ggml_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o common.o gpttype_adapter_cublas.o k_quants.o ggml-alloc.o grammar-parser.o $(HIP_OBJS) $(OBJS) +else +koboldcpp_cublas: + $(DONOTHING) +endif + +ifdef HIPBLAS_BUILD +koboldcpp_hipblas: ggml_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o common.o gpttype_adapter_cublas.o $(KQ1) ggml-alloc.o grammar-parser.o $(HIP_OBJS) $(OBJS) $(HIPBLAS_BUILD) +else +koboldcpp_hipblas: + $(DONOTHING) +endif -quantize_llama: examples/quantize/quantize.cpp ggml.o llama.o k_quants.o ggml-alloc.o +# tools +quantize_llama: examples/quantize/quantize.cpp ggml.o llama.o $(KQ1) ggml-alloc.o $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) -quantize_gptj: ggml.o llama.o k_quants.o ggml-alloc.o otherarch/tools/gptj_quantize.cpp otherarch/tools/common-ggml.cpp +quantize_gptj: ggml.o llama.o $(KQ1) ggml-alloc.o otherarch/tools/gptj_quantize.cpp otherarch/tools/common-ggml.cpp $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) -quantize_gpt2: ggml.o llama.o k_quants.o ggml-alloc.o otherarch/tools/gpt2_quantize.cpp otherarch/tools/common-ggml.cpp +quantize_gpt2: ggml.o llama.o $(KQ1) ggml-alloc.o otherarch/tools/gpt2_quantize.cpp otherarch/tools/common-ggml.cpp $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) -quantize_neox: ggml.o llama.o k_quants.o ggml-alloc.o otherarch/tools/neox_quantize.cpp otherarch/tools/common-ggml.cpp +quantize_neox: ggml.o llama.o $(KQ1) ggml-alloc.o otherarch/tools/neox_quantize.cpp otherarch/tools/common-ggml.cpp $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) -quantize_mpt: ggml.o llama.o k_quants.o ggml-alloc.o otherarch/tools/mpt_quantize.cpp otherarch/tools/common-ggml.cpp +quantize_mpt: ggml.o llama.o $(KQ1) ggml-alloc.o otherarch/tools/mpt_quantize.cpp otherarch/tools/common-ggml.cpp $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) diff --git a/Package.swift b/Package.swift index fb95ef7ebc59f05ffa63c48e74eefd36f098fe65..5fbcdb9db9d42941091ac87651007fdd5a829daf 100644 --- a/Package.swift +++ b/Package.swift @@ -10,7 +10,7 @@ let platforms: [SupportedPlatform]? = [ .tvOS(.v14) ] let exclude: [String] = [] -let additionalSources: [String] = ["ggml-metal.m"] +let additionalSources: [String] = ["ggml-metal.m", "ggml-metal.metal"] let additionalSettings: [CSetting] = [ .unsafeFlags(["-fno-objc-arc"]), .define("GGML_SWIFT"), @@ -44,7 +44,9 @@ let package = Package( cSettings: [ .unsafeFlags(["-Wno-shorten-64-to-32"]), .define("GGML_USE_K_QUANTS"), - .define("GGML_USE_ACCELERATE") + .define("GGML_USE_ACCELERATE"), + .define("ACCELERATE_NEW_LAPACK"), + .define("ACCELERATE_LAPACK_ILP64") ] + additionalSettings, linkerSettings: [ .linkedFramework("Accelerate") diff --git a/README.md b/README.md index 77ffe1d11702f82883809a3e0f3a20082140398f..9422d3ac9ad8f39dd3ee92f01a54fb0fa8acad8c 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,84 @@ ---- -sdk: docker -emoji: 🚀 -colorFrom: yellow -colorTo: blue ---- \ No newline at end of file +# koboldcpp + +KoboldCpp is an easy-to-use AI text-generation software for GGML models. It's a single self contained distributable from Concedo, that builds off llama.cpp, and adds a versatile Kobold API endpoint, additional format support, backward compatibility, as well as a fancy UI with persistent stories, editing tools, save formats, memory, world info, author's note, characters, scenarios and everything Kobold and Kobold Lite have to offer. + +![Preview](media/preview.png) + +## Usage +- **[Download the latest .exe release here](https://github.com/LostRuins/koboldcpp/releases/latest)** or clone the git repo. +- Windows binaries are provided in the form of **koboldcpp.exe**, which is a pyinstaller wrapper for a few **.dll** files and **koboldcpp.py**. If you feel concerned, you may prefer to rebuild it yourself with the provided makefiles and scripts. +- Weights are not included, you can use the official llama.cpp `quantize.exe` to generate them from your official weight files (or download them from other places such as [TheBloke's Huggingface](https://huggingface.co/TheBloke). +- To run, execute **koboldcpp.exe** or drag and drop your quantized `ggml_model.bin` file onto the .exe, and then connect with Kobold or Kobold Lite. If you're not on windows, then run the script **KoboldCpp.py** after compiling the libraries. +- Launching with no command line arguments displays a GUI containing a subset of configurable settings. Generally you dont have to change much besides the `Presets` and `GPU Layers`. Read the `--help` for more info about each settings. +- By default, you can connect to http://localhost:5001 +- You can also run it using the command line `koboldcpp.exe [ggml_model.bin] [port]`. For info, please check `koboldcpp.exe --help` +- Default context size to small? Try `--contextsize 3072` to 1.5x your context size! without much perplexity gain. Note that you'll have to increase the max context in the Kobold Lite UI as well (click and edit the number text field). +- Big context too slow? Try the `--smartcontext` flag to reduce prompt processing frequency. Also, you can try to run with your GPU using CLBlast, with `--useclblast` flag for a speedup +- Want even more speedup? Combine `--useclblast` with `--gpulayers` to offload entire layers to the GPU! **Much faster, but uses more VRAM**. Experiment to determine number of layers to offload, and reduce by a few if you run out of memory. +- If you are having crashes or issues, you can try turning off BLAS with the `--noblas` flag. You can also try running in a non-avx2 compatibility mode with `--noavx2`. Lastly, you can try turning off mmap with `--nommap`. + +For more information, be sure to run the program with the `--help` flag. + +## OSX and Linux +- You will have to compile your binaries from source. A makefile is provided, simply run `make` +- If you want you can also link your own install of OpenBLAS manually with `make LLAMA_OPENBLAS=1` +- Alternatively, if you want you can also link your own install of CLBlast manually with `make LLAMA_CLBLAST=1`, for this you will need to obtain and link OpenCL and CLBlast libraries. + - For Arch Linux: Install `cblas` `openblas` and `clblast`. + - For Debian: Install `libclblast-dev` and `libopenblas-dev`. +- For a full featured build, do `make LLAMA_OPENBLAS=1 LLAMA_CLBLAST=1 LLAMA_CUBLAS=1` +- After all binaries are built, you can run the python script with the command `koboldcpp.py [ggml_model.bin] [port]` +- Note: Many OSX users have found that the using Accelerate is actually faster than OpenBLAS. To try, you may wish to run with `--noblas` and compare speeds. + +## Compiling on Windows +- You're encouraged to use the .exe released, but if you want to compile your binaries from source at Windows, the easiest way is: + - Use the latest release of w64devkit (https://github.com/skeeto/w64devkit). Be sure to use the "vanilla one", not i686 or other different stuff. If you try they will conflit with the precompiled libs! + - Make sure you are using the w64devkit integrated terminal, then run 'make' at the KoboldCpp source folder. This will create the .dll files. + - If you want to generate the .exe file, make sure you have the python module PyInstaller installed with pip ('pip install PyInstaller'). + - Run the script make_pyinstaller.bat at a regular terminal (or Windows Explorer). + - The koboldcpp.exe file will be at your dist folder. +- If you wish to use your own version of the additional Windows libraries (OpenCL, CLBlast and OpenBLAS), you can do it with: + - OpenCL - tested with https://github.com/KhronosGroup/OpenCL-SDK . If you wish to compile it, follow the repository instructions. You will need vcpkg. + - CLBlast - tested with https://github.com/CNugteren/CLBlast . If you wish to compile it you will need to reference the OpenCL files. It will only generate the ".lib" file if you compile using MSVC. + - OpenBLAS - tested with https://github.com/xianyi/OpenBLAS . + - Move the respectives .lib files to the /lib folder of your project, overwriting the older files. + - Also, replace the existing versions of the corresponding .dll files located in the project directory root (e.g. libopenblas.dll). + - Make the KoboldCPP project using the instructions above. + +## Android (Termux) Alternative method +- See https://github.com/ggerganov/llama.cpp/pull/1828/files + +## Using CuBLAS +- If you're on Windows with an Nvidia GPU you can get CUDA support out of the box using the `--usecublas` flag, make sure you select the correct .exe with CUDA support. +- You can attempt a CuBLAS build with `LLAMA_CUBLAS=1` or using the provided CMake file (best for visual studio users). If you use the CMake file to build, copy the `koboldcpp_cublas.dll` generated into the same directory as the `koboldcpp.py` file. If you are bundling executables, you may need to include CUDA dynamic libraries (such as `cublasLt64_11.dll` and `cublas64_11.dll`) in order for the executable to work correctly on a different PC. + +## AMD +- Please check out https://github.com/YellowRoseCx/koboldcpp-rocm + +## Questions and Help +- **First, please check out [The KoboldCpp FAQ and Knowledgebase](https://github.com/LostRuins/koboldcpp/wiki) which may already have answers to your questions! Also please search through past issues and discussions.** +- If you cannot find an answer, open an issue on this github, or find us on the [KoboldAI Discord](https://koboldai.org/discord). + +## Considerations +- For Windows: No installation, single file executable, (It Just Works) +- Since v1.0.6, requires libopenblas, the prebuilt windows binaries are included in this repo. If not found, it will fall back to a mode without BLAS. +- Since v1.15, requires CLBlast if enabled, the prebuilt windows binaries are included in this repo. If not found, it will fall back to a mode without CLBlast. +- Since v1.33, you can set the context size to be above what the model supports officially. It does increases perplexity but should still work well below 4096 even on untuned models. (For GPT-NeoX, GPT-J, and LLAMA models) Customize this with `--ropeconfig`. +- **I plan to keep backwards compatibility with ALL past llama.cpp AND alpaca.cpp models**. But you are also encouraged to reconvert/update your models if possible for best results. + +## License +- The original GGML library and llama.cpp by ggerganov are licensed under the MIT License +- However, Kobold Lite is licensed under the AGPL v3.0 License +- The other files are also under the AGPL v3.0 License unless otherwise stated + +## Notes +- Generation delay scales linearly with original prompt length. If OpenBLAS is enabled then prompt ingestion becomes about 2-3x faster. This is automatic on windows, but will require linking on OSX and Linux. CLBlast speeds this up even further, and `--gpulayers` + `--useclblast` or `--usecublas` more so. +- I have heard of someone claiming a false AV positive report. The exe is a simple pyinstaller bundle that includes the necessary python scripts and dlls to run. If this still concerns you, you might wish to rebuild everything from source code using the makefile, and you can rebuild the exe yourself with pyinstaller by using `make_pyinstaller.bat` +- Supported GGML models (Includes backward compatibility for older versions/legacy GGML models, though some newer features might be unavailable): + - LLAMA and LLAMA2 (LLaMA / Alpaca / GPT4All / Vicuna / Koala / Pygmalion 7B / Metharme 7B / WizardLM and many more) + - GPT-2 / Cerebras + - GPT-J + - RWKV + - GPT-NeoX / Pythia / StableLM / Dolly / RedPajama + - MPT models + - Falcon (GGUF only) + diff --git a/Remote-Link.cmd b/Remote-Link.cmd index c1c0d24fbc45b08bc46c50816daade9ee9ced3bd..a7c1f11096c1afb0f48c3ee38407902398e5f99e 100644 --- a/Remote-Link.cmd +++ b/Remote-Link.cmd @@ -1,2 +1,18 @@ -curl -L https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-windows-amd64.exe -o cloudflared.exe -cloudflared.exe tunnel --url localhost:5001 \ No newline at end of file +: # This script will help setup a cloudflared tunnel for accessing KoboldCpp over the internet +: # It should work out of the box on both linux and windows +: # ====== +: # WINDOWS PORTION +:<\"Open" - ] + "source": [] }, { "cell_type": "code", @@ -40,7 +38,7 @@ "source": [ "#@title v-- Enter your model below and then click this to start Koboldcpp\n", "\n", - "Model = \"https://huggingface.co/TheBloke/airoboros-l2-13B-gpt4-1.4.1-GGML/resolve/main/airoboros-l2-13b-gpt4-1.4.1.ggmlv3.q4_0.bin\" #@param [\"\"]{allow-input: true}\n", + "Model = \"https://huggingface.co/TheBloke/Airoboros-L2-13B-2.2-GGUF/resolve/main/airoboros-l2-13b-2.2.Q4_K_M.gguf\" #@param [\"\"]{allow-input: true}\n", "Layers = 43 #@param [43]{allow-input: true}\n", "\n", "%cd /content\n", @@ -54,7 +52,7 @@ "!nohup ./cloudflared-linux-amd64 tunnel --url http://localhost:5001 &\n", "!sleep 10\n", "!cat nohup.out\n", - "!python koboldcpp.py model.ggml --stream --usecublas 0 --gpulayers $Layers --hordeconfig concedo\n" + "!python koboldcpp.py model.ggml --stream --usecublas 0 mmq --gpulayers $Layers --hordeconfig concedo\n" ] } ] diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index dead56118bac828008c0c1e3261b31c1b3d4666a..951aa8340c7e4bc12675a41f4a2a5829ead810f2 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -9,6 +9,8 @@ add_library(${TARGET} OBJECT console.cpp grammar-parser.h grammar-parser.cpp + train.h + train.cpp ) if (BUILD_SHARED_LIBS) diff --git a/common/common.cpp b/common/common.cpp index 2597ba06aee16584c3591250effc438b743b895b..ec181c6b3b61a81ff9a5de852c7bd09f483c7b04 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -78,7 +78,7 @@ int32_t get_num_physical_cores() { return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4; } -static void process_escapes(std::string& input) { +void process_escapes(std::string& input) { std::size_t input_len = input.length(); std::size_t output_idx = 0; @@ -129,6 +129,15 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { if (params.n_threads <= 0) { params.n_threads = std::thread::hardware_concurrency(); } + } else if (arg == "-tb" || arg == "--threads-batch") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_threads_batch = std::stoi(argv[i]); + if (params.n_threads_batch <= 0) { + params.n_threads_batch = std::thread::hardware_concurrency(); + } } else if (arg == "-p" || arg == "--prompt") { if (++i >= argc) { invalid_param = true; @@ -317,6 +326,18 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { break; } params.n_chunks = std::stoi(argv[i]); + } else if (arg == "-np" || arg == "--parallel") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_parallel = std::stoi(argv[i]); + } else if (arg == "-ns" || arg == "--sequences") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_sequences = std::stoi(argv[i]); } else if (arg == "-m" || arg == "--model") { if (++i >= argc) { invalid_param = true; @@ -340,7 +361,19 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { invalid_param = true; break; } - params.lora_adapter = argv[i]; + params.lora_adapter.push_back({argv[i], 1.0f}); + params.use_mmap = false; + } else if (arg == "--lora-scaled") { + if (++i >= argc) { + invalid_param = true; + break; + } + const char * lora_adapter = argv[i]; + if (++i >= argc) { + invalid_param = true; + break; + } + params.lora_adapter.push_back({lora_adapter, std::stof(argv[i])}); params.use_mmap = false; } else if (arg == "--lora-base") { if (++i >= argc) { @@ -360,6 +393,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { params.multiline_input = true; } else if (arg == "--simple-io") { params.simple_io = true; + } else if (arg == "-cb" || arg == "--cont-batching") { + params.cont_batching = true; } else if (arg == "--color") { params.use_color = true; } else if (arg == "--mlock") { @@ -425,19 +460,11 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { params.mul_mat_q = false; #else fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Disabling mul_mat_q kernels has no effect.\n"); -#endif // GGML_USE_CUBLAS - } else if (arg == "--low-vram" || arg == "-lv") { -#ifdef GGML_USE_CUBLAS - params.low_vram = true; -#else - fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n"); #endif // GGML_USE_CUBLAS } else if (arg == "--no-mmap") { params.use_mmap = false; } else if (arg == "--numa") { params.numa = true; - } else if (arg == "--export") { - params.export_cgraph = true; } else if (arg == "--verbose-prompt") { params.verbose_prompt = true; } else if (arg == "-r" || arg == "--reverse-prompt") { @@ -456,8 +483,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { if (params.logdir.back() != DIRECTORY_SEPARATOR) { params.logdir += DIRECTORY_SEPARATOR; } - } else if (arg == "--perplexity") { - params.perplexity = true; + } else if (arg == "--perplexity" || arg == "--all-logits") { + params.logits_all = true; } else if (arg == "--ppl-stride") { if (++i >= argc) { invalid_param = true; @@ -606,7 +633,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" (can be specified more than once for multiple prompts).\n"); printf(" --color colorise output to distinguish prompt and user input from generations\n"); printf(" -s SEED, --seed SEED RNG seed (default: -1, use random seed for < 0)\n"); - printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); + printf(" -t N, --threads N number of threads to use during generation (default: %d)\n", params.n_threads); + printf(" -tb N, --threads-batch N\n"); + printf(" number of threads to use during batch and prompt processing (default: same as --threads)\n"); printf(" -p PROMPT, --prompt PROMPT\n"); printf(" prompt to start generation with (default: empty)\n"); printf(" -e, --escape process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n"); @@ -621,7 +650,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" -f FNAME, --file FNAME\n"); printf(" prompt file to start generation.\n"); printf(" -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict); - printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx); + printf(" -c N, --ctx-size N size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx); printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); printf(" --top-k N top-k sampling (default: %d, 0 = disabled)\n", params.top_k); printf(" --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p); @@ -655,12 +684,15 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); printf(" not recommended: doubles context memory required and no measurable increase in quality\n"); printf(" --temp N temperature (default: %.1f)\n", (double)params.temp); - printf(" --perplexity compute perplexity over each ctx window of the prompt\n"); + printf(" --logits-all return logits for all tokens in the batch (default: disabled)\n"); printf(" --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f\n"); printf(" --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks); printf(" --keep N number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep); printf(" --draft N number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft); printf(" --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks); + printf(" -np N, --parallel N number of parallel sequences to decode (default: %d)\n", params.n_parallel); + printf(" -ns N, --sequences N number of sequences to decode (default: %d)\n", params.n_sequences); + printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n"); if (llama_mlock_supported()) { printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n"); } @@ -678,17 +710,16 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" -ts SPLIT --tensor-split SPLIT\n"); printf(" how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n"); printf(" -mg i, --main-gpu i the GPU to use for scratch and small tensors\n"); - printf(" -lv, --low-vram don't allocate VRAM scratch buffer\n"); #ifdef GGML_USE_CUBLAS printf(" -nommq, --no-mul-mat-q\n"); printf(" use " GGML_CUBLAS_NAME " instead of custom mul_mat_q " GGML_CUDA_NAME " kernels.\n"); printf(" Not recommended since this is both slower and uses more VRAM.\n"); #endif // GGML_USE_CUBLAS #endif - printf(" --export export the computation graph to 'llama.ggml'\n"); printf(" --verbose-prompt print prompt before generation\n"); fprintf(stderr, " --simple-io use basic IO for better compatibility in subprocesses and limited consoles\n"); printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); + printf(" --lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap)\n"); printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n"); printf(" -m FNAME, --model FNAME\n"); printf(" model path (default: %s)\n", params.model.c_str()); @@ -699,6 +730,18 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf("\n"); } +std::string get_system_info(const gpt_params & params) { + std::ostringstream os; + + os << "system_info: n_threads = " << params.n_threads; + if (params.n_threads_batch != -1) { + os << " (n_threads_batch = " << params.n_threads_batch << ")"; + } + os << " / " << std::thread::hardware_concurrency() << " | " << llama_print_system_info(); + + return os.str(); +} + std::string gpt_random_prompt(std::mt19937 & rng) { const int r = rng() % 10; switch (r) { @@ -712,60 +755,74 @@ std::string gpt_random_prompt(std::mt19937 & rng) { case 7: return "He"; case 8: return "She"; case 9: return "They"; - default: return "To"; } - return "The"; + GGML_UNREACHABLE(); } // // Model utils // -struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) { - auto lparams = llama_context_default_params(); +struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & params) { + auto mparams = llama_model_default_params(); - lparams.n_ctx = params.n_ctx; - lparams.n_batch = params.n_batch; if (params.n_gpu_layers != -1) { - lparams.n_gpu_layers = params.n_gpu_layers; + mparams.n_gpu_layers = params.n_gpu_layers; } - lparams.main_gpu = params.main_gpu; - lparams.tensor_split = params.tensor_split; - lparams.low_vram = params.low_vram; - lparams.mul_mat_q = params.mul_mat_q; - lparams.seed = params.seed; - lparams.f16_kv = params.memory_f16; - lparams.use_mmap = params.use_mmap; - lparams.use_mlock = params.use_mlock; - lparams.logits_all = params.perplexity; - lparams.embedding = params.embedding; - lparams.rope_freq_base = params.rope_freq_base; - lparams.rope_freq_scale = params.rope_freq_scale; - - return lparams; + mparams.main_gpu = params.main_gpu; + mparams.tensor_split = params.tensor_split; + mparams.use_mmap = params.use_mmap; + mparams.use_mlock = params.use_mlock; + + return mparams; +} + +struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) { + auto cparams = llama_context_default_params(); + + cparams.n_ctx = params.n_ctx; + cparams.n_batch = params.n_batch; + cparams.n_threads = params.n_threads; + cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch; + cparams.mul_mat_q = params.mul_mat_q; + cparams.seed = params.seed; + cparams.f16_kv = params.memory_f16; + cparams.logits_all = params.logits_all; + cparams.embedding = params.embedding; + cparams.rope_freq_base = params.rope_freq_base; + cparams.rope_freq_scale = params.rope_freq_scale; + + return cparams; } std::tuple llama_init_from_gpt_params(gpt_params & params) { - auto lparams = llama_context_params_from_gpt_params(params); + auto mparams = llama_model_params_from_gpt_params(params); - llama_model * model = llama_load_model_from_file(params.model.c_str(), lparams); + llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams); if (model == NULL) { fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str()); return std::make_tuple(nullptr, nullptr); } - llama_context * lctx = llama_new_context_with_model(model, lparams); + auto cparams = llama_context_params_from_gpt_params(params); + + llama_context * lctx = llama_new_context_with_model(model, cparams); if (lctx == NULL) { fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str()); llama_free_model(model); return std::make_tuple(nullptr, nullptr); } - if (!params.lora_adapter.empty()) { + for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) { + const std::string& lora_adapter = std::get<0>(params.lora_adapter[i]); + float lora_scale = std::get<1>(params.lora_adapter[i]); int err = llama_model_apply_lora_from_file(model, - params.lora_adapter.c_str(), - params.lora_base.empty() ? NULL : params.lora_base.c_str(), + lora_adapter.c_str(), + lora_scale, + ((i > 0) || params.lora_base.empty()) + ? NULL + : params.lora_base.c_str(), params.n_threads); if (err != 0) { fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__); @@ -782,8 +839,9 @@ std::tuple llama_init_from_gpt_par { LOG("warming up the model with an empty run\n"); - const std::vector tmp = { llama_token_bos(lctx), llama_token_eos(lctx), }; - llama_eval(lctx, tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, params.n_threads); + std::vector tmp = { llama_token_bos(lctx), llama_token_eos(lctx), }; + llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0)); + llama_kv_cache_tokens_rm(lctx, -1, -1); llama_reset_timings(lctx); } @@ -795,16 +853,23 @@ std::tuple llama_init_from_gpt_par // std::vector llama_tokenize( - struct llama_context * ctx, + const struct llama_context * ctx, + const std::string & text, + bool add_bos) { + return llama_tokenize(llama_get_model(ctx), text, add_bos); +} + +std::vector llama_tokenize( + const struct llama_model * model, const std::string & text, bool add_bos) { // upper limit for the number of tokens int n_tokens = text.length() + add_bos; std::vector result(n_tokens); - n_tokens = llama_tokenize(ctx, text.data(), text.length(), result.data(), result.size(), add_bos); + n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos); if (n_tokens < 0) { result.resize(-n_tokens); - int check = llama_tokenize(ctx, text.data(), text.length(), result.data(), result.size(), add_bos); + int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos); GGML_ASSERT(check == -n_tokens); } else { result.resize(n_tokens); @@ -814,10 +879,10 @@ std::vector llama_tokenize( std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) { std::vector result(8, 0); - const int n_tokens = llama_token_to_piece(ctx, token, result.data(), result.size()); + const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size()); if (n_tokens < 0) { result.resize(-n_tokens); - int check = llama_token_to_piece(ctx, token, result.data(), result.size()); + int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size()); GGML_ASSERT(check == -n_tokens); } else { result.resize(n_tokens); @@ -872,7 +937,7 @@ llama_token llama_sample_token( std::vector & candidates, int idx) { const int n_ctx = llama_n_ctx(ctx); - const int n_vocab = llama_n_vocab(ctx); + const int n_vocab = llama_n_vocab(llama_get_model(ctx)); const float temp = params.temp; const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k; @@ -890,7 +955,7 @@ llama_token llama_sample_token( llama_token id = 0; - float * logits = llama_get_logits(ctx) + idx * n_vocab; + float * logits = llama_get_logits_ith(ctx, idx); // Apply params.logit_bias map for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) { @@ -941,11 +1006,11 @@ llama_token llama_sample_token( if (mirostat == 1) { static float mirostat_mu = 2.0f * mirostat_tau; const int mirostat_m = 100; - llama_sample_temperature(ctx, &cur_p, temp); + llama_sample_temp(ctx, &cur_p, temp); id = llama_sample_token_mirostat(ctx, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu); } else if (mirostat == 2) { static float mirostat_mu = 2.0f * mirostat_tau; - llama_sample_temperature(ctx, &cur_p, temp); + llama_sample_temp(ctx, &cur_p, temp); id = llama_sample_token_mirostat_v2(ctx, &cur_p, mirostat_tau, mirostat_eta, &mirostat_mu); } else { // Temperature sampling @@ -953,7 +1018,7 @@ llama_token llama_sample_token( llama_sample_tail_free (ctx, &cur_p, tfs_z, 1); llama_sample_typical (ctx, &cur_p, typical_p, 1); llama_sample_top_p (ctx, &cur_p, top_p, 1); - llama_sample_temperature(ctx, &cur_p, temp); + llama_sample_temp(ctx, &cur_p, temp); { const int n_top = 10; @@ -1158,7 +1223,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l #endif // NDEBUG fprintf(stream, "model_desc: %s\n", model_desc); - fprintf(stream, "n_vocab: %d # output size of the final layer, 32001 for some models\n", llama_n_vocab(lctx)); + fprintf(stream, "n_vocab: %d # output size of the final layer, 32001 for some models\n", llama_n_vocab(llama_get_model(lctx))); #ifdef __OPTIMIZE__ fprintf(stream, "optimize: true\n"); @@ -1182,7 +1247,6 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false"); fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx); fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false"); - fprintf(stream, "export: %s # default: false\n", params.export_cgraph ? "true" : "false"); fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n"); fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", params.frequency_penalty); dump_string_yaml_multiline(stream, "grammar", params.grammar.c_str()); @@ -1211,9 +1275,21 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l fprintf(stream, " %d: %f", lb.first, lb.second); } - fprintf(stream, "lora: %s\n", params.lora_adapter.c_str()); + fprintf(stream, "lora:\n"); + for (std::tuple la : params.lora_adapter) { + if (std::get<1>(la) != 1.0f) { + continue; + } + fprintf(stream, " - %s\n", std::get<0>(la).c_str()); + } + fprintf(stream, "lora_scaled:\n"); + for (std::tuple la : params.lora_adapter) { + if (std::get<1>(la) == 1.0f) { + continue; + } + fprintf(stream, " - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la)); + } fprintf(stream, "lora_base: %s\n", params.lora_base.c_str()); - fprintf(stream, "low_vram: %s # default: false\n", params.low_vram ? "true" : "false"); fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu); fprintf(stream, "memory_f32: %s # default: false\n", !params.memory_f16 ? "true" : "false"); fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", params.mirostat); @@ -1256,6 +1332,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l fprintf(stream, "rope_freq_scale: %f # default: 1.0\n", params.rope_freq_scale); fprintf(stream, "seed: %d # default: -1 (random seed)\n", params.seed); fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false"); + fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false"); fprintf(stream, "temp: %f # default: 0.8\n", params.temp); const std::vector tensor_split_vector(params.tensor_split, params.tensor_split + LLAMA_MAX_DEVICES); diff --git a/common/common.h b/common/common.h index 18aea38cee28c19e960e5da00b32e550465f4105..0e2d3fa6c07d9be88dab434e444c34611eff71a3 100644 --- a/common/common.h +++ b/common/common.h @@ -36,20 +36,23 @@ int32_t get_num_physical_cores(); struct gpt_params { uint32_t seed = -1; // RNG seed int32_t n_threads = get_num_physical_cores(); + int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads) int32_t n_predict = -1; // new tokens to predict int32_t n_ctx = 512; // context size int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS) int32_t n_keep = 0; // number of tokens to keep from initial prompt int32_t n_draft = 16; // number of tokens to draft during speculative decoding int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited) + int32_t n_parallel = 1; // number of parallel sequences to decode + int32_t n_sequences = 1; // number of sequences to decode int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default) int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default) int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens. int32_t n_beams = 0; // if non-zero then use beam search of given width. - float rope_freq_base = 10000.0f; // RoPE base frequency - float rope_freq_scale = 1.0f; // RoPE frequency scaling factor + float rope_freq_base = 0.0f; // RoPE base frequency + float rope_freq_scale = 0.0f; // RoPE frequency scaling factor // sampling parameters int32_t top_k = 40; // <= 0 to use vocab size @@ -83,8 +86,8 @@ struct gpt_params { std::vector antiprompt; // string upon seeing which more user input is prompted std::string logdir = ""; // directory in which to save YAML log files - std::string lora_adapter = ""; // lora adapter path - std::string lora_base = ""; // base model path for the lora adapter + std::vector> lora_adapter; // lora adapter path with user defined scale + std::string lora_base = ""; // base model path for the lora adapter int ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used. int ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line @@ -93,7 +96,6 @@ struct gpt_params { bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score - bool low_vram = false; // if true, reduce VRAM usage at the cost of performance bool mul_mat_q = true; // if true, use mul_mat_q kernels instead of cuBLAS bool memory_f16 = true; // use f16 instead of f32 for memory kv bool random_prompt = false; // do not randomize prompt if none provided @@ -107,16 +109,16 @@ struct gpt_params { bool interactive_first = false; // wait for user input immediately bool multiline_input = false; // reverse the usage of `\` bool simple_io = false; // improves compatibility with subprocesses and limited consoles + bool cont_batching = false; // insert new sequences for decoding on-the-fly bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix bool ignore_eos = false; // ignore generated EOS tokens bool instruct = false; // instruction mode (used for Alpaca models) bool penalize_nl = true; // consider newlines as a repeatable token - bool perplexity = false; // compute perplexity over the prompt + bool logits_all = false; // return logits for all tokens in the batch bool use_mmap = true; // use mmap for faster loads bool use_mlock = false; // use mlock to keep model in memory bool numa = false; // attempt optimizations that help on some NUMA systems - bool export_cgraph = false; // export the computation graph bool verbose_prompt = false; // print prompt tokens before generation }; @@ -124,13 +126,18 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params); void gpt_print_usage(int argc, char ** argv, const gpt_params & params); +std::string get_system_info(const gpt_params & params); + std::string gpt_random_prompt(std::mt19937 & rng); +void process_escapes(std::string& input); + // // Model utils // std::tuple llama_init_from_gpt_params(gpt_params & params); +struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & params); struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params); // @@ -140,7 +147,12 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param // tokenizes a string into a vector of tokens // should work similar to Python's `tokenizer.encode` std::vector llama_tokenize( - struct llama_context * ctx, + const struct llama_context * ctx, + const std::string & text, + bool add_bos); + +std::vector llama_tokenize( + const struct llama_model * model, const std::string & text, bool add_bos); @@ -181,7 +193,7 @@ std::string llama_detokenize_bpe( // - ctx_guidance: context to use for classifier-free guidance, ignore if NULL // - grammar: grammar to use for sampling, ignore if NULL // - last_tokens: needed for repetition penalty, ignore if empty -// - idx: sample from llama_get_logits(ctx) + idx * n_vocab +// - idx: sample from llama_get_logits_ith(ctx, idx) // // returns: // - token: sampled token diff --git a/common/log.h b/common/log.h index 18f3b9761a7880cf7ff9b23a31d5f91dd945a3b2..b8953fdcadae4a744b20db9d602a82d35ea94c47 100644 --- a/common/log.h +++ b/common/log.h @@ -225,31 +225,31 @@ enum LogTriState // USE LOG() INSTEAD // #ifndef _MSC_VER - #define LOG_IMPL(str, ...) \ - { \ + #define LOG_IMPL(str, ...) \ + do { \ if (LOG_TARGET != nullptr) \ { \ fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL, __VA_ARGS__); \ fflush(LOG_TARGET); \ } \ - } + } while (0) #else - #define LOG_IMPL(str, ...) \ - { \ + #define LOG_IMPL(str, ...) \ + do { \ if (LOG_TARGET != nullptr) \ { \ fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL "", ##__VA_ARGS__); \ fflush(LOG_TARGET); \ } \ - } + } while (0) #endif // INTERNAL, DO NOT USE // USE LOG_TEE() INSTEAD // #ifndef _MSC_VER - #define LOG_TEE_IMPL(str, ...) \ - { \ + #define LOG_TEE_IMPL(str, ...) \ + do { \ if (LOG_TARGET != nullptr) \ { \ fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL, __VA_ARGS__); \ @@ -260,10 +260,10 @@ enum LogTriState fprintf(LOG_TEE_TARGET, LOG_TEE_TIMESTAMP_FMT LOG_TEE_FLF_FMT str "%s" LOG_TEE_TIMESTAMP_VAL LOG_TEE_FLF_VAL, __VA_ARGS__); \ fflush(LOG_TEE_TARGET); \ } \ - } + } while (0) #else - #define LOG_TEE_IMPL(str, ...) \ - { \ + #define LOG_TEE_IMPL(str, ...) \ + do { \ if (LOG_TARGET != nullptr) \ { \ fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL "", ##__VA_ARGS__); \ @@ -274,7 +274,7 @@ enum LogTriState fprintf(LOG_TEE_TARGET, LOG_TEE_TIMESTAMP_FMT LOG_TEE_FLF_FMT str "%s" LOG_TEE_TIMESTAMP_VAL LOG_TEE_FLF_VAL "", ##__VA_ARGS__); \ fflush(LOG_TEE_TARGET); \ } \ - } + } while (0) #endif // The '\0' as a last argument, is a trick to bypass the silly @@ -435,41 +435,41 @@ inline FILE *log_handler() { return log_handler1_impl(); } inline void log_test() { log_disable(); - LOG("01 Hello World to nobody, because logs are disabled!\n") + LOG("01 Hello World to nobody, because logs are disabled!\n"); log_enable(); - LOG("02 Hello World to default output, which is \"%s\" ( Yaaay, arguments! )!\n", LOG_STRINGIZE(LOG_TARGET)) - LOG_TEE("03 Hello World to **both** default output and " LOG_TEE_TARGET_STRING "!\n") + LOG("02 Hello World to default output, which is \"%s\" ( Yaaay, arguments! )!\n", LOG_STRINGIZE(LOG_TARGET)); + LOG_TEE("03 Hello World to **both** default output and " LOG_TEE_TARGET_STRING "!\n"); log_set_target(stderr); - LOG("04 Hello World to stderr!\n") - LOG_TEE("05 Hello World TEE with double printing to stderr prevented!\n") + LOG("04 Hello World to stderr!\n"); + LOG_TEE("05 Hello World TEE with double printing to stderr prevented!\n"); log_set_target(LOG_DEFAULT_FILE_NAME); - LOG("06 Hello World to default log file!\n") + LOG("06 Hello World to default log file!\n"); log_set_target(stdout); - LOG("07 Hello World to stdout!\n") + LOG("07 Hello World to stdout!\n"); log_set_target(LOG_DEFAULT_FILE_NAME); - LOG("08 Hello World to default log file again!\n") + LOG("08 Hello World to default log file again!\n"); log_disable(); - LOG("09 Hello World _1_ into the void!\n") + LOG("09 Hello World _1_ into the void!\n"); log_enable(); - LOG("10 Hello World back from the void ( you should not see _1_ in the log or the output )!\n") + LOG("10 Hello World back from the void ( you should not see _1_ in the log or the output )!\n"); log_disable(); log_set_target("llama.anotherlog.log"); - LOG("11 Hello World _2_ to nobody, new target was selected but logs are still disabled!\n") + LOG("11 Hello World _2_ to nobody, new target was selected but logs are still disabled!\n"); log_enable(); - LOG("12 Hello World this time in a new file ( you should not see _2_ in the log or the output )?\n") + LOG("12 Hello World this time in a new file ( you should not see _2_ in the log or the output )?\n"); log_set_target("llama.yetanotherlog.log"); - LOG("13 Hello World this time in yet new file?\n") + LOG("13 Hello World this time in yet new file?\n"); log_set_target(log_filename_generator("llama_autonamed", "log")); - LOG("14 Hello World in log with generated filename!\n") + LOG("14 Hello World in log with generated filename!\n"); #ifdef _MSC_VER - LOG_TEE("15 Hello msvc TEE without arguments\n") - LOG_TEE("16 Hello msvc TEE with (%d)(%s) arguments\n", 1, "test") - LOG_TEELN("17 Hello msvc TEELN without arguments\n") - LOG_TEELN("18 Hello msvc TEELN with (%d)(%s) arguments\n", 1, "test") - LOG("19 Hello msvc LOG without arguments\n") - LOG("20 Hello msvc LOG with (%d)(%s) arguments\n", 1, "test") - LOGLN("21 Hello msvc LOGLN without arguments\n") - LOGLN("22 Hello msvc LOGLN with (%d)(%s) arguments\n", 1, "test") + LOG_TEE("15 Hello msvc TEE without arguments\n"); + LOG_TEE("16 Hello msvc TEE with (%d)(%s) arguments\n", 1, "test"); + LOG_TEELN("17 Hello msvc TEELN without arguments\n"); + LOG_TEELN("18 Hello msvc TEELN with (%d)(%s) arguments\n", 1, "test"); + LOG("19 Hello msvc LOG without arguments\n"); + LOG("20 Hello msvc LOG with (%d)(%s) arguments\n", 1, "test"); + LOGLN("21 Hello msvc LOGLN without arguments\n"); + LOGLN("22 Hello msvc LOGLN with (%d)(%s) arguments\n", 1, "test"); #endif } @@ -542,7 +542,7 @@ inline void log_dump_cmdline_impl(int argc, char **argv) buf << " " << argv[i]; } } - LOGLN("Cmd:%s", buf.str().c_str()) + LOGLN("Cmd:%s", buf.str().c_str()); } #define log_tostr(var) log_var_to_string_impl(var).c_str() @@ -620,10 +620,10 @@ inline std::string log_var_to_string_impl(const std::vector & var) #define LOGLN(...) // dummy stub #undef LOG_TEE -#define LOG_TEE(...) fprintf(stderr, __VA_ARGS__); // convert to normal fprintf +#define LOG_TEE(...) fprintf(stderr, __VA_ARGS__) // convert to normal fprintf #undef LOG_TEELN -#define LOG_TEELN(...) fprintf(stderr, __VA_ARGS__); // convert to normal fprintf +#define LOG_TEELN(...) fprintf(stderr, __VA_ARGS__) // convert to normal fprintf #undef LOG_DISABLE #define LOG_DISABLE() // dummy stub diff --git a/common/train.cpp b/common/train.cpp new file mode 100644 index 0000000000000000000000000000000000000000..35a4cf9e6cae39b16c7c991760b9ec98812d02a8 --- /dev/null +++ b/common/train.cpp @@ -0,0 +1,1496 @@ +#include "train.h" +#include "common.h" + +#include +#include +#include + +struct random_normal_distribution { + std::mt19937 gen; + std::normal_distribution rd; + float min; + float max; +}; + +struct random_uniform_distribution { + std::mt19937 gen; + std::uniform_real_distribution rd; +}; + +struct train_state * init_train_state() { + struct train_state * state = new struct train_state; + state->train_its = 0; + state->train_samples = 0; + state->train_tokens = 0; + state->train_epochs = 0; + state->shuffle_samples_hash = 0; + state->shuffle_sample_count = 0; + state->shuffle_next_sample = 0; + state->shuffle_rng_state_current = ""; + state->shuffle_rng_state_next = ""; + + state->opt = new struct ggml_opt_context; + state->opt->ctx = NULL; + state->opt->params = ggml_opt_default_params(GGML_OPT_ADAM); + state->opt->loss_after = 0.0f; + + return state; +} + +void free_train_state(struct train_state * state) { + delete state->opt; + delete state; +} + +struct random_normal_distribution * init_random_normal_distribution( + int seed, float mean, float std, float min, float max +) { + struct random_normal_distribution * rnd = (struct random_normal_distribution *) malloc(sizeof(struct random_normal_distribution)); + rnd->gen = std::mt19937(seed); + rnd->rd = std::normal_distribution{mean, std}; + rnd->min = min; + rnd->max = max; + return rnd; +} + +struct random_uniform_distribution * init_random_uniform_distribution(int seed, float min, float max) { + struct random_uniform_distribution * rnd = (struct random_uniform_distribution *) malloc(sizeof(struct random_uniform_distribution)); + rnd->gen = std::mt19937(seed); + rnd->rd = std::uniform_real_distribution{min, max}; + return rnd; +} + +void free_random_normal_distribution (struct random_normal_distribution * rnd) { + free(rnd); +} + +void free_random_uniform_distribution(struct random_uniform_distribution * rnd) { + free(rnd); +} + +struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct random_normal_distribution * rnd) { + float scale = 1.0f; // xavier + switch (tensor->n_dims) { + case 1: + scale /= sqrtf((float) tensor->ne[0]); + for (int i0 = 0; i0 < tensor->ne[0]; i0++) { + float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0]); + *dst = scale * frand_normal(rnd); + } + break; + case 2: + scale /= sqrtf((float) tensor->ne[0]+tensor->ne[1]); + for (int i1 = 0; i1 < tensor->ne[1]; i1++) { + for (int i0 = 0; i0 < tensor->ne[0]; i0++) { + float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); + *dst = scale * frand_normal(rnd); + } + } + break; + case 3: + scale /= sqrtf((float) tensor->ne[0]+tensor->ne[1]); + for (int i2 = 0; i2 < tensor->ne[2]; i2++) { + for (int i1 = 0; i1 < tensor->ne[1]; i1++) { + for (int i0 = 0; i0 < tensor->ne[0]; i0++) { + float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2]); + *dst = scale * frand_normal(rnd); + } + } + } + break; + case 4: + scale /= sqrtf((float) tensor->ne[0]+tensor->ne[1]); + for (int i3 = 0; i3 < tensor->ne[3]; i3++) { + for (int i2 = 0; i2 < tensor->ne[2]; i2++) { + for (int i1 = 0; i1 < tensor->ne[1]; i1++) { + for (int i0 = 0; i0 < tensor->ne[0]; i0++) { + float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]); + *dst = scale * frand_normal(rnd); + } + } + } + } + break; + default: + die("Unsupported tensor->n_dims"); + }; + return tensor; +} + +struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struct random_uniform_distribution * rnd) { + switch (tensor->n_dims) { + case 1: + for (int i0 = 0; i0 < tensor->ne[0]; i0++) { + float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0]); + *dst = frand_uniform(rnd); + } + break; + case 2: + for (int i1 = 0; i1 < tensor->ne[1]; i1++) { + for (int i0 = 0; i0 < tensor->ne[0]; i0++) { + float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); + *dst = frand_uniform(rnd); + } + } + break; + case 3: + for (int i2 = 0; i2 < tensor->ne[2]; i2++) { + for (int i1 = 0; i1 < tensor->ne[1]; i1++) { + for (int i0 = 0; i0 < tensor->ne[0]; i0++) { + float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2]); + *dst = frand_uniform(rnd); + } + } + } + break; + case 4: + for (int i3 = 0; i3 < tensor->ne[3]; i3++) { + for (int i2 = 0; i2 < tensor->ne[2]; i2++) { + for (int i1 = 0; i1 < tensor->ne[1]; i1++) { + for (int i0 = 0; i0 < tensor->ne[0]; i0++) { + float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]); + *dst = frand_uniform(rnd); + } + } + } + } + break; + default: + die("Unsupported tensor->n_dims"); + }; + return tensor; +} + +float frand() { + return (float)rand()/((float)(RAND_MAX) + 1.0f); +} + +float frand_normal(struct random_normal_distribution * rnd) { + return fclamp(rnd->rd(rnd->gen), rnd->min, rnd->max); +} + +float frand_uniform(struct random_uniform_distribution * rnd) { + return rnd->rd(rnd->gen); +} + +int clamp(const int v, const int min, const int max) { + return ((v < min) ? (min) : (v > max) ? (max) : v); +} + +float fclamp(const float v, const float min, const float max) { + return ((v < min) ? (min) : (v > max) ? (max) : v); +} + +void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) { + GGML_ASSERT(tensor->n_dims == 1); + GGML_ASSERT(tensor->ne[0] == ne0); +} + +void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) { + GGML_ASSERT(tensor->n_dims == 2); + GGML_ASSERT(tensor->ne[0] == ne0); + GGML_ASSERT(tensor->ne[1] == ne1); +} + +void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) { + GGML_ASSERT(tensor->n_dims == 3); + GGML_ASSERT(tensor->ne[0] == ne0); + GGML_ASSERT(tensor->ne[1] == ne1); + GGML_ASSERT(tensor->ne[2] == ne2); +} + +void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) { + GGML_ASSERT(tensor->n_dims == 4); + GGML_ASSERT(tensor->ne[0] == ne0); + GGML_ASSERT(tensor->ne[1] == ne1); + GGML_ASSERT(tensor->ne[2] == ne2); + GGML_ASSERT(tensor->ne[3] == ne3); +} + +int64_t get_example_targets_batch( + struct llama_context * lctx, + struct ggml_tensor * tokens_input, + struct ggml_tensor * target_probs, + int64_t example_id, + const size_t * samples_offs, + const size_t * samples_begin, + const size_t * samples_size, + size_t samples_count, + const llama_token * train_data, + size_t n_train_data, + bool separate_with_eos, + bool separate_with_bos, + bool fill_with_next_samples, + bool sample_random_offsets +) { + GGML_ASSERT(samples_count > 0); + GGML_ASSERT(tokens_input->n_dims == 2); + GGML_ASSERT(target_probs->n_dims == 3); + int64_t n_vocab = target_probs->ne[0]; + int64_t n_tokens = tokens_input->ne[0]; + int64_t n_batch = tokens_input->ne[1]; + GGML_ASSERT(n_vocab == target_probs->ne[0]); + GGML_ASSERT(n_tokens == target_probs->ne[1]); + GGML_ASSERT(n_batch == target_probs->ne[2]); + + int64_t used_samples = 0; + + ggml_set_f32(target_probs, 0.0f); + llama_token bos = llama_token_bos(lctx); + llama_token eos = llama_token_eos(lctx); + // printf("%s: example_id=%d n_batch=%d n_train_samples=%zu\n", __func__, example_id, n_batch, n_train_samples); + for (int k=0; k= sample_size && fill_with_next_samples) { + if (!sample_separation_eos) { + // insert eos token to separate samples + sample_separation_eos = true; + } else if (!sample_separation_bos) { + // insert bos token to separate samples + sample_separation_bos = true; + token = bos; + } else { + // sample separation is done, continue with next sample + sample_separation_eos = !separate_with_eos; + sample_separation_bos = !separate_with_bos; + sample_offs = 0; + sample_idx = (example_id + used_samples) % samples_count; + sample_begin = samples_begin[sample_idx]; + sample_size = samples_size[sample_idx]; + ++used_samples; + } + } + // note: no else-if here + if (sample_offs < sample_size) { + token = clamp(train_data[sample_begin+sample_offs], 0, (llama_token) (n_vocab - 1)); + ++sample_offs; + } + ggml_set_f32_nd(target_probs, token, (int) i, (int) k, 0, +1.0f); + if (i+1> rng; +} + +std::string mt19937_get_state(const std::mt19937& rng) { + std::stringstream s_rng_state; + s_rng_state.imbue(std::locale::classic()); + s_rng_state << rng; + return s_rng_state.str(); +} + +std::string mt19937_seed_to_state(unsigned seed) { + std::mt19937 rng(seed); + return mt19937_get_state(rng); +} + +std::string shuffle_samples( + const std::string & rng_state, + size_t * shuffled_offs, + size_t * shuffled_begins, + size_t * shuffled_sizes, + const size_t * begins, + const size_t * sizes, + size_t count) { + if (count == 0) return rng_state; + + std::mt19937 rng; + mt19937_set_state(rng, rng_state); + + // sort indices by random value for each index + std::vector idcs; + { + std::vector rnd; + idcs.resize(count); + rnd.resize(count); + for (unsigned i=0; i h_string; + std::hash h_ull; + size_t h = h_string(std::string(fn)); + h = hash_combine(h, h_ull((unsigned long long) sample_count)); + for (size_t i=0; i< sample_count; ++i) { + h = hash_combine(h, h_ull((unsigned long long) samples_begin[i])); + h = hash_combine(h, h_ull((unsigned long long) samples_size[i])); + } + return h; +} + +std::string replace_str(const char * s, const char * needle, const char * replacement) { + std::string str = s; + size_t pos = str.find(needle); + if (pos != std::string::npos) { + str.replace(pos, strlen(needle), replacement); + } + return str; +} + +void print_duration(double fmillis) { + if (fmillis < 1000.0f) { + printf("%.1fms", (float) fmillis); + return; + } + const int64_t one_sec = 1000; + const int64_t one_min = one_sec * 60; + const int64_t one_hour = one_min * 60; + const int64_t one_day = one_hour * 24; + + int64_t millis = (int64_t) fmillis; + int64_t days = millis/one_day; + int64_t hours = (millis - days*one_day)/one_hour; + int64_t minutes = (millis - days*one_day - hours*one_hour)/one_min; + int64_t seconds = (millis - days*one_day - hours*one_hour - minutes*one_min)/one_sec; + + // to print int64_t either cast to (long long int) or use macro PRId64 from + if (days > 0) { + printf("%lldd ", (long long int) days); + } + printf("%02lld:%02lld:%02lld", (long long int) hours, (long long int) minutes, (long long int) seconds); +} + +float cosine_decay(int64_t step, int64_t decay_steps, float minimum) { + if (step > decay_steps) { + step = decay_steps; + } + const float cosine_decay = 0.50f*(1.0f + cosf(3.14159265359f*step/decay_steps)); + const float decay = (1 - minimum)*cosine_decay + minimum; + return decay; +} + +float cosine_decay_restart(int64_t step, int64_t decay_steps, float minimum, float restart_step_mult) { + while (step > decay_steps) { + step -= decay_steps; + decay_steps = (int64_t) (restart_step_mult * decay_steps); + } + return cosine_decay(step, decay_steps, minimum); +} + +float learning_schedule( + int64_t step, + int64_t warmup_steps, + int64_t cos_decay_steps, + float learning_rate, + float overall_minimum, + float cos_decay_minimum, + float cos_decay_restart_step_mult, + bool enable_restart) { + + float result = + (step < warmup_steps) + ? (float) step / (float) warmup_steps + : enable_restart + ? cosine_decay_restart( + step - warmup_steps, + cos_decay_steps, + cos_decay_minimum, + cos_decay_restart_step_mult) + : cosine_decay( + step, + cos_decay_steps, + cos_decay_minimum); + + float min = overall_minimum / learning_rate; + result = min + result * (1.0f - min); + return result; +} + +static bool are_same_layout(struct ggml_tensor * a, struct ggml_tensor * b) { + GGML_ASSERT(a != NULL); + GGML_ASSERT(b != NULL); + GGML_ASSERT(a->type == b->type); + GGML_ASSERT(ggml_are_same_shape(a, b)); + GGML_ASSERT(ggml_is_contiguous(a) && ggml_is_contiguous(b)); + + return true; +} + +void copy_tensor_by_name(struct ggml_tensor * dst, struct ggml_context * ctx, const char * name) { + if (dst == NULL) { + return; + } + struct ggml_tensor * t = ggml_get_tensor(ctx, name); + GGML_ASSERT(are_same_layout(dst, t)); + memcpy(dst->data, t->data, ggml_nbytes(t)); + + if (strlen(ggml_get_name(dst)) == 0) { + ggml_set_name(dst, name); + } +} + +// gguf constants +static const char * LLM_KV_OPTIMIZER_TYPE = "optimizer.type"; +static const char * LLM_KV_OPTIMIZER_TYPE_ADAM = "adam"; +static const char * LLM_KV_OPTIMIZER_TYPE_LBFGS = "lbfgs"; +static const char * LLM_KV_OPTIMIZER_FILE_VERSION = "optimizer.file_version"; +static const char * LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT = "optimizer.convergence_past_count"; +static const char * LLM_KV_OPTIMIZER_PARAMETER_COUNT = "optimizer.parameter_count"; +static const char * LLM_KV_OPTIMIZER_ITERATION_COUNT = "optimizer.iteration_count"; +static const char * LLM_KV_OPTIMIZER_JUST_INITIALIZED = "optimizer.just_initialized"; +static const char * LLM_KV_OPTIMIZER_ADAM_BEST_LOSS = "optimizer.adam.best_loss"; +static const char * LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS = "optimizer.adam.previous_loss"; +static const char * LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT = "optimizer.adam.no_improvement_count"; +static const char * LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT = "optimizer.lbfgs.approx_hessian_count"; +static const char * LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS = "optimizer.lbfgs.best_loss"; +static const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP = "optimizer.lbfgs.line_search_step"; +static const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J = "optimizer.lbfgs.line_search_j"; +static const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K = "optimizer.lbfgs.line_search_k"; +static const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END = "optimizer.lbfgs.line_search_end"; +static const char * LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT = "optimizer.lbfgs.no_improvement_count"; + +static const char * LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS = "optimizer.adam.first_moments"; +static const char * LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS = "optimizer.adam.second_moments"; +static const char * LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES = "optimizer.adam.past_loss_values"; + +static const char * LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS = "optimizer.lbfgs.current_parameters"; +static const char * LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS = "optimizer.lbfgs.previous_parameters"; +static const char * LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS = "optimizer.lbfgs.current_gradients"; +static const char * LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS = "optimizer.lbfgs.previous_gradients"; +static const char * LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION = "optimizer.lbfgs.search_direction"; +static const char * LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES = "optimizer.lbfgs.past_loss_values"; +static const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA = "optimizer.lbfgs.memory_alpha"; +static const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS = "optimizer.lbfgs.memory_ys"; +static const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S = "optimizer.lbfgs.memory_s"; +static const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y = "optimizer.lbfgs.memory_y"; + +static const char * LLM_KV_TRAINING_FILE_VERSION = "training.file_version"; +static const char * LLM_KV_TRAINING_ITERATION_COUNT = "training.iteration_count"; +static const char * LLM_KV_TRAINING_SAMPLE_COUNT = "training.sample_count"; +static const char * LLM_KV_TRAINING_TOKEN_COUNT = "training.token_count"; +static const char * LLM_KV_TRAINING_EPOCH_COUNT = "training.epoch_count"; +static const char * LLM_KV_TRAINING_SHUFFLE_SAMPLES_HASH = "training.shuffle.samples_hash"; +static const char * LLM_KV_TRAINING_SHUFFLE_RNG_STATE = "training.shuffle.rng_state"; +static const char * LLM_KV_TRAINING_SHUFFLE_SAMPLE_COUNT = "training.shuffle.sample_count"; +static const char * LLM_KV_TRAINING_SHUFFLE_NEXT_SAMPLE = "training.shuffle.next_sample"; + +#define GGUF_GET_KEY(ctx, dst, func, type, req, key) \ +{ \ + const std::string skey(key); \ + const int kid = gguf_find_key(ctx, skey.c_str()); \ + if (kid >= 0) { \ + enum gguf_type ktype = gguf_get_kv_type(ctx, kid); \ + if (ktype != (type)) { \ + die_fmt("key %s has wrong type: %s", skey.c_str(), gguf_type_name(ktype)); \ + } \ + (dst) = func(ctx, kid); \ + } else if (req) { \ + die_fmt("key not found in model: %s", skey.c_str()); \ + } \ +} + +void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct ggml_opt_context * opt) { + // NOTE: gguf_context must be initialized with f_ggml_ctx and no_alloc=false, otherwise tensor data can not be read + + uint32_t file_version; + GGUF_GET_KEY(fctx, file_version, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_FILE_VERSION); + GGML_ASSERT(file_version == 0); + + GGUF_GET_KEY(fctx, opt->params.past, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT); + GGUF_GET_KEY(fctx, opt->iter, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_ITERATION_COUNT); + GGUF_GET_KEY(fctx, opt->just_initialized, gguf_get_val_bool, GGUF_TYPE_BOOL, true, LLM_KV_OPTIMIZER_JUST_INITIALIZED); + + uint64_t nx; + GGUF_GET_KEY(fctx, nx, gguf_get_val_u64, GGUF_TYPE_UINT64, true, LLM_KV_OPTIMIZER_PARAMETER_COUNT); + opt->nx = (size_t) nx; + + // don't call ggml_opt_init until optimizer type and optimizer specific parameters are know + + std::string opt_type; + GGUF_GET_KEY(fctx, opt_type, gguf_get_val_str, GGUF_TYPE_STRING, true, LLM_KV_OPTIMIZER_TYPE); + if (opt_type == LLM_KV_OPTIMIZER_TYPE_ADAM) { + opt->params.type = GGML_OPT_ADAM; + + GGUF_GET_KEY(fctx, opt->adam.fx_best, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_ADAM_BEST_LOSS); + GGUF_GET_KEY(fctx, opt->adam.fx_prev, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS); + GGUF_GET_KEY(fctx, opt->adam.n_no_improvement, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT); + + ggml_opt_init(opt->ctx, opt, opt->params, opt->nx); + + copy_tensor_by_name(opt->adam.m, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS); + copy_tensor_by_name(opt->adam.v, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS); + copy_tensor_by_name(opt->adam.pf, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES); + } else if (opt_type == LLM_KV_OPTIMIZER_TYPE_LBFGS) { + opt->params.type = GGML_OPT_LBFGS; + + GGUF_GET_KEY(fctx, opt->params.lbfgs.m, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT); + GGUF_GET_KEY(fctx, opt->lbfgs.fx_best, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS); + GGUF_GET_KEY(fctx, opt->lbfgs.step, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP); + GGUF_GET_KEY(fctx, opt->lbfgs.j, gguf_get_val_i32, GGUF_TYPE_INT32, true, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J); + GGUF_GET_KEY(fctx, opt->lbfgs.k, gguf_get_val_i32, GGUF_TYPE_INT32, true, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K); + GGUF_GET_KEY(fctx, opt->lbfgs.end, gguf_get_val_i32, GGUF_TYPE_INT32, true, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END); + GGUF_GET_KEY(fctx, opt->lbfgs.n_no_improvement, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT); + + ggml_opt_init(opt->ctx, opt, opt->params, opt->nx); + + copy_tensor_by_name(opt->lbfgs.x, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS); + copy_tensor_by_name(opt->lbfgs.xp, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS); + copy_tensor_by_name(opt->lbfgs.g, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS); + copy_tensor_by_name(opt->lbfgs.gp, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS); + copy_tensor_by_name(opt->lbfgs.d, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION); + copy_tensor_by_name(opt->lbfgs.pf, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES); + copy_tensor_by_name(opt->lbfgs.lmal, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA); + copy_tensor_by_name(opt->lbfgs.lmys, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS); + copy_tensor_by_name(opt->lbfgs.lms, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S); + copy_tensor_by_name(opt->lbfgs.lmy, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y); + } else { + die("unknown optimizer type\n"); + } +} + +void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context * opt) { + gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_FILE_VERSION, 0); + gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT, opt->params.past); + gguf_set_val_u64(fctx, LLM_KV_OPTIMIZER_PARAMETER_COUNT, (uint64_t) opt->nx); + gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_ITERATION_COUNT, opt->iter); + gguf_set_val_bool(fctx, LLM_KV_OPTIMIZER_JUST_INITIALIZED, opt->just_initialized); + + switch (opt->params.type) { + case GGML_OPT_ADAM: + { + gguf_set_val_str(fctx, LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_ADAM); + gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_ADAM_BEST_LOSS, opt->adam.fx_best); + gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS, opt->adam.fx_prev); + gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT, opt->adam.n_no_improvement); + + ggml_set_name(opt->adam.m, LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS); + ggml_set_name(opt->adam.v, LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS); + if (opt->adam.pf) { + ggml_set_name(opt->adam.pf, LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES); + } + + gguf_add_tensor(fctx, opt->adam.m); + gguf_add_tensor(fctx, opt->adam.v); + if (opt->adam.pf) { + gguf_add_tensor(fctx, opt->adam.pf); + } + } break; + case GGML_OPT_LBFGS: + { + gguf_set_val_str(fctx, LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_LBFGS); + gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT, opt->params.lbfgs.m); + gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS, opt->lbfgs.fx_best); + gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP, opt->lbfgs.step); + gguf_set_val_i32(fctx, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J, opt->lbfgs.j); + gguf_set_val_i32(fctx, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K, opt->lbfgs.k); + gguf_set_val_i32(fctx, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END, opt->lbfgs.end); + gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT, opt->lbfgs.n_no_improvement); + + ggml_set_name(opt->lbfgs.x, LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS); + ggml_set_name(opt->lbfgs.xp, LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS); + ggml_set_name(opt->lbfgs.g, LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS); + ggml_set_name(opt->lbfgs.gp, LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS); + ggml_set_name(opt->lbfgs.d, LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION); + if (opt->lbfgs.pf) { + ggml_set_name(opt->lbfgs.pf, LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES); + } + ggml_set_name(opt->lbfgs.lmal, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA); + ggml_set_name(opt->lbfgs.lmys, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS); + ggml_set_name(opt->lbfgs.lms, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S); + ggml_set_name(opt->lbfgs.lmy, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y); + + gguf_add_tensor(fctx, opt->lbfgs.x); + gguf_add_tensor(fctx, opt->lbfgs.xp); + gguf_add_tensor(fctx, opt->lbfgs.g); + gguf_add_tensor(fctx, opt->lbfgs.gp); + gguf_add_tensor(fctx, opt->lbfgs.d); + if (opt->lbfgs.pf) { + gguf_add_tensor(fctx, opt->lbfgs.pf); + } + gguf_add_tensor(fctx, opt->lbfgs.lmal); + gguf_add_tensor(fctx, opt->lbfgs.lmys); + gguf_add_tensor(fctx, opt->lbfgs.lms); + gguf_add_tensor(fctx, opt->lbfgs.lmy); + } break; + } +} + +bool load_train_state_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct train_state * train) { + if (gguf_find_key(fctx, LLM_KV_TRAINING_FILE_VERSION) < 0) { + return false; + } + + uint32_t file_version; + GGUF_GET_KEY(fctx, file_version, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_FILE_VERSION); + GGML_ASSERT(file_version <= 1); + + if (file_version == 0) { + + GGUF_GET_KEY(fctx, train->train_its, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_ITERATION_COUNT); + GGUF_GET_KEY(fctx, train->train_samples, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_SAMPLE_COUNT); + GGUF_GET_KEY(fctx, train->train_tokens, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_TOKEN_COUNT); + + } else if (file_version == 1) { + + GGUF_GET_KEY(fctx, train->train_its, gguf_get_val_u64, GGUF_TYPE_UINT64, true, LLM_KV_TRAINING_ITERATION_COUNT); + GGUF_GET_KEY(fctx, train->train_samples, gguf_get_val_u64, GGUF_TYPE_UINT64, true, LLM_KV_TRAINING_SAMPLE_COUNT); + GGUF_GET_KEY(fctx, train->train_tokens, gguf_get_val_u64, GGUF_TYPE_UINT64, true, LLM_KV_TRAINING_TOKEN_COUNT); + GGUF_GET_KEY(fctx, train->train_epochs, gguf_get_val_u64, GGUF_TYPE_UINT64, true, LLM_KV_TRAINING_EPOCH_COUNT); + + GGUF_GET_KEY(fctx, train->shuffle_samples_hash, gguf_get_val_u64, GGUF_TYPE_UINT64, false, LLM_KV_TRAINING_SHUFFLE_SAMPLES_HASH); + GGUF_GET_KEY(fctx, train->shuffle_rng_state_current, gguf_get_val_str, GGUF_TYPE_STRING, false, LLM_KV_TRAINING_SHUFFLE_RNG_STATE); + GGUF_GET_KEY(fctx, train->shuffle_sample_count, gguf_get_val_u64, GGUF_TYPE_UINT64, false, LLM_KV_TRAINING_SHUFFLE_SAMPLE_COUNT); + GGUF_GET_KEY(fctx, train->shuffle_next_sample, gguf_get_val_u64, GGUF_TYPE_UINT64, false, LLM_KV_TRAINING_SHUFFLE_NEXT_SAMPLE); + } + + load_opt_context_gguf(fctx, f_ggml_ctx, train->opt); + return true; +} + +void save_train_state_gguf(struct gguf_context * fctx, struct train_state * train) { + gguf_set_val_u32(fctx, LLM_KV_TRAINING_FILE_VERSION, 1); + gguf_set_val_u64(fctx, LLM_KV_TRAINING_ITERATION_COUNT, train->train_its); + gguf_set_val_u64(fctx, LLM_KV_TRAINING_SAMPLE_COUNT, train->train_samples); + gguf_set_val_u64(fctx, LLM_KV_TRAINING_TOKEN_COUNT, train->train_tokens); + gguf_set_val_u64(fctx, LLM_KV_TRAINING_EPOCH_COUNT, train->train_epochs); + + gguf_set_val_u64(fctx, LLM_KV_TRAINING_SHUFFLE_SAMPLES_HASH, (uint64_t) train->shuffle_samples_hash); + gguf_set_val_str(fctx, LLM_KV_TRAINING_SHUFFLE_RNG_STATE, train->shuffle_rng_state_current.c_str()); + gguf_set_val_u64(fctx, LLM_KV_TRAINING_SHUFFLE_SAMPLE_COUNT, (uint64_t) train->shuffle_sample_count); + gguf_set_val_u64(fctx, LLM_KV_TRAINING_SHUFFLE_NEXT_SAMPLE, (uint64_t) train->shuffle_next_sample); + + save_opt_context_gguf(fctx, train->opt); +} + + +struct llama_file { + // use FILE * so we don't have to re-open the file to mmap + FILE * fp; + size_t size; + + llama_file(const char * fname, const char * mode) { + fp = std::fopen(fname, mode); + if (fp == NULL) { + size = 0; + } else { + seek(0, SEEK_END); + size = tell(); + seek(0, SEEK_SET); + } + } + + size_t tell() const { +#ifdef _WIN32 + __int64 ret = _ftelli64(fp); +#else + long ret = std::ftell(fp); +#endif + GGML_ASSERT(ret != -1); // this really shouldn't fail + return (size_t) ret; + } + + void seek(size_t offset, int whence) { +#ifdef _WIN32 + int ret = _fseeki64(fp, (__int64) offset, whence); +#else + int ret = std::fseek(fp, (long) offset, whence); +#endif + GGML_ASSERT(ret == 0); // same + } + + void read_raw(void * ptr, size_t size) { + if (size == 0) { + return; + } + errno = 0; + std::size_t ret = std::fread(ptr, size, 1, fp); + if (ferror(fp)) { + die_fmt("read error: %s", strerror(errno)); + } + if (ret != 1) { + die("unexpectedly reached end of file"); + } + } + + std::uint32_t read_u32() { + std::uint32_t ret; + read_raw(&ret, sizeof(ret)); + return ret; + } + + std::string read_string(std::uint32_t len) { + std::vector chars(len); + read_raw(chars.data(), len); + return std::string(chars.data(), len); + } + + void write_raw(const void * ptr, size_t size) { + if (size == 0) { + return; + } + errno = 0; + size_t ret = std::fwrite(ptr, size, 1, fp); + if (ret != 1) { + die_fmt("write error: %s", strerror(errno)); + } + } + + void write_u32(std::uint32_t val) { + write_raw(&val, sizeof(val)); + } + + ~llama_file() { + if (fp) { + std::fclose(fp); + } + } +}; + +static size_t utf8_len(char src) { + const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 }; + uint8_t highbits = static_cast(src) >> 4; + return lookup[highbits]; +} + +// mark each byte with its utf8 unit number. +// returns the number of utf8 characters. +// e.g. when bytes == '\x61\xD0\xB0\x62', +// then utf8_units will become [0,0,1,0] +// utf8_nunits will become [1,2,2,1] and 3 is returned. +// bytes where utf8_units is zero, are the begin of an utf8 character. +static size_t mark_utf8_units(const char* bytes, int * utf8_units, int * utf8_nunits, size_t count) { + size_t offs = 0; + size_t count_utf8 = 0; + while(offs < count) { + int len = (int) utf8_len(bytes[offs]); + for (int i=0; i & out_tokens, + std::vector & out_samples_begin, + std::vector & out_samples_size) { + struct llama_file f(filename, "rb"); + + if (f.size == 0) { + out_tokens.clear(); + out_samples_begin.clear(); + out_samples_size.clear(); + printf("%s: warning: empty or not existing training data file '%s'\n", + __func__, filename); + return out_tokens.size(); + } + + // account for possible leading whitespace that will be added by tokenizer + // e.g. '\t' will be tokenized by llama spm tokenizer to [29871, 12] + const int n_max_tokens_overhead = 1; + + std::vector buf; + buf.resize(f.size); + + f.read_raw(buf.data(), f.size); + + std::vector utf8_units; + std::vector utf8_nunits; + utf8_units.resize(buf.size()); + utf8_nunits.resize(buf.size()); + mark_utf8_units(buf.data(), utf8_units.data(), utf8_nunits.data(), buf.size()); + + if (sample_start.size() == 0) { + // tokenize all data at once + out_tokens.resize(buf.size() + n_max_tokens_overhead); + + int n_tokens = llama_tokenize( + llama_get_model(lctx), + buf.data(), + (int) buf.size(), + out_tokens.data(), + (int) out_tokens.size(), + false); + if (n_tokens < 0) { + out_tokens.resize(-n_tokens); + n_tokens = llama_tokenize( + llama_get_model(lctx), + buf.data(), + (int) buf.size(), + out_tokens.data(), + (int) out_tokens.size(), + false); + } + if (n_tokens >= 0) { + out_tokens.resize(n_tokens); + } + + // generate sample starts at all token positions + out_samples_begin.clear(); + out_samples_begin.push_back(0); + out_samples_size.push_back(std::min((size_t) context_length, out_tokens.size())); + size_t end = (out_tokens.size() >= context_length) ? (out_tokens.size() - context_length) : 0; + for (size_t sample_begin = 1; sample_begin < end; ++sample_begin) { + out_samples_begin.push_back(sample_begin); + out_samples_size.push_back(context_length); + } + } else { + // split data into samples and tokenize each sample + std::string data_str(buf.data(), buf.size()); + out_samples_begin.clear(); + out_samples_size.clear(); + out_tokens.clear(); + + // find all positions of pattern sample_start + size_t sample_begin = data_str.find(sample_start, 0); + while (sample_begin != std::string::npos) { + out_samples_begin.push_back(sample_begin); + const size_t search_start = sample_begin + sample_start.size(); + sample_begin = data_str.find(sample_start, search_start); + } + if (out_samples_begin.size() == 0) { + printf("%s: warning: sample start pattern '%s' not found. inserting single sample at data begin\n", + __func__, sample_start.c_str()); + out_samples_begin.push_back(0); + } + + out_samples_size.resize(out_samples_begin.size(), 0); + + std::vector buf_sample; + std::vector tok_sample; + + const size_t sample_begin_offset = (include_sample_start ? 0 : sample_start.size()); + size_t found_too_big_sample = 0; + size_t found_too_small_sample = 0; + size_t found_empty_sample = 0; + size_t found_min_sample_size = SIZE_MAX; + size_t found_max_sample_size = 0; + + size_t max_token_text_size = 0; + int n_vocab = llama_n_vocab(llama_get_model(lctx)); + for (llama_token token=0; token < n_vocab; ++token) { + max_token_text_size = std::max( + max_token_text_size, + strlen(llama_token_get_text(lctx, token))); + } + + // upper bound of context byte length. + // strings with this byte length should always tokenize to at least context_length tokens. + size_t context_byte_len = max_token_text_size*context_length; + + for (unsigned i=0; i 0) { + // sample end is in the middle of an utf8 character. + // advance sample_end to the begin of the next utf8 character. + sample_end += utf8_nunits[sample_end] - utf8_units[sample_end]; + } + size_t sample_size = sample_end - sample_begin; + if (sample_size == 0) { + ++found_empty_sample; + } + + if (sample_size > 0) { + // llama_tokenize expects zero terminated string, + // copy sample into buffer and zero terminate it. + buf_sample.resize(sample_size); + memcpy(buf_sample.data(), data_str.data() + sample_begin, sample_size); + + // printf("sample: '%s'\n", buf_sample.data()); + + // tokenize the sample + tok_sample.resize(buf_sample.size() + n_max_tokens_overhead); + int n_tokens = llama_tokenize(llama_get_model(lctx), + buf_sample.data(), + (int) buf_sample.size(), + tok_sample.data(), + (int) tok_sample.size(), + false); + if (n_tokens < 0) { + tok_sample.resize(-n_tokens); + n_tokens = llama_tokenize(llama_get_model(lctx), + buf_sample.data(), + (int) buf_sample.size(), + tok_sample.data(), + (int) tok_sample.size(), + false); + GGML_ASSERT(n_tokens >= 0); + } + GGML_ASSERT(n_tokens <= (int) tok_sample.size()); + + if ((size_t) n_tokens > context_length) { + ++found_too_big_sample; + } else if ((size_t) n_tokens < context_length) { + ++found_too_small_sample; + } + found_max_sample_size = std::max(found_max_sample_size, (size_t) n_tokens); + found_min_sample_size = std::min(found_min_sample_size, (size_t) n_tokens); + + // write out tokens, start and size of sample + // overwrite the string start position with the token start position + out_samples_begin[i] = out_tokens.size(); + out_samples_size[i] = (size_t) n_tokens; + out_tokens.insert(out_tokens.end(), tok_sample.begin(), tok_sample.begin() + n_tokens); + } else { + out_samples_begin[i] = out_tokens.size(); + out_samples_size[i] = 0; + } + + } + if (found_too_big_sample > 0) { + printf("%s: warning: found %zu samples (max length %zu) that exceed context length of %u. samples will be cut off.\n", + __func__, found_too_big_sample, found_max_sample_size, context_length); + } + + if (found_too_small_sample > 0) { + printf("%s: warning: found %zu samples (min length %zu) that are shorter than context length of %u.\n", + __func__, found_too_small_sample, found_min_sample_size, context_length); + } + + if (found_empty_sample) { + printf("%s: warning: found %zu empty samples.\n", + __func__, found_empty_sample); + } + } + printf("%s: total number of samples: %zu\n", + __func__, out_samples_begin.size()); + + GGML_ASSERT(out_samples_begin.size() == out_samples_size.size()); + + return out_tokens.size(); +} + +std::string get_train_filename(const char * filename, const char * pattern_it, const char * latest, int64_t iteration) { + std::string sit = (iteration >= 0) ? std::to_string(iteration) : std::string(latest); + return replace_str(filename, pattern_it, sit.c_str()); +} + +struct train_params_common get_default_train_params_common() { + struct train_params_common params; + params.fn_train_data = "shakespeare.txt"; + params.fn_checkpoint_in = "checkpoint.gguf"; + params.fn_checkpoint_out = "checkpoint-ITERATION.gguf"; + params.pattern_fn_it = "ITERATION"; + params.fn_latest = "LATEST"; + + params.print_usage = false; + + params.save_every = 10; + + params.seed = -1; + + params.n_ctx = 128; + params.n_threads = 6; + params.n_batch = 8; + params.n_gradient_accumulation = 1; + params.n_epochs = -1; + + params.custom_n_ctx = false; + + params.use_flash = true; + params.use_checkpointing = true; + + params.sample_start = ""; + params.include_sample_start = false; + params.escape = false; + params.overlapping_samples = false; + params.fill_with_next_samples = false; + params.separate_with_eos = false; + params.separate_with_bos = true; + params.sample_random_offsets = false; + params.force_reshuffle = false; + + params.opt_past = 0; + params.opt_delta = 1e-5f; + params.opt_max_no_improvement = 0; + + params.warmup = 100; + params.cos_decay_steps = 1000; + params.cos_decay_restart = 1.1f; + params.cos_decay_min = 0.1f; + params.enable_restart = false; + + params.adam_n_iter = 256; + params.adam_alpha = 1e-3f; + params.adam_min_alpha = 0; + params.adam_decay = 1e-1f; + params.adam_decay_min_ndim = 2; + params.adam_beta1 = 0.9f; + params.adam_beta2 = 0.999f; + params.adam_gclip = 1.0f; + params.adam_eps_f = 0.0f; + return params; +} + +void print_common_train_usage(int /*argc*/, char ** /*argv*/, const struct train_params_common * params) { + // fprintf(stderr, "usage: %s [options]\n", argv[0]); + // fprintf(stderr, "\n"); + // fprintf(stderr, "options:\n"); + // fprintf(stderr, " -h, --help show this help message and exit\n"); + fprintf(stderr, " --train-data FNAME path from which to load training data (default '%s')\n", params->fn_train_data); + fprintf(stderr, " --checkpoint-in FNAME path from which to load training checkpoint (default '%s')\n", params->fn_checkpoint_in); + fprintf(stderr, " --checkpoint-out FNAME path to save training checkpoint (default '%s')\n", params->fn_checkpoint_out); + fprintf(stderr, " --pattern-fn-it STR pattern in output filenames to be replaced by iteration number (default '%s')\n", params->pattern_fn_it); + fprintf(stderr, " --fn-latest STR string to use instead of iteration number for saving latest output (default '%s')\n", params->fn_latest); + fprintf(stderr, " --save-every N save checkpoint and lora every N iterations. Disabled when N <= 0. (default '%d')\n", params->save_every); + fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1, use random seed for -1)\n"); + fprintf(stderr, " -c N, --ctx N Context size used during training (default %d)\n", params->n_ctx); + fprintf(stderr, " -t N, --threads N Number of threads (default %d)\n", params->n_threads); + fprintf(stderr, " -b N, --batch N Parallel batch size (default %d)\n", params->n_batch); + fprintf(stderr, " --grad-acc N Number of gradient accumulation steps (simulates larger batch size of batch*gradacc) (default %d)\n", params->n_gradient_accumulation); + fprintf(stderr, " --sample-start STR Sets the starting point for samples after the specified pattern. If empty use every token position as sample start. (default '%s')\n", params->sample_start.c_str()); + fprintf(stderr, " --include-sample-start Include the sample start in the samples. (default off)\n"); + fprintf(stderr, " --escape process sample start escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n"); + fprintf(stderr, " --overlapping-samples Samples my overlap, will include sample-start of second and following samples. When off, samples will end at begin of next sample. (default off)\n"); + fprintf(stderr, " --fill-with-next-samples Samples shorter than context length will be followed by the next (shuffled) samples. (default off)\n"); + fprintf(stderr, " --separate-with-eos When fill-with-next-samples, insert end-of-sequence token between samples.%s\n", params->separate_with_eos ? " (default)" : ""); + fprintf(stderr, " --separate-with-bos When fill-with-next-samples, insert begin-of-sequence token between samples.%s\n", params->separate_with_bos ? " (default)" : ""); + fprintf(stderr, " --no-separate-with-eos When fill-with-next-samples, don't insert end-of-sequence token between samples.%s\n", !params->separate_with_eos ? " (default)" : ""); + fprintf(stderr, " --no-separate-with-bos When fill-with-next-samples, don't insert begin-of-sequence token between samples.%s\n", !params->separate_with_bos ? " (default)" : ""); + fprintf(stderr, " --sample-random-offsets Use samples beginning at random offsets. Together with fill-with-next-samples this may help for training endless text generation.%s\n", params->sample_random_offsets ? " (default)" : ""); + fprintf(stderr, " --force-reshuffle Force a reshuffling of data at program start, otherwise the shuffling of loaded checkpoint is resumed.\n"); + fprintf(stderr, " --no-flash Don't use flash attention \n"); + fprintf(stderr, " --use-flash Use flash attention (default)\n"); + fprintf(stderr, " --no-checkpointing Don't use gradient checkpointing\n"); + fprintf(stderr, " --use-checkpointing Use gradient checkpointing (default)\n"); + fprintf(stderr, " --warmup N Only for Adam optimizer. Number of warmup steps (default %d)\n", params->warmup); + fprintf(stderr, " --cos-decay-steps N Only for Adam optimizer. Number of cosine decay steps (default %d)\n", params->cos_decay_steps); + fprintf(stderr, " --cos-decay-restart N Only for Adam optimizer. Increase of cosine decay steps after restart (default %f)\n", params->cos_decay_restart); + fprintf(stderr, " --cos-decay-min N Only for Adam optimizer. Cosine decay minimum (default %f)\n", params->cos_decay_min); + fprintf(stderr, " --enable-restart N Only for Adam optimizer. Enable restarts of cos-decay %s\n", params->enable_restart ? "(default)" : ""); + fprintf(stderr, " --disable-restart N Only for Adam optimizer. Disable restarts of cos-decay %s\n", !params->enable_restart ? "(default)" : ""); + fprintf(stderr, " --opt-past N Number of optimization iterations to track for delta convergence test. Disabled when zero. (default %d)\n", params->opt_past); + fprintf(stderr, " --opt-delta N Maximum delta for delta convergence test. Disabled when <= zero. (default %f)\n", params->opt_delta); + fprintf(stderr, " --opt-max-no-improvement N Maximum number of optimization iterations with no improvement. Disabled when <= zero. (default %d)\n", params->opt_max_no_improvement); + fprintf(stderr, " --epochs N Maximum number epochs to process. (default %d)\n", params->n_epochs); + fprintf(stderr, " --adam-iter N Maximum number of Adam optimization iterations for each batch (default %d)\n", params->adam_n_iter); + fprintf(stderr, " --adam-alpha N Adam learning rate alpha (default %f)\n", params->adam_alpha); + fprintf(stderr, " --adam-min-alpha N Adam minimum learning rate alpha - including warmup phase (default %f)\n", params->adam_min_alpha); + fprintf(stderr, " --adam-decay N AdamW weight decay. Values greater zero enable AdamW instead of regular Adam. (default %f)\n", params->adam_decay); + fprintf(stderr, " --adam-decay-min-ndim N Minimum number of tensor dimensions to apply AdamW weight decay. Weight decay is not applied to tensors with less n_dims. (default %d)\n", params->adam_decay_min_ndim); + fprintf(stderr, " --adam-beta1 N AdamW beta1 in interval [0,1). How much to smooth the first moment of gradients. (default %f)\n", params->adam_beta1); + fprintf(stderr, " --adam-beta2 N AdamW beta2 in interval [0,1). How much to smooth the second moment of gradients. (default %f)\n", params->adam_beta2); + fprintf(stderr, " --adam-gclip N AdamW gradient clipping. Disabled when zero. (default %f)\n", params->adam_gclip); + fprintf(stderr, " --adam-epsf N AdamW epsilon for convergence test. Disabled when <= zero. (default %f)\n", params->adam_eps_f); + fprintf(stderr, "\n"); +} + +bool consume_common_train_arg( + int argc, char ** argv, int * idx, struct train_params_common * params, bool * invalid_param +) { + int& i = *idx; + std::string arg = argv[i]; + const std::string arg_prefix = "--"; + if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) { + std::replace(arg.begin(), arg.end(), '_', '-'); + } + if (arg == "--train-data") { + if (++i >= argc) { + *invalid_param = true; + return true; + } + params->fn_train_data = argv[i]; + } else if (arg == "--checkpoint-in") { + if (++i >= argc) { + *invalid_param = true; + return true; + } + params->fn_checkpoint_in = argv[i]; + } else if (arg == "--checkpoint-out") { + if (++i >= argc) { + *invalid_param = true; + return true; + } + params->fn_checkpoint_out = argv[i]; + } else if (arg == "--pattern-fn-it") { + if (++i >= argc) { + *invalid_param = true; + return true; + } + params->pattern_fn_it = argv[i]; + } else if (arg == "--fn-latest") { + if (++i >= argc) { + *invalid_param = true; + return true; + } + params->fn_latest = argv[i]; + } else if (arg == "--save-every") { + if (++i >= argc) { + *invalid_param = true; + return true; + } + params->save_every = std::stoi(argv[i]); + } else if (arg == "-s" || arg == "--seed") { + if (++i >= argc) { + *invalid_param = true; + return true; + } + params->seed = std::stoi(argv[i]); + } else if (arg == "-c" || arg == "--ctx") { + if (++i >= argc) { + *invalid_param = true; + return true; + } + params->n_ctx = std::stoi(argv[i]); + params->custom_n_ctx = true; + } else if (arg == "-t" || arg == "--threads") { + if (++i >= argc) { + *invalid_param = true; + return true; + } + params->n_threads = std::stoi(argv[i]); + } else if (arg == "-b" || arg == "--batch") { + if (++i >= argc) { + *invalid_param = true; + return true; + } + params->n_batch = std::stoi(argv[i]); + } else if (arg == "--grad-acc") { + if (++i >= argc) { + *invalid_param = true; + return true; + } + params->n_gradient_accumulation = std::max(1, std::stoi(argv[i])); + } else if (arg == "--sample-start") { + if (++i >= argc) { + *invalid_param = true; + return true; + } + params->sample_start = std::string(argv[i]); + } else if (arg == "--escape") { + params->escape = true; + } else if (arg == "--include-sample-start") { + params->include_sample_start = true; + } else if (arg == "--overlapping-samples") { + params->overlapping_samples = true; + } else if (arg == "--fill-with-next-samples") { + params->fill_with_next_samples = true; + } else if (arg == "--separate-with-eos") { + params->separate_with_eos = true; + } else if (arg == "--separate-with-bos") { + params->separate_with_bos = true; + } else if (arg == "--no-separate-with-eos") { + params->separate_with_eos = false; + } else if (arg == "--no-separate-with-bos") { + params->separate_with_bos = false; + } else if (arg == "--sample-random-offsets") { + params->sample_random_offsets = true; + } else if (arg == "--force-reshuffle") { + params->force_reshuffle = true; + } else if (arg == "--no-flash") { + params->use_flash = false; + } else if (arg == "--use-flash") { + params->use_flash = true; + } else if (arg == "--no-checkpointing") { + params->use_checkpointing = false; + } else if (arg == "--use-checkpointing") { + params->use_checkpointing = true; + } else if (arg == "--warmup") { + if (++i >= argc) { + *invalid_param = true; + return true; + } + params->warmup = std::stoi(argv[i]); + } else if (arg == "--cos-decay-steps") { + if (++i >= argc) { + *invalid_param = true; + return true; + } + params->cos_decay_steps = std::stoi(argv[i]); + } else if (arg == "--cos-decay-restart") { + if (++i >= argc) { + *invalid_param = true; + return true; + } + params->cos_decay_restart = std::stof(argv[i]); + } else if (arg == "--cos-decay-min") { + if (++i >= argc) { + *invalid_param = true; + return true; + } + params->cos_decay_min = std::stof(argv[i]); + } else if (arg == "--enable-restart") { + params->enable_restart = true; + } else if (arg == "--disable-restart") { + params->enable_restart = false; + } else if (arg == "--opt-past") { + if (++i >= argc) { + *invalid_param = true; + return true; + } + params->opt_past = std::stoi(argv[i]); + } else if (arg == "--opt-delta") { + if (++i >= argc) { + *invalid_param = true; + return true; + } + params->opt_delta = std::stof(argv[i]); + } else if (arg == "--opt-max-no-improvement") { + if (++i >= argc) { + *invalid_param = true; + return true; + } + params->opt_max_no_improvement = std::stoi(argv[i]); + } else if (arg == "--adam-epsf") { + if (++i >= argc) { + *invalid_param = true; + return true; + } + params->adam_eps_f = std::stof(argv[i]); + } else if (arg == "--epochs") { + if (++i >= argc) { + *invalid_param = true; + return true; + } + params->n_epochs = std::stoi(argv[i]); + } else if (arg == "--adam-iter") { + if (++i >= argc) { + *invalid_param = true; + return true; + } + params->adam_n_iter = std::stoi(argv[i]); + } else if (arg == "--adam-alpha") { + if (++i >= argc) { + *invalid_param = true; + return true; + } + params->adam_alpha = std::stof(argv[i]); + } else if (arg == "--adam-min-alpha") { + if (++i >= argc) { + *invalid_param = true; + return true; + } + params->adam_min_alpha = std::stof(argv[i]); + } else if (arg == "--adam-decay") { + if (++i >= argc) { + *invalid_param = true; + return true; + } + params->adam_decay = std::stof(argv[i]); + } else if (arg == "--adam-decay-min-ndim") { + if (++i >= argc) { + *invalid_param = true; + return true; + } + params->adam_decay_min_ndim = std::stoi(argv[i]); + } else if (arg == "--adam-beta1") { + if (++i >= argc) { + *invalid_param = true; + return true; + } + params->adam_beta1 = std::stof(argv[i]); + } else if (arg == "--adam-beta2") { + if (++i >= argc) { + *invalid_param = true; + return true; + } + params->adam_beta2 = std::stof(argv[i]); + } else if (arg == "--adam-gclip") { + if (++i >= argc) { + *invalid_param = true; + return true; + } + params->adam_gclip = std::stof(argv[i]); + } else if (arg == "-h" || arg == "--help") { + params->print_usage = true; + return true; + } else { + return false; + } + return true; +} + +void finish_processing_train_args(struct train_params_common * params) { + if (params->escape) { + process_escapes(params->sample_start); + } +} + +void train_opt_callback(void * vdata, int accum_step, float * sched, bool * cancel) { + struct train_opt_callback_data * data = (struct train_opt_callback_data *) vdata; + struct train_params_common * params = data->params; + struct train_state * train = data->train; + struct ggml_opt_context * opt = train->opt; + int n_batch = params->n_batch; + int n_ctx = params->n_ctx; + + if (accum_step == 0) { + // time measurement + int64_t now = ggml_time_ms(); + if (now > data->last_time && opt->iter > data->first_iter) { + double dt = (double) (now - data->last_time); + if (data->millis_per_iter == 0.0) { + data->millis_per_iter = dt; + } else { + const double gain = 0.7; + data->millis_per_iter = data->millis_per_iter*(1.0-gain) + dt*gain; + } + } + + double remaining_millis = 0.0; + if (data->millis_per_iter > 0.0) { + const int n_iter = params->adam_n_iter; + const int done_iter = opt->iter - data->first_iter; + const int remaining_iter = n_iter - done_iter; + remaining_millis = remaining_iter * data->millis_per_iter; + } + + // file saving + const bool save_now = (params->save_every > 0) && (opt->iter - data->last_save_iter >= params->save_every); + if (save_now) { + int new_iters = opt->iter - data->last_save_iter; + train->train_its += new_iters; + train->train_tokens += new_iters * opt->params.n_gradient_accumulation * n_batch * n_ctx; + + if (data->save_cb) { + data->save_cb(data->save_data, train); + } + + data->last_save_iter = opt->iter; + } + + // exclude file saving from time measurement, by measuring last_time after saving + data->last_time = ggml_time_ms(); + + *sched = learning_schedule( + opt->iter, + params->warmup, + params->cos_decay_steps, + params->adam_alpha, + params->adam_min_alpha, + params->cos_decay_min, + params->cos_decay_restart, + params->enable_restart); + + int impr_plot = -(int)(1 + (opt->loss_before - opt->loss_after) * 10.0f + 0.5f); + if (impr_plot > 0) impr_plot = 0; + if (std::isnan(opt->loss_before) || std::isnan(opt->loss_before)) impr_plot = 0; + printf("%s: iter=%6d sample=%zu/%zu sched=%f loss=%f", + __func__, opt->iter, std::min(1+train->shuffle_next_sample, train->shuffle_sample_count), train->shuffle_sample_count, + *sched, opt->loss_after); + + + if (data->millis_per_iter > 0) { + printf(" dt="); + print_duration(data->millis_per_iter); + printf(" eta="); + print_duration(remaining_millis); + } + + float improvement = opt->loss_before - opt->loss_after; + const float plot_scale = 10.0f; + int bar_len = (int)(1 + improvement*plot_scale + 0.5); + printf(" |"); + for (int i=0; i"); + printf("\n"); + } + + int64_t used_samples = get_example_targets_batch( + data->lctx, + data->tokens_input, + data->target_probs, + train->shuffle_next_sample, + data->shuffled_samples_offs, + data->shuffled_samples_begin, + data->shuffled_samples_size, + data->samples_count, + data->tokens_data, + data->tokens_size, + params->separate_with_eos, + params->separate_with_bos, + params->fill_with_next_samples, + params->sample_random_offsets); + + train->train_samples += used_samples; + train->shuffle_next_sample += used_samples; + + if (train->shuffle_next_sample >= train->shuffle_sample_count) { + ++train->train_epochs; + printf("%s: reshuffle samples. completed epochs: %llu\n", __func__, (long long unsigned) train->train_epochs); + // note: we may have used some samples from the current shuffling more than once + train->shuffle_rng_state_current = train->shuffle_rng_state_next; + train->shuffle_rng_state_next = shuffle_samples( + train->shuffle_rng_state_current, + data->shuffled_samples_offs, + data->shuffled_samples_begin, + data->shuffled_samples_size, + data->samples_begin, + data->samples_size, + data->samples_count); + train->shuffle_next_sample = 0; + } + + const bool last_epoch_reached = (params->n_epochs > 0 && (int64_t) train->train_epochs - data->first_epoch >= params->n_epochs); + if (last_epoch_reached) { + // allow optimization iteration at last epoch to be completed before canceling + if (data->iter_at_last_epoch < 0) { + data->iter_at_last_epoch = opt->iter; + } else if (opt->iter > data->iter_at_last_epoch) { + *cancel = true; + } + } +} diff --git a/common/train.h b/common/train.h new file mode 100644 index 0000000000000000000000000000000000000000..42fa704b897ae56051aaaef16a71c77fe2fd6c17 --- /dev/null +++ b/common/train.h @@ -0,0 +1,230 @@ +// Various helper functions and utilities for training + +#pragma once + +#include +#include +#include + +#include "ggml.h" +#include "llama.h" + +typedef std::string mt19937_state; + +struct train_state { + struct ggml_opt_context * opt; + + uint64_t train_its; + uint64_t train_samples; + uint64_t train_tokens; + uint64_t train_epochs; + + size_t shuffle_samples_hash; // fn, sample_count, *zip(sample_begins, sample_sizes) + mt19937_state shuffle_rng_state_current; + mt19937_state shuffle_rng_state_next; + size_t shuffle_sample_count; + size_t shuffle_next_sample; +}; + +struct train_params_common { + const char * fn_train_data; + const char * fn_checkpoint_in; + const char * fn_checkpoint_out; + const char * pattern_fn_it; + const char * fn_latest; + + bool print_usage; + + int save_every; + + uint32_t seed; + + int n_ctx; + int n_threads; + int n_batch; + int n_gradient_accumulation; + int n_epochs; + + bool custom_n_ctx; + + bool use_flash; + bool use_checkpointing; + + std::string sample_start; + bool include_sample_start; + bool escape; + bool overlapping_samples; + bool fill_with_next_samples; + bool separate_with_eos; + bool separate_with_bos; + bool sample_random_offsets; + + bool force_reshuffle; + + int warmup; + int cos_decay_steps; + float cos_decay_restart; + float cos_decay_min; + bool enable_restart; + + int opt_past; + float opt_delta; + int opt_max_no_improvement; + + int adam_n_iter; + float adam_alpha; + float adam_min_alpha; + float adam_decay; + int adam_decay_min_ndim; + float adam_beta1; + float adam_beta2; + float adam_gclip; + float adam_eps_f; +}; + +typedef void (*save_train_files_callback)(void * data, struct train_state * train); + +struct train_opt_callback_data { + struct train_params_common * params; + struct train_state * train; + save_train_files_callback save_cb; + void * save_data; + struct llama_context * lctx; + int last_save_iter; + llama_token * tokens_data; + size_t tokens_size; + size_t * samples_begin; + size_t * samples_size; + size_t * shuffled_samples_offs; + size_t * shuffled_samples_begin; + size_t * shuffled_samples_size; + size_t samples_count; + struct ggml_tensor * tokens_input; + struct ggml_tensor * target_probs; + int first_iter; + int first_epoch; + int iter_at_last_epoch; + int64_t last_time; + double millis_per_iter; +}; + +struct train_state * init_train_state(); +void free_train_state(struct train_state * state); + +struct train_params_common get_default_train_params_common(); +void print_common_train_usage(int /*argc*/, char ** argv, const struct train_params_common * params); + +bool consume_common_train_arg(int argc, char ** argv, int * idx, struct train_params_common * params, bool * invalid_param); +void finish_processing_train_args(struct train_params_common * params); + +struct random_normal_distribution; +struct random_uniform_distribution; + +struct random_normal_distribution * init_random_normal_distribution (int seed, float mean, float std, float min, float max); +struct random_uniform_distribution * init_random_uniform_distribution(int seed, float min, float max); + +void free_random_normal_distribution (struct random_normal_distribution * rnd); +void free_random_uniform_distribution(struct random_uniform_distribution * rnd); + +struct ggml_tensor * randomize_tensor_normal (struct ggml_tensor * tensor, struct random_normal_distribution * rnd); +struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struct random_uniform_distribution * rnd); + +// generate random float in interval [0,1) +float frand(); +float frand_normal (struct random_normal_distribution * rnd); +float frand_uniform(struct random_uniform_distribution * rnd); + +int clamp (const int v, const int min, const int max); +float fclamp(const float v, const float min, const float max); + +void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0); +void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1); +void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2); +void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3); + +size_t tokenize_file( + struct llama_context * lctx, + const char * filename, + const std::string & sample_start, + bool include_sample_start, + bool overlapping_samples, + unsigned context_length, + std::vector & out_tokens, + std::vector & out_samples_begin, + std::vector & out_samples_size); + +int64_t get_example_targets_batch( + struct llama_context * lctx, + struct ggml_tensor * tokens_input, + struct ggml_tensor * target_probs, + int64_t example_id, + const size_t * samples_offs, + const size_t * samples_begin, + const size_t * samples_size, + size_t samples_count, + const llama_token * train_data, + size_t n_train_data, + bool separate_with_eos, + bool separate_with_bos, + bool fill_with_next_samples, + bool sample_random_offsets); + + +void mt19937_set_state(std::mt19937& rng, const mt19937_state& rng_state); +mt19937_state mt19937_get_state(const std::mt19937& rng); +mt19937_state mt19937_seed_to_state(unsigned seed); + +mt19937_state shuffle_samples( + const mt19937_state & rng_state, + size_t * shuffled_offs, + size_t * shuffled_begins, + size_t * shuffled_sizes, + const size_t * begins, + const size_t * sizes, + size_t count); + +size_t hash_combine(size_t h1, size_t h2); + +size_t compute_samples_hash( + const char* fn, + const size_t* samples_begin, + const size_t* samples_size, + size_t sample_count); + + +std::string replace_str(const char * s, const char * needle, const char * replacement); + +void print_duration(double milliseconds); + +float cosine_decay( + int64_t step, + int64_t decay_steps, + float minimum); + +float cosine_decay_restart( + int64_t step, + int64_t decay_steps, + float minimum, + float restart_step_mult); + +float learning_schedule( + int64_t step, + int64_t warmup_steps, + int64_t decay_steps, + float learning_rate, + float overall_minimum, + float cos_decay_minimum, + float cos_decay_restart_step_mult, + bool enable_restart); + +void copy_tensor_by_name(struct ggml_tensor * dst, struct ggml_context * ctx, const char * name); + +void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct ggml_opt_context * opt); +void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context * opt); + +bool load_train_state_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct train_state * train); +void save_train_state_gguf(struct gguf_context * fctx, struct train_state * train); + +std::string get_train_filename(const char * filename, const char * pattern_it, const char * latest, int64_t iteration); + +void train_opt_callback(void * vdata, int accum_step, float * sched, bool * cancel); diff --git a/convert-falcon-hf-to-gguf.py b/convert-falcon-hf-to-gguf.py index 88338d823b03585275be7c9f36ba9b4c7e32abd5..958358563ccdcfca15691c38c4e04aa1db45c6b8 100644 --- a/convert-falcon-hf-to-gguf.py +++ b/convert-falcon-hf-to-gguf.py @@ -133,8 +133,6 @@ gguf_writer.add_file_type(ftype) print("gguf: get tokenizer metadata") tokens: list[bytearray] = [] -scores: list[float] = [] -toktypes: list[int] = [] tokenizer_json_file = dir_model / 'tokenizer.json' if not tokenizer_json_file.is_file(): @@ -177,12 +175,8 @@ for i in range(vocab_size): text = bytearray(pad_token) tokens.append(text) - scores.append(0.0) # dymmy - toktypes.append(gguf.TokenType.NORMAL) # dummy gguf_writer.add_token_list(tokens) -gguf_writer.add_token_scores(scores) -gguf_writer.add_token_types(toktypes) special_vocab = gguf.SpecialVocab(dir_model, load_merges = True) special_vocab.add_to_gguf(gguf_writer) diff --git a/convert-starcoder-hf-to-gguf.py b/convert-starcoder-hf-to-gguf.py index 331e84e985a2f1fd2e11423101777364962123de..48e88a777fea1db20a1f42633203d326578de16e 100644 --- a/convert-starcoder-hf-to-gguf.py +++ b/convert-starcoder-hf-to-gguf.py @@ -117,8 +117,6 @@ gguf_writer.add_file_type(ftype) print("gguf: get tokenizer metadata") tokens: list[bytearray] = [] -scores: list[float] = [] -toktypes: list[int] = [] tokenizer_json_file = dir_model / 'tokenizer.json' if not tokenizer_json_file.is_file(): @@ -161,12 +159,8 @@ for i in range(vocab_size): text = bytearray(pad_token) tokens.append(text) - scores.append(0.0) # dymmy - toktypes.append(gguf.TokenType.NORMAL) # dummy gguf_writer.add_token_list(tokens) -gguf_writer.add_token_scores(scores) -gguf_writer.add_token_types(toktypes) special_vocab = gguf.SpecialVocab(dir_model, load_merges = True) special_vocab.add_to_gguf(gguf_writer) diff --git a/convert.py b/convert.py index 4ac5030db61eb2ce1cdc99242f4a9100f43bb768..8bb6c7e4108523b2983d693cdd0e7949f4fafcd9 100644 --- a/convert.py +++ b/convert.py @@ -439,7 +439,7 @@ Vocab: TypeAlias = 'BpeVocab | SentencePieceVocab' def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray: #print( "permute debug " + str(weights.shape[0]) + " x " + str(weights.shape[1]) + " nhead " + str(n_head) + " nheadkv " + str(n_kv_head) ) if n_head_kv is not None and n_head != n_head_kv: - n_head //= n_head_kv + n_head = n_head_kv return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) .swapaxes(1, 2) .reshape(weights.shape)) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 884c4276422ebf44db08737d210aaf599296d9f2..de4cf7a6917686893d9bcd76eea83d52e15f1669 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -21,9 +21,12 @@ else() add_subdirectory(benchmark) add_subdirectory(baby-llama) add_subdirectory(train-text-from-scratch) + add_subdirectory(finetune) add_subdirectory(convert-llama2c-to-ggml) add_subdirectory(simple) + add_subdirectory(batched) add_subdirectory(speculative) + add_subdirectory(parallel) add_subdirectory(embd-input) add_subdirectory(llama-bench) add_subdirectory(beam-search) @@ -33,4 +36,5 @@ else() if (LLAMA_BUILD_SERVER) add_subdirectory(server) endif() + add_subdirectory(export-lora) endif() diff --git a/examples/baby-llama/baby-llama.cpp b/examples/baby-llama/baby-llama.cpp index ed61125eaa4da4fd6340b52235ae968b7fb2e3e9..8155101d0ab936d8dd6b0a581626523305cf279a 100644 --- a/examples/baby-llama/baby-llama.cpp +++ b/examples/baby-llama/baby-llama.cpp @@ -1,8 +1,12 @@ #include "ggml.h" +#include "train.h" + #include #include -#include +#include #include +#include +#include #if defined(_MSC_VER) #pragma warning(disable: 4244 4267) // possible loss of data @@ -14,31 +18,6 @@ constexpr float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS; constexpr float rms_norm_eps = 5e-6f; #endif -static float frand() { - return (float)rand()/(float)RAND_MAX; -} - -struct random_normal_distribution { - std::mt19937 gen; - std::normal_distribution nd; - float min; - float max; -}; - -static void init_random_normal_distribution( - struct random_normal_distribution * rnd, int seed, float mean, float std, float min, float max -) { - rnd->gen = std::mt19937(seed); - rnd->nd = std::normal_distribution{mean, std}; - rnd->min = min; - rnd->max = max; -} - -static float frand_normal(struct random_normal_distribution * rnd) { - const float r = rnd->nd(rnd->gen); - return ((r < rnd->min) ? (rnd->min) : (r > rnd->max) ? (rnd->max) : r); -} - static void ggml_graph_compute_helper(std::vector & buf, ggml_cgraph * graph, int n_threads) { struct ggml_cplan plan = ggml_graph_plan(graph, n_threads); @@ -88,55 +67,7 @@ static struct ggml_tensor * randomize_tensor( break; default: assert(false); - }; - - return tensor; -} - -static struct ggml_tensor * randomize_tensor_normal( - struct ggml_tensor * tensor, int ndims, const int64_t ne[], struct random_normal_distribution * rnd -) { - float scale = 1.0; // xavier - switch (ndims) { - case 1: - scale /= sqrtf(ne[0]); - for (int i0 = 0; i0 < ne[0]; i0++) { - ((float *)tensor->data)[i0] = scale * frand_normal(rnd); - } - break; - case 2: - scale /= sqrtf(ne[0]+ne[1]); - for (int i1 = 0; i1 < ne[1]; i1++) { - for (int i0 = 0; i0 < ne[0]; i0++) { - ((float *)tensor->data)[i1*ne[0] + i0] = scale * frand_normal(rnd); - } - } - break; - case 3: - scale /= sqrtf(ne[0]+ne[1]); - for (int i2 = 0; i2 < ne[2]; i2++) { - for (int i1 = 0; i1 < ne[1]; i1++) { - for (int i0 = 0; i0 < ne[0]; i0++) { - ((float *)tensor->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = scale * frand_normal(rnd); - } - } - } - break; - case 4: - scale /= sqrtf(ne[0]+ne[1]); - for (int i3 = 0; i3 < ne[3]; i3++) { - for (int i2 = 0; i2 < ne[2]; i2++) { - for (int i1 = 0; i1 < ne[1]; i1++) { - for (int i0 = 0; i0 < ne[0]; i0++) { - ((float *)tensor->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = scale * frand_normal(rnd); - } - } - } - } - break; - default: - assert(false); - }; + } return tensor; } @@ -398,27 +329,29 @@ static void randomize_model(struct llama_model * model, int seed, float mean, fl const uint32_t n_layer = hparams.n_layer; - struct random_normal_distribution rnd; - init_random_normal_distribution(&rnd, seed, mean, std, min, max); - randomize_tensor_normal(model->tok_embeddings, model->tok_embeddings->n_dims, model->tok_embeddings->ne, &rnd); - randomize_tensor_normal(model->norm, model->norm->n_dims, model->norm->ne, &rnd); - randomize_tensor_normal(model->output, model->output->n_dims, model->output->ne, &rnd); + struct random_normal_distribution * rnd = init_random_normal_distribution(seed, mean, std, min, max); + + randomize_tensor_normal(model->tok_embeddings , rnd); + randomize_tensor_normal(model->norm , rnd); + randomize_tensor_normal(model->output , rnd); for (uint32_t i = 0; i < n_layer; ++i) { auto & layer = model->layers[i]; - randomize_tensor_normal(layer.attention_norm, layer.attention_norm->n_dims, layer.attention_norm->ne, &rnd); + randomize_tensor_normal(layer.attention_norm, rnd); - randomize_tensor_normal(layer.wq, layer.wq->n_dims, layer.wq->ne, &rnd); - randomize_tensor_normal(layer.wk, layer.wk->n_dims, layer.wk->ne, &rnd); - randomize_tensor_normal(layer.wv, layer.wv->n_dims, layer.wv->ne, &rnd); - randomize_tensor_normal(layer.wo, layer.wo->n_dims, layer.wo->ne, &rnd); + randomize_tensor_normal(layer.wq, rnd); + randomize_tensor_normal(layer.wk, rnd); + randomize_tensor_normal(layer.wv, rnd); + randomize_tensor_normal(layer.wo, rnd); - randomize_tensor_normal(layer.ffn_norm, layer.ffn_norm->n_dims, layer.ffn_norm->ne, &rnd); + randomize_tensor_normal(layer.ffn_norm, rnd); - randomize_tensor_normal(layer.w1, layer.w1->n_dims, layer.w1->ne, &rnd); - randomize_tensor_normal(layer.w2, layer.w2->n_dims, layer.w2->ne, &rnd); - randomize_tensor_normal(layer.w3, layer.w3->n_dims, layer.w3->ne, &rnd); + randomize_tensor_normal(layer.w1, rnd); + randomize_tensor_normal(layer.w2, rnd); + randomize_tensor_normal(layer.w3, rnd); } + + free_random_normal_distribution(rnd); } @@ -429,35 +362,37 @@ static void randomize_model_lora( const uint32_t n_layer = hparams.n_layer; - struct random_normal_distribution rnd; - init_random_normal_distribution(&rnd, seed, mean, std, min, max); - randomize_tensor_normal(model->tok_embeddings, model->tok_embeddings->n_dims, model->tok_embeddings->ne, &rnd); - randomize_tensor_normal(model->norm, model->norm->n_dims, model->norm->ne, &rnd); - randomize_tensor_normal(model->outputa, model->outputa->n_dims, model->outputa->ne, &rnd); - randomize_tensor_normal(model->outputb, model->outputb->n_dims, model->outputb->ne, &rnd); + struct random_normal_distribution * rnd = init_random_normal_distribution(seed, mean, std, min, max); + + randomize_tensor_normal(model->tok_embeddings, rnd); + randomize_tensor_normal(model->norm , rnd); + randomize_tensor_normal(model->outputa , rnd); + randomize_tensor_normal(model->outputb , rnd); for (uint32_t i = 0; i < n_layer; ++i) { auto & layer = model->layers[i]; - randomize_tensor_normal(layer.attention_norm, layer.attention_norm->n_dims, layer.attention_norm->ne, &rnd); - - randomize_tensor_normal(layer.wqa, layer.wqa->n_dims, layer.wqa->ne, &rnd); - randomize_tensor_normal(layer.wqb, layer.wqb->n_dims, layer.wqb->ne, &rnd); - randomize_tensor_normal(layer.wka, layer.wka->n_dims, layer.wka->ne, &rnd); - randomize_tensor_normal(layer.wkb, layer.wkb->n_dims, layer.wkb->ne, &rnd); - randomize_tensor_normal(layer.wva, layer.wva->n_dims, layer.wva->ne, &rnd); - randomize_tensor_normal(layer.wvb, layer.wvb->n_dims, layer.wvb->ne, &rnd); - randomize_tensor_normal(layer.woa, layer.woa->n_dims, layer.woa->ne, &rnd); - randomize_tensor_normal(layer.wob, layer.wob->n_dims, layer.wob->ne, &rnd); - - randomize_tensor_normal(layer.ffn_norm, layer.ffn_norm->n_dims, layer.ffn_norm->ne, &rnd); - - randomize_tensor_normal(layer.w1, layer.w1->n_dims, layer.w1->ne, &rnd); - randomize_tensor_normal(layer.w2, layer.w2->n_dims, layer.w2->ne, &rnd); - randomize_tensor_normal(layer.w3, layer.w3->n_dims, layer.w3->ne, &rnd); + randomize_tensor_normal(layer.attention_norm, rnd); + + randomize_tensor_normal(layer.wqa, rnd); + randomize_tensor_normal(layer.wqb, rnd); + randomize_tensor_normal(layer.wka, rnd); + randomize_tensor_normal(layer.wkb, rnd); + randomize_tensor_normal(layer.wva, rnd); + randomize_tensor_normal(layer.wvb, rnd); + randomize_tensor_normal(layer.woa, rnd); + randomize_tensor_normal(layer.wob, rnd); + + randomize_tensor_normal(layer.ffn_norm, rnd); + + randomize_tensor_normal(layer.w1, rnd); + randomize_tensor_normal(layer.w2, rnd); + randomize_tensor_normal(layer.w3, rnd); } + + free_random_normal_distribution(rnd); } -static bool init_kv_cache(struct llama_kv_cache* cache, struct llama_model * model, int n_batch) { +static void init_kv_cache(struct llama_kv_cache* cache, struct llama_model * model, int n_batch) { const auto & hparams = model->hparams; const uint32_t n_ctx = hparams.n_ctx; @@ -483,14 +418,12 @@ static bool init_kv_cache(struct llama_kv_cache* cache, struct llama_model * mod if (!cache->ctx) { fprintf(stderr, "%s: failed to allocate memory for kv cache\n", __func__); - return false; + exit(1); } } cache->k = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements); cache->v = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements); - - return true; } static bool init_kv_cache_lora(struct llama_kv_cache* cache, struct llama_model_lora * model, int n_batch) { @@ -554,6 +487,14 @@ static struct ggml_tensor * forward( struct ggml_tensor * kc = kv_self.k; struct ggml_tensor * vc = kv_self.v; + struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + { + int * data = (int *) KQ_pos->data; + for (int i = 0; i < N; ++i) { + data[i] = n_past + i; + } + } + // inpL shape [n_embd,N,1,1] struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens); for (int il = 0; il < n_layer; ++il) { @@ -581,8 +522,8 @@ static struct ggml_tensor * forward( // wk shape [n_embd, n_embd, 1, 1] // Qcur shape [n_embd/n_head, n_head, N, 1] // Kcur shape [n_embd/n_head, n_head, N, 1] - struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, 0); - struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, 0); + struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), KQ_pos, n_rot, 0, 0); + struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), KQ_pos, n_rot, 0, 0); // store key and value to memory { @@ -754,32 +695,6 @@ static struct ggml_tensor * forward( return inpL; } -static void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) { - GGML_ASSERT(tensor->n_dims == 1); - GGML_ASSERT(tensor->ne[0] == ne0); -} - -static void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) { - GGML_ASSERT(tensor->n_dims == 2); - GGML_ASSERT(tensor->ne[0] == ne0); - GGML_ASSERT(tensor->ne[1] == ne1); -} - -static void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) { - GGML_ASSERT(tensor->n_dims == 3); - GGML_ASSERT(tensor->ne[0] == ne0); - GGML_ASSERT(tensor->ne[1] == ne1); - GGML_ASSERT(tensor->ne[2] == ne2); -} - -static void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) { - GGML_ASSERT(tensor->n_dims == 4); - GGML_ASSERT(tensor->ne[0] == ne0); - GGML_ASSERT(tensor->ne[1] == ne1); - GGML_ASSERT(tensor->ne[2] == ne2); - GGML_ASSERT(tensor->ne[3] == ne3); -} - static struct ggml_tensor * forward_batch( struct llama_model * model, struct llama_kv_cache * cache, @@ -808,9 +723,18 @@ static struct ggml_tensor * forward_batch( struct ggml_tensor * kc = kv_self.k; struct ggml_tensor * vc = kv_self.v; + struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + { + int * data = (int *) KQ_pos->data; + for (int i = 0; i < N; ++i) { + data[i] = n_past + i; + } + } + // inpL shape [n_embd,N*n_batch,1] struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens); assert_shape_2d(inpL, n_embd, N*n_batch); + for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -838,8 +762,8 @@ static struct ggml_tensor * forward_batch( // wk shape [n_embd, n_embd, 1, 1] // Qcur shape [n_embd/n_head, n_head, N, n_batch] // Kcur shape [n_embd/n_head, n_head, N, n_batch] - struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0); - struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0); + struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), KQ_pos, n_rot, 0, 0); + struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), KQ_pos, n_rot, 0, 0); assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch); assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch); @@ -1097,6 +1021,14 @@ static struct ggml_tensor * forward_lora( struct ggml_tensor * kc = kv_self.k; struct ggml_tensor * vc = kv_self.v; + struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + { + int * data = (int *) KQ_pos->data; + for (int i = 0; i < N; ++i) { + data[i] = n_past + i; + } + } + // inpL shape [n_embd,N,1,1] struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens); for (int il = 0; il < n_layer; ++il) { @@ -1130,7 +1062,7 @@ static struct ggml_tensor * forward_lora( model->layers[il].wqb, cur)), n_embd/n_head, n_head, N), - n_past, n_rot, 0, 0); + KQ_pos, n_rot, 0, 0); struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, @@ -1139,7 +1071,7 @@ static struct ggml_tensor * forward_lora( model->layers[il].wkb, cur)), n_embd/n_head, n_head, N), - n_past, n_rot, 0, 0); + KQ_pos, n_rot, 0, 0); // store key and value to memory { diff --git a/examples/batched/CMakeLists.txt b/examples/batched/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..6aa178d4d5911fb731b90218b2c2d9032dffe451 --- /dev/null +++ b/examples/batched/CMakeLists.txt @@ -0,0 +1,5 @@ +set(TARGET batched) +add_executable(${TARGET} batched.cpp) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/batched/README.md b/examples/batched/README.md new file mode 100644 index 0000000000000000000000000000000000000000..5d730331769fb1a950ce94fc8e10c0b40ba36576 --- /dev/null +++ b/examples/batched/README.md @@ -0,0 +1,44 @@ +# llama.cpp/example/batched + +The example demonstrates batched generation from a given prompt + +```bash +./batched ./models/llama-7b-v2/ggml-model-f16.gguf "Hello my name is" 4 + +... + +main: n_len = 32, n_ctx = 2048, n_parallel = 4, n_kv_req = 113 + + Hello my name is + +main: generating 4 sequences ... + +main: stream 0 finished +main: stream 1 finished +main: stream 2 finished +main: stream 3 finished + +sequence 0: + +Hello my name is Shirley. I am a 25-year-old female who has been working for over 5 years as a b + +sequence 1: + +Hello my name is Renee and I'm a 32 year old female from the United States. I'm looking for a man between + +sequence 2: + +Hello my name is Diana. I am looking for a housekeeping job. I have experience with children and have my own transportation. I am + +sequence 3: + +Hello my name is Cody. I am a 3 year old neutered male. I am a very friendly cat. I am very playful and + +main: decoded 108 tokens in 3.57 s, speed: 30.26 t/s + +llama_print_timings: load time = 587.00 ms +llama_print_timings: sample time = 2.56 ms / 112 runs ( 0.02 ms per token, 43664.72 tokens per second) +llama_print_timings: prompt eval time = 4089.11 ms / 118 tokens ( 34.65 ms per token, 28.86 tokens per second) +llama_print_timings: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second) +llama_print_timings: total time = 4156.04 ms +``` diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp new file mode 100644 index 0000000000000000000000000000000000000000..688ef221335a98f63117855c5844d39a4f5591cf --- /dev/null +++ b/examples/batched/batched.cpp @@ -0,0 +1,255 @@ +#include "common.h" +#include "llama.h" + +#include +#include +#include +#include +#include + +int main(int argc, char ** argv) { + gpt_params params; + + if (argc == 1 || argv[1][0] == '-') { + printf("usage: %s MODEL_PATH [PROMPT] [PARALLEL]\n" , argv[0]); + return 1 ; + } + + int n_parallel = 1; + + if (argc >= 2) { + params.model = argv[1]; + } + + if (argc >= 3) { + params.prompt = argv[2]; + } + + if (argc >= 4) { + n_parallel = std::atoi(argv[3]); + } + + if (params.prompt.empty()) { + params.prompt = "Hello my name is"; + } + + // total length of the sequences including the prompt + const int n_len = 32; + + // init LLM + + llama_backend_init(params.numa); + + // initialize the model + + llama_model_params model_params = llama_model_default_params(); + + // model_params.n_gpu_layers = 99; // offload all layers to the GPU + + llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params); + + if (model == NULL) { + fprintf(stderr , "%s: error: unable to load model\n" , __func__); + return 1; + } + + // tokenize the prompt + + std::vector tokens_list; + tokens_list = ::llama_tokenize(model, params.prompt, true); + const int n_kv_req = tokens_list.size() + (n_len - tokens_list.size())*n_parallel; + + // initialize the context + + llama_context_params ctx_params = llama_context_default_params(); + + ctx_params.seed = 1234; + ctx_params.n_ctx = n_kv_req; + ctx_params.n_batch = std::max(n_len, n_parallel); + ctx_params.n_threads = params.n_threads; + ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch; + + llama_context * ctx = llama_new_context_with_model(model, ctx_params); + + if (ctx == NULL) { + fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__); + return 1; + } + + const int n_ctx = llama_n_ctx(ctx); + + LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_batch = %d, n_parallel = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req); + + // make sure the KV cache is big enough to hold all the prompt and generated tokens + if (n_kv_req > n_ctx) { + LOG_TEE("%s: error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", __func__, n_kv_req); + LOG_TEE("%s: either reduce n_parallel or increase n_ctx\n", __func__); + return 1; + } + + // print the prompt token-by-token + + fprintf(stderr, "\n"); + + for (auto id : tokens_list) { + fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str()); + } + + fflush(stderr); + + // create a llama_batch with size 512 + // we use this object to submit token data for decoding + + llama_batch batch = llama_batch_init(std::max(tokens_list.size(), (size_t)n_parallel), 0); + + // evaluate the initial prompt + batch.n_tokens = tokens_list.size(); + + for (int32_t i = 0; i < batch.n_tokens; i++) { + batch.token[i] = tokens_list[i]; + batch.pos[i] = i; + batch.seq_id[i] = 0; + batch.logits[i] = false; + } + + // llama_decode will output logits only for the last token of the prompt + batch.logits[batch.n_tokens - 1] = true; + + if (llama_decode(ctx, batch) != 0) { + LOG_TEE("%s: llama_decode() failed\n", __func__); + return 1; + } + + // assign the system KV cache to all parallel sequences + // this way, the parallel sequences will "reuse" the prompt tokens without having to copy them + for (int32_t i = 1; i < n_parallel; ++i) { + llama_kv_cache_seq_cp(ctx, 0, i, 0, batch.n_tokens); + } + + if (n_parallel > 1) { + LOG_TEE("\n\n%s: generating %d sequences ...\n", __func__, n_parallel); + } + + // main loop + + // we will store the parallel decoded sequences in this vector + std::vector streams(n_parallel); + + // remember the batch index of the last token for each parallel sequence + // we need this to determine which logits to sample from + std::vector i_batch(n_parallel, batch.n_tokens - 1); + + int n_cur = batch.n_tokens; + int n_decode = 0; + + const auto t_main_start = ggml_time_us(); + + while (n_cur <= n_len) { + // prepare the next batch + batch.n_tokens = 0; + + // sample the next token for each parallel sequence / stream + for (int32_t i = 0; i < n_parallel; ++i) { + if (i_batch[i] < 0) { + // the stream has already finished + continue; + } + + auto n_vocab = llama_n_vocab(model); + auto * logits = llama_get_logits_ith(ctx, i_batch[i]); + + std::vector candidates; + candidates.reserve(n_vocab); + + for (llama_token token_id = 0; token_id < n_vocab; token_id++) { + candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f }); + } + + llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; + + const int top_k = 40; + const float top_p = 0.9f; + const float temp = 0.4f; + + llama_sample_top_k(ctx, &candidates_p, top_k, 1); + llama_sample_top_p(ctx, &candidates_p, top_p, 1); + llama_sample_temp (ctx, &candidates_p, temp); + + const llama_token new_token_id = llama_sample_token(ctx, &candidates_p); + + //const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p); + + // is it an end of stream? -> mark the stream as finished + if (new_token_id == llama_token_eos(ctx) || n_cur == n_len) { + i_batch[i] = -1; + LOG_TEE("\n"); + if (n_parallel > 1) { + LOG_TEE("%s: stream %d finished at n_cur = %d", __func__, i, n_cur); + } + + continue; + } + + // if there is only one stream, we print immediately to stdout + if (n_parallel == 1) { + LOG_TEE("%s", llama_token_to_piece(ctx, new_token_id).c_str()); + fflush(stdout); + } + + streams[i] += llama_token_to_piece(ctx, new_token_id); + + // push this new token for next evaluation + batch.token [batch.n_tokens] = new_token_id; + batch.pos [batch.n_tokens] = n_cur; + batch.seq_id[batch.n_tokens] = i; + batch.logits[batch.n_tokens] = true; + + i_batch[i] = batch.n_tokens; + + batch.n_tokens += 1; + + n_decode += 1; + } + + // all streams are finished + if (batch.n_tokens == 0) { + break; + } + + n_cur += 1; + + // evaluate the current batch with the transformer model + if (llama_decode(ctx, batch)) { + fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1); + return 1; + } + } + + LOG_TEE("\n"); + + if (n_parallel > 1) { + LOG_TEE("\n"); + + for (int32_t i = 0; i < n_parallel; ++i) { + LOG_TEE("sequence %d:\n\n%s%s\n\n", i, params.prompt.c_str(), streams[i].c_str()); + } + } + + const auto t_main_end = ggml_time_us(); + + LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n", + __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f)); + + llama_print_timings(ctx); + + fprintf(stderr, "\n"); + + llama_batch_free(batch); + + llama_free(ctx); + llama_free_model(model); + + llama_backend_free(); + + return 0; +} diff --git a/examples/beam-search/beam-search.cpp b/examples/beam-search/beam-search.cpp index 888ae96656770defa2a5ae107968a9dd5da51527..f078ab8a87fa5fba0c0e21bfc9b257a039fe64c6 100644 --- a/examples/beam-search/beam-search.cpp +++ b/examples/beam-search/beam-search.cpp @@ -158,8 +158,9 @@ int main(int argc, char ** argv) } std::cout << std::flush; - int n_past = llama_get_kv_cache_token_count(ctx); - if (llama_eval(ctx, tokens_list.data(), tokens_list.size(), n_past, params.n_threads)) + int n_past = 0; + + if (llama_decode(ctx, llama_batch_get_one(tokens_list.data(), tokens_list.size(), n_past, 0))) { fprintf(stderr, "%s : failed to eval prompt.\n" , __func__ ); return 1; @@ -169,7 +170,7 @@ int main(int argc, char ** argv) beam_search_callback_data callback_data{ctx, {}}; size_t const beam_width = static_cast(params.n_beams); int const n_predict = 256; - llama_beam_search(ctx, beam_search_callback, &callback_data, beam_width, n_past, n_predict, params.n_threads); + llama_beam_search(ctx, beam_search_callback, &callback_data, beam_width, n_past, n_predict); std::cout << "\n\n"; for (llama_token const token_id : callback_data.response) { diff --git a/examples/embd-input/embd-input-lib.cpp b/examples/embd-input/embd-input-lib.cpp index c995eef3514a051621e21f67c5da9d6eefc3e321..99e6bdad5ac45442a05e2663bea6d3ce2b3aeae9 100644 --- a/examples/embd-input/embd-input-lib.cpp +++ b/examples/embd-input/embd-input-lib.cpp @@ -48,8 +48,7 @@ struct MyModel* create_mymodel(int argc, char ** argv) { // print system information { fprintf(stderr, "\n"); - fprintf(stderr, "system_info: n_threads = %d / %d | %s\n", - params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info()); + fprintf(stderr, "%s\n", get_system_info(params).c_str()); } struct MyModel * ret = new MyModel(); ret->ctx = ctx; @@ -71,7 +70,7 @@ bool eval_float(void * model, float * input, int N){ MyModel * mymodel = (MyModel*)model; llama_context * ctx = mymodel->ctx; gpt_params params = mymodel->params; - int n_emb = llama_n_embd(ctx); + int n_emb = llama_n_embd(llama_get_model(ctx)); int n_past = mymodel->n_past; int n_batch = N; // params.n_batch; @@ -80,7 +79,8 @@ bool eval_float(void * model, float * input, int N){ if (n_eval > n_batch) { n_eval = n_batch; } - if (llama_eval_embd(ctx, (input+i*n_emb), n_eval, n_past, params.n_threads)) { + llama_batch batch = { int32_t(n_eval), nullptr, (input+i*n_emb), nullptr, nullptr, nullptr, n_past, 1, 0, }; + if (llama_decode(ctx, batch)) { fprintf(stderr, "%s : failed to eval\n", __func__); return false; } @@ -101,7 +101,7 @@ bool eval_tokens(void * model, std::vector tokens) { if (n_eval > params.n_batch) { n_eval = params.n_batch; } - if (llama_eval(ctx, &tokens[i], n_eval, n_past, params.n_threads)) { + if (llama_decode(ctx, llama_batch_get_one(&tokens[i], n_eval, n_past, 0))) { fprintf(stderr, "%s : failed to eval\n", __func__); return false; } @@ -132,7 +132,7 @@ llama_token sampling_id(struct MyModel* mymodel) { // out of user input, sample next token const float temp = params.temp; - const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k; + const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx)) : params.top_k; const float top_p = params.top_p; const float tfs_z = params.tfs_z; const float typical_p = params.typical_p; @@ -148,7 +148,7 @@ llama_token sampling_id(struct MyModel* mymodel) { llama_token id = 0; { auto logits = llama_get_logits(ctx); - auto n_vocab = llama_n_vocab(ctx); + auto n_vocab = llama_n_vocab(llama_get_model(ctx)); // Apply params.logit_bias map for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) { @@ -183,11 +183,11 @@ llama_token sampling_id(struct MyModel* mymodel) { if (mirostat == 1) { static float mirostat_mu = 2.0f * mirostat_tau; const int mirostat_m = 100; - llama_sample_temperature(ctx, &candidates_p, temp); + llama_sample_temp(ctx, &candidates_p, temp); id = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu); } else if (mirostat == 2) { static float mirostat_mu = 2.0f * mirostat_tau; - llama_sample_temperature(ctx, &candidates_p, temp); + llama_sample_temp(ctx, &candidates_p, temp); id = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu); } else { // Temperature sampling @@ -195,7 +195,7 @@ llama_token sampling_id(struct MyModel* mymodel) { llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1); llama_sample_typical(ctx, &candidates_p, typical_p, 1); llama_sample_top_p(ctx, &candidates_p, top_p, 1); - llama_sample_temperature(ctx, &candidates_p, temp); + llama_sample_temp(ctx, &candidates_p, temp); id = llama_sample_token(ctx, &candidates_p); } } diff --git a/examples/embd-input/embd-input-test.cpp b/examples/embd-input/embd-input-test.cpp index e5e040f62a60a54bf16e1aeb8e4e687b8f933a9e..dc4a0e48854adce86d44a3ba8ce7d2a67996b125 100644 --- a/examples/embd-input/embd-input-test.cpp +++ b/examples/embd-input/embd-input-test.cpp @@ -8,7 +8,7 @@ int main(int argc, char** argv) { auto mymodel = create_mymodel(argc, argv); int N = 10; int max_tgt_len = 500; - int n_embd = llama_n_embd(mymodel->ctx); + int n_embd = llama_n_embd(llama_get_model(mymodel->ctx)); // add random float embd to test evaluation float * data = new float[N*n_embd]; diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index 27d605f4e13d69f62432ac1be795fded50d6ff81..14075609ebfd919b534bb2f41b1bd14b2d867673 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -42,17 +42,18 @@ int main(int argc, char ** argv) { return 1; } - const int n_ctx_train = llama_n_ctx_train(ctx); - if (params.n_ctx > n_ctx_train) { + const int n_ctx_train = llama_n_ctx_train(model); + const int n_ctx = llama_n_ctx(ctx); + + if (n_ctx > n_ctx_train) { fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n", - __func__, n_ctx_train, params.n_ctx); + __func__, n_ctx_train, n_ctx); } // print system information { fprintf(stderr, "\n"); - fprintf(stderr, "system_info: n_threads = %d / %d | %s\n", - params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info()); + fprintf(stderr, "%s\n", get_system_info(params).c_str()); } int n_past = 0; @@ -70,15 +71,15 @@ int main(int argc, char ** argv) { fprintf(stderr, "\n"); } - if (embd_inp.size() > (size_t)params.n_ctx) { + if (embd_inp.size() > (size_t)n_ctx) { fprintf(stderr, "%s: error: prompt is longer than the context window (%zu tokens, n_ctx = %d)\n", - __func__, embd_inp.size(), params.n_ctx); + __func__, embd_inp.size(), n_ctx); return 1; } while (!embd_inp.empty()) { int n_tokens = std::min(params.n_batch, (int) embd_inp.size()); - if (llama_eval(ctx, embd_inp.data(), n_tokens, n_past, params.n_threads)) { + if (llama_decode(ctx, llama_batch_get_one(embd_inp.data(), n_tokens, n_past, 0))) { fprintf(stderr, "%s : failed to eval\n", __func__); return 1; } @@ -86,8 +87,8 @@ int main(int argc, char ** argv) { embd_inp.erase(embd_inp.begin(), embd_inp.begin() + n_tokens); } - const int n_embd = llama_n_embd(ctx); - const auto embeddings = llama_get_embeddings(ctx); + const int n_embd = llama_n_embd(model); + const auto * embeddings = llama_get_embeddings(ctx); for (int i = 0; i < n_embd; i++) { printf("%f ", embeddings[i]); diff --git a/examples/export-lora/CMakeLists.txt b/examples/export-lora/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..cbbdaec67488d7ba6d015d35dcdea4ee3d455c0b --- /dev/null +++ b/examples/export-lora/CMakeLists.txt @@ -0,0 +1,5 @@ +set(TARGET export-lora) +add_executable(${TARGET} export-lora.cpp) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/export-lora/README.md b/examples/export-lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..0cf3e8e4549bb19e05e20088f14b4167d14148fb --- /dev/null +++ b/examples/export-lora/README.md @@ -0,0 +1,26 @@ +# export-lora + +Apply LORA adapters to base model and export the resulting model. + +``` +usage: export-lora [options] + +options: + -h, --help show this help message and exit + -m FNAME, --model-base FNAME model path from which to load base model (default '') + -o FNAME, --model-out FNAME path to save exported model (default '') + -l FNAME, --lora FNAME apply LoRA adapter + -s FNAME S, --lora-scaled FNAME S apply LoRA adapter with user defined scaling S + -t N, --threads N number of threads to use during computation (default: 4) +``` + +For example: + +```bash +./bin/export-lora \ + -m open-llama-3b-v2-q8_0.gguf \ + -o open-llama-3b-v2-q8_0-english2tokipona-chat.gguf \ + -l lora-open-llama-3b-v2-q8_0-english2tokipona-chat-LATEST.bin +``` + +Multiple LORA adapters can be applied by passing multiple `-l FN` or `-s FN S` command line parameters. diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d803cfd5cb2d5e34ead197186b86361d2a1a95c5 --- /dev/null +++ b/examples/export-lora/export-lora.cpp @@ -0,0 +1,474 @@ + +#include "common.h" +#include "ggml.h" +#include "ggml-alloc.h" + +#include +#include +#include + +static const size_t tensor_alignment = 32; + +struct lora_info { + std::string filename; + float scale; +}; + +struct export_lora_params { + std::string fn_model_base; + std::string fn_model_out; + std::vector lora; + int n_threads; +}; + +struct lora_data { + struct lora_info info; + std::vector data; + struct ggml_context * ctx; + + uint32_t lora_r; + uint32_t lora_alpha; +}; + +struct llama_file { + // use FILE * so we don't have to re-open the file to mmap + FILE * fp; + size_t size; + + llama_file(const char * fname, const char * mode) { + fp = std::fopen(fname, mode); + if (fp == NULL) { + size = 0; + } else { + seek(0, SEEK_END); + size = tell(); + seek(0, SEEK_SET); + } + } + + size_t tell() const { +#ifdef _WIN32 + __int64 ret = _ftelli64(fp); +#else + long ret = std::ftell(fp); +#endif + GGML_ASSERT(ret != -1); // this really shouldn't fail + return (size_t) ret; + } + + void seek(size_t offset, int whence) { +#ifdef _WIN32 + int ret = _fseeki64(fp, (__int64) offset, whence); +#else + int ret = std::fseek(fp, (long) offset, whence); +#endif + GGML_ASSERT(ret == 0); // same + } + + void read_raw(void * ptr, size_t size) { + if (size == 0) { + return; + } + errno = 0; + std::size_t ret = std::fread(ptr, size, 1, fp); + if (ferror(fp)) { + die_fmt("read error: %s", strerror(errno)); + } + if (ret != 1) { + die("unexpectedly reached end of file"); + } + } + + std::uint32_t read_u32() { + std::uint32_t ret; + read_raw(&ret, sizeof(ret)); + return ret; + } + + std::string read_string(std::uint32_t len) { + std::vector chars(len); + read_raw(chars.data(), len); + return std::string(chars.data(), len); + } + + void write_raw(const void * ptr, size_t size) { + if (size == 0) { + return; + } + errno = 0; + size_t ret = std::fwrite(ptr, size, 1, fp); + if (ret != 1) { + die_fmt("write error: %s", strerror(errno)); + } + } + + void write_u32(std::uint32_t val) { + write_raw(&val, sizeof(val)); + } + + bool eof() { + return tell() >= size; + } + + ~llama_file() { + if (fp) { + std::fclose(fp); + } + } +}; + +static struct export_lora_params get_default_export_lora_params() { + struct export_lora_params result; + result.fn_model_base = ""; + result.fn_model_out = ""; + result.n_threads = GGML_DEFAULT_N_THREADS; + return result; +} + +static void export_lora_print_usage(int /*argc*/, char ** argv, const struct export_lora_params * params) { + fprintf(stderr, "usage: %s [options]\n", argv[0]); + fprintf(stderr, "\n"); + fprintf(stderr, "options:\n"); + fprintf(stderr, " -h, --help show this help message and exit\n"); + fprintf(stderr, " -m FNAME, --model-base FNAME model path from which to load base model (default '%s')\n", params->fn_model_base.c_str()); + fprintf(stderr, " -o FNAME, --model-out FNAME path to save exported model (default '%s')\n", params->fn_model_out.c_str()); + fprintf(stderr, " -l FNAME, --lora FNAME apply LoRA adapter\n"); + fprintf(stderr, " -s FNAME S, --lora-scaled FNAME S apply LoRA adapter with user defined scaling S\n"); + fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params->n_threads); +} + +static bool export_lora_params_parse(int argc, char ** argv, struct export_lora_params * params) { + bool invalid_param = false; + std::string arg; + struct export_lora_params default_params = get_default_export_lora_params(); + const std::string arg_prefix = "--"; + + for (int i = 1; i < argc; i++) { + arg = argv[i]; + if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) { + std::replace(arg.begin(), arg.end(), '_', '-'); + } + + if (arg == "-m" || arg == "--model-base") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->fn_model_base = argv[i]; + } else if (arg == "-o" || arg == "--model-out") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->fn_model_out = argv[i]; + } else if (arg == "-l" || arg == "--lora") { + if (++i >= argc) { + invalid_param = true; + break; + } + struct lora_info lora; + lora.filename = argv[i]; + lora.scale = 1.0f; + params->lora.push_back(lora); + } else if (arg == "-s" || arg == "--lora-scaled") { + if (++i >= argc) { + invalid_param = true; + break; + } + struct lora_info lora; + lora.filename = argv[i]; + if (++i >= argc) { + invalid_param = true; + break; + } + lora.scale = std::stof(argv[i]); + params->lora.push_back(lora); + } else if (arg == "-t" || arg == "--threads") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_threads = std::stoi(argv[i]); + if (params->n_threads <= 0) { + params->n_threads = std::thread::hardware_concurrency(); + } + } else { + fprintf(stderr, "error: unknown argument: '%s'\n", arg.c_str()); + export_lora_print_usage(argc, argv, &default_params); + exit(1); + } + } + + if (params->fn_model_base == default_params.fn_model_base) { + fprintf(stderr, "error: please specify a filename for model-base.\n"); + export_lora_print_usage(argc, argv, &default_params); + exit(1); + } + if (params->fn_model_out == default_params.fn_model_out) { + fprintf(stderr, "error: please specify a filename for model-out.\n"); + export_lora_print_usage(argc, argv, &default_params); + exit(1); + } + if (invalid_param) { + fprintf(stderr, "error: invalid parameter for argument: '%s'\n", arg.c_str()); + export_lora_print_usage(argc, argv, &default_params); + exit(1); + } + return true; +} + +static void free_lora(struct lora_data * lora) { + if (lora->ctx != NULL) { + ggml_free(lora->ctx); + } + delete lora; +} + +static struct lora_data * load_lora(struct lora_info * info) { + struct lora_data * result = new struct lora_data; + result->info = *info; + result->ctx = NULL; + result->lora_r = 1; + result->lora_alpha = 1; + + struct llama_file file(info->filename.c_str(), "rb"); + if (file.fp == NULL) { + fprintf(stderr, "warning: Could not open lora adapter '%s'. Ignoring this adapter.\n", + info->filename.c_str()); + free_lora(result); + return NULL; + } + + struct ggml_init_params params_ggml; + params_ggml.mem_size = ggml_tensor_overhead() * GGML_MAX_NODES; + params_ggml.mem_buffer = NULL; + params_ggml.no_alloc = true; + result->ctx = ggml_init(params_ggml); + + uint32_t LLAMA_FILE_MAGIC_LORA = 0x67676C61; // 'ggla' + uint32_t magic = file.read_u32(); + if (magic != LLAMA_FILE_MAGIC_LORA) { + die_fmt("unexpected lora header file magic in '%s'", info->filename.c_str()); + } + uint32_t version = file.read_u32(); + if (version != 1) { + die_fmt("unexpected lora file version '%u' in '%s'", (unsigned) version, info->filename.c_str()); + } + result->lora_r = file.read_u32(); + result->lora_alpha = file.read_u32(); + // read tensor infos from file + std::vector name_buf; + std::vector tensors; + std::vector tensors_offset; + size_t total_nbytes_pad = 0; + while(!file.eof()) { + int64_t ne[4] = {1,1,1,1}; + uint32_t n_dims = file.read_u32(); + uint32_t namelen = file.read_u32(); + uint32_t type = file.read_u32(); + for (uint32_t k = 0; k < n_dims; ++k) { + ne[k] = (int64_t)file.read_u32(); + } + name_buf.clear(); + name_buf.resize(namelen + 1, '\0'); + file.read_raw(name_buf.data(), namelen); + file.seek((0-file.tell()) & 31, SEEK_CUR); + size_t offset = file.tell(); + struct ggml_tensor * tensor = ggml_new_tensor(result->ctx, (enum ggml_type) type, n_dims, ne); + ggml_set_name(tensor, name_buf.data()); + size_t nbytes = ggml_nbytes(tensor); + size_t nbytes_pad = ggml_nbytes_pad(tensor); + total_nbytes_pad += nbytes_pad; + tensors.push_back(tensor); + tensors_offset.push_back(offset); + file.seek(nbytes, SEEK_CUR); + } + // read tensor data + result->data.resize(total_nbytes_pad); + size_t data_offset = 0; + for (size_t i = 0; i < tensors.size(); ++i) { + struct ggml_tensor * tensor = tensors[i]; + size_t offset = tensors_offset[i]; + size_t nbytes = ggml_nbytes(tensor); + size_t nbytes_pad = ggml_nbytes_pad(tensor); + file.seek(offset, SEEK_SET); + tensor->data = result->data.data() + data_offset; + file.read_raw(tensor->data, nbytes); + data_offset += nbytes_pad; + } + return result; +} + + +static struct ggml_cgraph * build_graph_lora( + struct ggml_context * ctx, + struct ggml_tensor * tensor, + struct ggml_tensor * lora_a, + struct ggml_tensor * lora_b, + float scaling +) { + struct ggml_tensor * ab = ggml_mul_mat(ctx, lora_a, lora_b); + if (scaling != 1.0f) { + ab = ggml_scale(ctx, ab, ggml_new_f32(ctx, scaling)); + } + struct ggml_tensor * res = ggml_add_inplace(ctx, tensor, ab); + + struct ggml_cgraph * gf = ggml_new_graph(ctx); + ggml_build_forward_expand (gf, res); + return gf; +} + +static bool apply_lora(struct ggml_tensor * tensor, struct lora_data * lora, int n_threads) { + if (lora->ctx == NULL) { + return false; + } + std::string name = ggml_get_name(tensor); + std::string name_a = name + std::string(".loraA"); + std::string name_b = name + std::string(".loraB"); + struct ggml_tensor * lora_a = ggml_get_tensor(lora->ctx, name_a.c_str()); + struct ggml_tensor * lora_b = ggml_get_tensor(lora->ctx, name_b.c_str()); + if (lora_a == NULL || lora_b == NULL) { + return false; + } + + float scaling = lora->info.scale * (float)lora->lora_alpha / (float)lora->lora_r; + + struct ggml_init_params params; + params.mem_size = GGML_OBJECT_SIZE + GGML_GRAPH_SIZE + ggml_tensor_overhead()*4 + GGML_MEM_ALIGN*5; + params.mem_buffer = NULL; + params.no_alloc = true; + struct ggml_context * ctx = NULL; + struct ggml_allocr * alloc = NULL; + struct ggml_cgraph * gf = NULL; + + ctx = ggml_init(params); + alloc = ggml_allocr_new_measure(tensor_alignment); + gf = build_graph_lora(ctx, tensor, lora_a, lora_b, scaling); + size_t alloc_size = ggml_allocr_alloc_graph(alloc, gf); + ggml_allocr_free(alloc); + ggml_free(ctx); + + static std::vector data_compute; + data_compute.resize(alloc_size + tensor_alignment); + + ctx = ggml_init(params); + alloc = ggml_allocr_new(data_compute.data(), data_compute.size(), tensor_alignment); + gf = build_graph_lora(ctx, tensor, lora_a, lora_b, scaling); + ggml_allocr_alloc_graph(alloc, gf); + ggml_allocr_free(alloc); + + struct ggml_cplan cplan = ggml_graph_plan(gf, n_threads); + static std::vector data_work; + data_work.resize(cplan.work_size); + cplan.work_data = data_work.data(); + + ggml_graph_compute(gf, &cplan); + + ggml_free(ctx); + return true; +} + +static void export_lora(struct export_lora_params * params) { + // load all loras + std::vector loras; + for (size_t i = 0; i < params->lora.size(); ++i) { + struct lora_data * lora = load_lora(¶ms->lora[i]); + if (lora != NULL) { + loras.push_back(lora); + } + } + if (loras.size() == 0) { + fprintf(stderr, "warning: no lora adapters will be applied.\n"); + } + + // open input file + struct llama_file fin(params->fn_model_base.c_str(), "rb"); + if (!fin.fp) { + die_fmt("Could not open file '%s'\n", params->fn_model_base.c_str()); + } + + // open base model gguf, read tensors without their data + struct ggml_context * ctx_in; + struct gguf_init_params params_gguf; + params_gguf.no_alloc = true; + params_gguf.ctx = &ctx_in; + struct gguf_context * gguf_in = gguf_init_from_file(params->fn_model_base.c_str(), params_gguf); + + // create new gguf + struct gguf_context * gguf_out = gguf_init_empty(); + + // copy meta data from base model: kv and tensors + gguf_set_kv(gguf_out, gguf_in); + int n_tensors = gguf_get_n_tensors(gguf_in); + for (int i=0; i < n_tensors; ++i) { + const char * name = gguf_get_tensor_name(gguf_in, i); + struct ggml_tensor * tensor = ggml_get_tensor(ctx_in, name); + gguf_add_tensor(gguf_out, tensor); + } + + // create output file + struct llama_file fout(params->fn_model_out.c_str(), "wb"); + if (!fout.fp) { + die_fmt("Could not create file '%s'\n", params->fn_model_out.c_str()); + } + + // write gguf meta data + std::vector meta; + meta.resize(gguf_get_meta_size(gguf_out)); + gguf_get_meta_data(gguf_out, meta.data()); + fout.write_raw(meta.data(), meta.size()); + + std::vector data; + std::vector padding; + for (int i=0; i < n_tensors; ++i) { + const char * name = gguf_get_tensor_name(gguf_in, i); + struct ggml_tensor * tensor = ggml_get_tensor(ctx_in, name); + + // read tensor data + data.resize(ggml_nbytes(tensor)); + tensor->data = data.data(); + size_t offset = gguf_get_tensor_offset(gguf_in, i); + fin.seek(offset + meta.size(), SEEK_SET); + fin.read_raw(data.data(), data.size()); + + // apply all loras + for (size_t k = 0; k < loras.size(); ++k) { + apply_lora(tensor, loras[k], params->n_threads); + } + + // write tensor data + padding + padding.clear(); + padding.resize(GGML_PAD(data.size(), gguf_get_alignment(gguf_out)) - data.size(), 0); + + GGML_ASSERT(fout.tell() == offset + meta.size()); + // fout.seek(offset + meta.size(), SEEK_SET); + fout.write_raw(data.data(), data.size()); + fout.write_raw(padding.data(), padding.size()); + + if (i % 2 == 0) { + printf("."); + } + } + printf("\n"); + + // close gguf + gguf_free(gguf_out); + gguf_free(gguf_in); + + // free loras + for (size_t i = 0; i < loras.size(); ++i) { + free_lora(loras[i]); + } +} + +int main(int argc, char ** argv) { + struct export_lora_params params = get_default_export_lora_params(); + + if (!export_lora_params_parse(argc, argv, ¶ms)) { + return 1; + } + + export_lora(¶ms); + + return 0; +} diff --git a/examples/finetune/CMakeLists.txt b/examples/finetune/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..2b52d21cfb381d5137ea2522350561cac72135e9 --- /dev/null +++ b/examples/finetune/CMakeLists.txt @@ -0,0 +1,5 @@ +set(TARGET finetune) +add_executable(${TARGET} finetune.cpp) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/finetune/README.md b/examples/finetune/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b7347c20ca0ab40c18c496addbb9b1d8d574a042 --- /dev/null +++ b/examples/finetune/README.md @@ -0,0 +1,90 @@ +# finetune + +Basic usage instructions: + +```bash +# get training data +wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/shakespeare.txt + +# finetune LORA adapter +./bin/finetune \ + --model-base open-llama-3b-v2-q8_0.gguf \ + --checkpoint-in chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.gguf \ + --checkpoint-out chk-lora-open-llama-3b-v2-q8_0-shakespeare-ITERATION.gguf \ + --lora-out lora-open-llama-3b-v2-q8_0-shakespeare-ITERATION.bin \ + --train-data "shakespeare.txt" \ + --save-every 10 \ + --threads 6 --adam-iter 30 --batch 4 --ctx 64 \ + --use-checkpointing + +# predict +./bin/main -m open-llama-3b-v2-q8_0.gguf --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin +``` + +Finetune output files will be saved every N iterations (config with `--save-every N`). +The pattern 'ITERATION' in the output filenames will be replaced with the iteration number and with 'LATEST' for the latest output. +So in above example after 10 iterations these files will be written: +- chk-lora-open-llama-3b-v2-q8_0-shakespeare-10.gguf +- chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.gguf +- lora-open-llama-3b-v2-q8_0-shakespeare-10.bin +- lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin + +After 10 more iterations: +- chk-lora-open-llama-3b-v2-q8_0-shakespeare-20.gguf +- chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.gguf +- lora-open-llama-3b-v2-q8_0-shakespeare-20.bin +- lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin + +Checkpoint files (`--checkpoint-in FN`, `--checkpoint-out FN`) store the training process. When the input checkpoint file does not exist, it will begin finetuning a new randomly initialized adapter. + +llama.cpp compatible LORA adapters will be saved with filename specified by `--lora-out FN`. +These LORA adapters can then be used by `main` together with the base model, like in the 'predict' example command above. + +In `main` you can also load multiple LORA adapters, which will then be mixed together. + +For example if you have two LORA adapters `lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin` and `lora-open-llama-3b-v2-q8_0-bible-LATEST.bin`, you can mix them together like this: + +```bash +./bin/main -m open-llama-3b-v2-q8_0.gguf \ + --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin \ + --lora lora-open-llama-3b-v2-q8_0-bible-LATEST.bin +``` + +You can change how strong each LORA adapter is applied to the base model by using `--lora-scaled FN SCALE` instead of `--lora FN`. + +For example to apply 40% of the 'shakespeare' LORA adapter, 80% of the 'bible' LORA adapter and 100% of yet another one: + +```bash +./bin/main -m open-llama-3b-v2-q8_0.gguf \ + --lora-scaled lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin 0.4 \ + --lora-scaled lora-open-llama-3b-v2-q8_0-bible-LATEST.bin 0.8 \ + --lora lora-open-llama-3b-v2-q8_0-yet-another-one-LATEST.bin +``` + +The scale numbers don't need to add up to one, and you can also use numbers creater than 1 to further increase the influence of an adapter. But making the values to big will sometimes result in worse output. Play around to find good values. + +Gradient checkpointing reduces the memory requirements by ~50% but increases the runtime. +If you have enough RAM, you can make finetuning a bit faster by disabling checkpointing with `--no-checkpointing`. + +The default LORA rank can be specified with `--lora-r N`. +The LORA rank can be configured for each model tensor type separately with these command line options: + +```bash + --lora-r N LORA r: default rank. Also specifies resulting scaling together with lora-alpha. (default 4) + --rank-att-norm N LORA rank for attention norm tensor (default 1) + --rank-ffn-norm N LORA rank for feed-forward norm tensor (default 1) + --rank-out-norm N LORA rank for output norm tensor (default 1) + --rank-tok-embd N LORA rank for token embeddings tensor (default 4) + --rank-out N LORA rank for output tensor (default 4) + --rank-wq N LORA rank for wq tensor (default 4) + --rank-wk N LORA rank for wk tensor (default 4) + --rank-wv N LORA rank for wv tensor (default 4) + --rank-wo N LORA rank for wo tensor (default 4) + --rank-w1 N LORA rank for w1 tensor (default 4) + --rank-w2 N LORA rank for w2 tensor (default 4) + --rank-w3 N LORA rank for w3 tensor (default 4) +``` + +The LORA rank of 'norm' tensors should always be 1. + +To see all available options use `finetune --help`. diff --git a/examples/finetune/convert-finetune-checkpoint-to-gguf.py b/examples/finetune/convert-finetune-checkpoint-to-gguf.py new file mode 100644 index 0000000000000000000000000000000000000000..96d6633ed7d5ee0cd38d93c33c1439614b5bfa51 --- /dev/null +++ b/examples/finetune/convert-finetune-checkpoint-to-gguf.py @@ -0,0 +1,489 @@ +#!/usr/bin/env python3 +# finetune checkpoint --> gguf conversion + +import argparse +import gguf +import os +import struct +import sys +import numpy as np +from pathlib import Path + +# gguf constants +LLM_KV_OPTIMIZER_TYPE = "optimizer.type" +LLM_KV_OPTIMIZER_TYPE_ADAM = "adam" +LLM_KV_OPTIMIZER_TYPE_LBFGS = "lbfgs" +LLM_KV_OPTIMIZER_FILE_VERSION = "optimizer.file_version" +LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT = "optimizer.convergence_past_count" +LLM_KV_OPTIMIZER_PARAMETER_COUNT = "optimizer.parameter_count" +LLM_KV_OPTIMIZER_ITERATION_COUNT = "optimizer.iteration_count" +LLM_KV_OPTIMIZER_JUST_INITIALIZED = "optimizer.just_initialized" +LLM_KV_OPTIMIZER_ADAM_BEST_LOSS = "optimizer.adam.best_loss" +LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS = "optimizer.adam.previous_loss" +LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT = "optimizer.adam.no_improvement_count" +LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT = "optimizer.lbfgs.approx_hessian_count" +LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS = "optimizer.lbfgs.best_loss" +LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP = "optimizer.lbfgs.line_search_step" +LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J = "optimizer.lbfgs.line_search_j" +LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K = "optimizer.lbfgs.line_search_k" +LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END = "optimizer.lbfgs.line_search_end" +LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT = "optimizer.lbfgs.no_improvement_count" + +LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS = "optimizer.adam.first_moments" +LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS = "optimizer.adam.second_moments" +LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES = "optimizer.adam.past_loss_values" + +LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS = "optimizer.lbfgs.current_parameters" +LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS = "optimizer.lbfgs.previous_parameters" +LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS = "optimizer.lbfgs.current_gradients" +LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS = "optimizer.lbfgs.previous_gradients" +LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION = "optimizer.lbfgs.search_direction" +LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES = "optimizer.lbfgs.past_loss_values" +LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA = "optimizer.lbfgs.memory_alpha" +LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS = "optimizer.lbfgs.memory_ys" +LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S = "optimizer.lbfgs.memory_s" +LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y = "optimizer.lbfgs.memory_y" + +LLM_KV_TRAINING_TYPE_TRAIN_MODEL = "train_model" +LLM_KV_TRAINING_TYPE_FINETUNE_LORA = "finetune_lora" +LLM_KV_TRAINING_TYPE = "training.type" +LLM_KV_TRAINING_FILE_VERSION = "training.file_version" +LLM_KV_TRAINING_ITERATION_COUNT = "training.iteration_count" +LLM_KV_TRAINING_SAMPLE_COUNT = "training.sample_count" +LLM_KV_TRAINING_TOKEN_COUNT = "training.token_count" + +LLM_KV_TRAINING_LORA_RANK_TOKEN_EMBD = "training.lora.rank.token_embd" +LLM_KV_TRAINING_LORA_RANK_OUTPUT_NORM = "training.lora.rank.output_norm" +LLM_KV_TRAINING_LORA_RANK_OUTPUT = "training.lora.rank.output" +LLM_KV_TRAINING_LORA_RANK_ATTN_NORM = "training.lora.rank.attn_norm" +LLM_KV_TRAINING_LORA_RANK_ATTN_Q = "training.lora.rank.attn_q" +LLM_KV_TRAINING_LORA_RANK_ATTN_K = "training.lora.rank.attn_k" +LLM_KV_TRAINING_LORA_RANK_ATTN_V = "training.lora.rank.attn_v" +LLM_KV_TRAINING_LORA_RANK_ATTN_OUT = "training.lora.rank.attn_output" +LLM_KV_TRAINING_LORA_RANK_FFN_NORM = "training.lora.rank.ffn_norm" +LLM_KV_TRAINING_LORA_RANK_FFN_GATE = "training.lora.rank.ffn_gate" +LLM_KV_TRAINING_LORA_RANK_FFN_DOWN = "training.lora.rank.ffn_down" +LLM_KV_TRAINING_LORA_RANK_FFN_UP = "training.lora.rank.ffn_up" + +class Tensor: + def __init__(self, dtype='f', ne=None): + if ne is None: + ne = [] + self.dtype = dtype + self.ne = ne + self.nbytes = 0 + if self.dtype == 'f': + if len(self.ne) == 0: + self.nbytes = 0 + else: + self.nbytes = int(np.product(self.ne)) * 4 + else: + raise ValueError(f"Unhandled data type '{self.dtype}'") + + def load(self, data, offset): + nd = struct.unpack(' 0 else []) + + self.lbfgs_x = Tensor('f', [self.nx]) + self.lbfgs_xp = Tensor('f', [self.nx]) + self.lbfgs_g = Tensor('f', [self.nx]) + self.lbfgs_gp = Tensor('f', [self.nx]) + self.lbfgs_d = Tensor('f', [self.nx]) + self.lbfgs_pf = Tensor('f', [self.past] if self.past > 0 else []) + self.lbfgs_lmal = Tensor('f', [self.lbfgs_m]) + self.lbfgs_lmys = Tensor('f', [self.lbfgs_m]) + self.lbfgs_lms = Tensor('f', [self.nx, self.lbfgs_m]) + self.lbfgs_lmy = Tensor('f', [self.nx, self.lbfgs_m]) + + # forgot to save type in version 1: + # guess self.type from number of remaining bytes + size_type_0 = 12 + sum([t.max_storage_size() for t in + [self.adam_m, self.adam_v] + +([self.adam_pf] if (self.past > 0) else [])]) + size_type_1 = 24 + sum([t.max_storage_size() for t in + [self.lbfgs_x, self.lbfgs_xp, self.lbfgs_g, + self.lbfgs_gp, self.lbfgs_d, self.lbfgs_pf, + self.lbfgs_lmal, self.lbfgs_lmys, + self.lbfgs_lms, self.lbfgs_lmy] + +([self.lbfgs_pf] if (self.past > 0) else [])]) + # due to alignment padding the size might not by exact + # but the difference in size for both types is significant, + # so we can just use whichever is closest + remaining = len(data) - offset + if abs(remaining - size_type_0) < abs(remaining - size_type_1): + self.type = 0 + else: + self.type = 1 + + if self.type == 0: + offset = self.adam_m.load(data, offset) + offset = self.adam_v.load(data, offset) + offset = self.adam_pf.load(data,offset) + + self.adam_fx_best = struct.unpack(' 0: + self.adam_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES) + + elif self.type == 1: + gguf_writer.add_string(LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_LBFGS) + gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT, self.lbfgs_m) + gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS, self.lbfgs_fx_best) + gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP, self.lbfgs_step) + gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J, self.lbfgs_j) + gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K, self.lbfgs_k) + gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END, self.lbfgs_end) + gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT, self.lbfgs_n_no_improvement) + + self.lbfgs_x.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS) + self.lbfgs_xp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS) + self.lbfgs_g.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS) + self.lbfgs_gp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS) + self.lbfgs_d.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION) + if self.past > 0: + self.lbfgs_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES) + self.lbfgs_lmal.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA) + self.lbfgs_lmys.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS) + self.lbfgs_lms.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S) + self.lbfgs_lmy.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y) + else: + raise ValueError('Unknown optimizer type') + +class LoraParams: + def __init__(self): + pass + + def load(self, data, offset): + self.n_rank_attention_norm = struct.unpack(' +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + +static const size_t tensor_alignment = 32; + +struct my_llama_hparams { + uint32_t n_vocab = 32000; + uint32_t n_ctx = 512; + uint32_t n_embd = 4096; + uint32_t n_ff = 11008; + uint32_t n_head = 32; + uint32_t n_head_kv = 32; + uint32_t n_layer = 32; + + // float f_norm_eps = 1e-5f; // falcon + float f_norm_rms_eps = 1e-5f; // llama + + float rope_freq_base = 10000.0f; + float rope_freq_scale = 1.0f; + + uint32_t n_gqa() const { + return n_head/n_head_kv; + } + + uint32_t n_embd_head() const { + return n_embd/n_head; + } + + uint32_t n_embd_gqa() const { + return n_embd/n_gqa(); + } + + bool operator!=(const my_llama_hparams& other) const { + return memcmp(this, &other, sizeof(other)); + } +}; + +struct my_llama_layer { + // normalization + struct ggml_tensor * attention_norm; + + // attention + struct ggml_tensor * wq; + struct ggml_tensor * wk; + struct ggml_tensor * wv; + struct ggml_tensor * wo; + + // normalization + struct ggml_tensor * ffn_norm; + + // ff + struct ggml_tensor * w1; + struct ggml_tensor * w2; + struct ggml_tensor * w3; +}; + +struct my_llama_model { + struct my_llama_hparams hparams; + + struct ggml_tensor * tok_embeddings; + + struct ggml_tensor * norm; + struct ggml_tensor * output; + + std::vector layers; +}; + +struct my_llama_lora_hparams { + uint32_t lora_r = 1; + uint32_t lora_alpha = 1; + uint32_t n_rank_attention_norm = 1; + uint32_t n_rank_wq = 4; + uint32_t n_rank_wk = 4; + uint32_t n_rank_wv = 4; + uint32_t n_rank_wo = 4; + uint32_t n_rank_ffn_norm = 1; + uint32_t n_rank_w1 = 4; + uint32_t n_rank_w2 = 4; + uint32_t n_rank_w3 = 4; + uint32_t n_rank_tok_embeddings = 4; + uint32_t n_rank_norm = 1; + uint32_t n_rank_output = 4; + + bool operator!=(const my_llama_lora_hparams& other) const { + return memcmp(this, &other, sizeof(other)); + } +}; + +struct my_llama_lora_layer { + // normalization + struct ggml_tensor * attention_norm_a; + struct ggml_tensor * attention_norm_b; + + // attention + struct ggml_tensor * wq_a; + struct ggml_tensor * wq_b; + struct ggml_tensor * wk_a; + struct ggml_tensor * wk_b; + struct ggml_tensor * wv_a; + struct ggml_tensor * wv_b; + struct ggml_tensor * wo_a; + struct ggml_tensor * wo_b; + + // normalization + struct ggml_tensor * ffn_norm_a; + struct ggml_tensor * ffn_norm_b; + + // ff + struct ggml_tensor * w1_a; + struct ggml_tensor * w1_b; + struct ggml_tensor * w2_a; + struct ggml_tensor * w2_b; + struct ggml_tensor * w3_a; + struct ggml_tensor * w3_b; +}; + +struct my_llama_lora { + struct ggml_context * ctx = NULL; + std::vector data; + + my_llama_lora_hparams hparams; + + struct ggml_tensor * tok_embeddings_a; + struct ggml_tensor * tok_embeddings_b; + + struct ggml_tensor * norm_a; + struct ggml_tensor * norm_b; + struct ggml_tensor * output_a; + struct ggml_tensor * output_b; + + std::vector layers; +}; + +// gguf constants +static const char * LLM_KV_TRAINING_TYPE_FINETUNE_LORA = "finetune_lora"; +static const char * LLM_KV_TRAINING_TYPE = "training.type"; + +static const char * LLM_KV_TRAINING_LORA_RANK_TOKEN_EMBD = "training.lora.rank.token_embd"; +static const char * LLM_KV_TRAINING_LORA_RANK_OUTPUT_NORM = "training.lora.rank.output_norm"; +static const char * LLM_KV_TRAINING_LORA_RANK_OUTPUT = "training.lora.rank.output"; +static const char * LLM_KV_TRAINING_LORA_RANK_ATTN_NORM = "training.lora.rank.attn_norm"; +static const char * LLM_KV_TRAINING_LORA_RANK_ATTN_Q = "training.lora.rank.attn_q"; +static const char * LLM_KV_TRAINING_LORA_RANK_ATTN_K = "training.lora.rank.attn_k"; +static const char * LLM_KV_TRAINING_LORA_RANK_ATTN_V = "training.lora.rank.attn_v"; +static const char * LLM_KV_TRAINING_LORA_RANK_ATTN_OUT = "training.lora.rank.attn_output"; +static const char * LLM_KV_TRAINING_LORA_RANK_FFN_NORM = "training.lora.rank.ffn_norm"; +static const char * LLM_KV_TRAINING_LORA_RANK_FFN_GATE = "training.lora.rank.ffn_gate"; +static const char * LLM_KV_TRAINING_LORA_RANK_FFN_DOWN = "training.lora.rank.ffn_down"; +static const char * LLM_KV_TRAINING_LORA_RANK_FFN_UP = "training.lora.rank.ffn_up"; + +// gguf constants (sync with gguf.py) + +static const char * LLM_KV_GENERAL_ARCHITECTURE = "general.architecture"; +static const char * LLM_KV_GENERAL_FILE_TYPE = "general.file_type"; + +static const char * LLM_KV_CONTEXT_LENGTH = "%s.context_length"; +static const char * LLM_KV_EMBEDDING_LENGTH = "%s.embedding_length"; +static const char * LLM_KV_BLOCK_COUNT = "%s.block_count"; +static const char * LLM_KV_FEED_FORWARD_LENGTH = "%s.feed_forward_length"; +static const char * LLM_KV_ATTENTION_HEAD_COUNT = "%s.attention.head_count"; +static const char * LLM_KV_ATTENTION_HEAD_COUNT_KV = "%s.attention.head_count_kv"; +static const char * LLM_KV_ATTENTION_LAYERNORM_RMS_EPS = "%s.attention.layer_norm_rms_epsilon"; +static const char * LLM_KV_ROPE_DIMENSION_COUNT = "%s.rope.dimension_count"; +static const char * LLM_KV_ROPE_FREQ_BASE = "%s.rope.freq_base"; // TODO load in llama.cpp +static const char * LLM_KV_ROPE_SCALE_LINEAR = "%s.rope.scale_linear"; + +static const char * LLM_TENSOR_TOKEN_EMBD = "token_embd"; +static const char * LLM_TENSOR_OUTPUT_NORM = "output_norm"; +static const char * LLM_TENSOR_OUTPUT = "output"; +static const char * LLM_TENSOR_ATTN_NORM = "blk.%d.attn_norm"; +static const char * LLM_TENSOR_ATTN_Q = "blk.%d.attn_q"; +static const char * LLM_TENSOR_ATTN_K = "blk.%d.attn_k"; +static const char * LLM_TENSOR_ATTN_V = "blk.%d.attn_v"; +static const char * LLM_TENSOR_ATTN_OUT = "blk.%d.attn_output"; +static const char * LLM_TENSOR_FFN_NORM = "blk.%d.ffn_norm"; +static const char * LLM_TENSOR_FFN_GATE = "blk.%d.ffn_gate"; +static const char * LLM_TENSOR_FFN_DOWN = "blk.%d.ffn_down"; +static const char * LLM_TENSOR_FFN_UP = "blk.%d.ffn_up"; + +static void print_params(struct my_llama_hparams * params) { + printf("%s: n_vocab: %u\n", __func__, params->n_vocab); + printf("%s: n_ctx: %u\n", __func__, params->n_ctx); + printf("%s: n_embd: %u\n", __func__, params->n_embd); + printf("%s: n_ff: %u\n", __func__, params->n_ff); + printf("%s: n_head: %u\n", __func__, params->n_head); + printf("%s: n_head_kv: %u\n", __func__, params->n_head_kv); + printf("%s: n_layer: %u\n", __func__, params->n_layer); + printf("%s: norm_rms_eps : %f\n", __func__, params->f_norm_rms_eps); + printf("%s: rope_freq_base : %f\n", __func__, params->rope_freq_base); + printf("%s: rope_freq_scale : %f\n", __func__, params->rope_freq_scale); +} + +static void print_lora_params(struct my_llama_lora_hparams * params) { + printf("%s: n_rank_attention_norm : %u\n", __func__, params->n_rank_attention_norm); + printf("%s: n_rank_wq : %u\n", __func__, params->n_rank_wq); + printf("%s: n_rank_wk : %u\n", __func__, params->n_rank_wk); + printf("%s: n_rank_wv : %u\n", __func__, params->n_rank_wv); + printf("%s: n_rank_wo : %u\n", __func__, params->n_rank_wo); + printf("%s: n_rank_ffn_norm : %u\n", __func__, params->n_rank_ffn_norm); + printf("%s: n_rank_w1 : %u\n", __func__, params->n_rank_w1); + printf("%s: n_rank_w2 : %u\n", __func__, params->n_rank_w2); + printf("%s: n_rank_w3 : %u\n", __func__, params->n_rank_w3); + printf("%s: n_rank_tok_embeddings : %u\n", __func__, params->n_rank_tok_embeddings); + printf("%s: n_rank_norm : %u\n", __func__, params->n_rank_norm); + printf("%s: n_rank_output : %u\n", __func__, params->n_rank_output); +} + +#define GGUF_GET_KEY(ctx, dst, func, type, req, key) \ +{ \ + const std::string skey(key); \ + const int kid = gguf_find_key(ctx, skey.c_str()); \ + if (kid >= 0) { \ + enum gguf_type ktype = gguf_get_kv_type(ctx, kid); \ + if (ktype != (type)) { \ + die_fmt("key %s has wrong type: %s", skey.c_str(), gguf_type_name(ktype)); \ + } \ + (dst) = func(ctx, kid); \ + } else if (req) { \ + die_fmt("key not found in model: %s", skey.c_str()); \ + } \ +} + +static void load_model_hparams_gguf(struct gguf_context * ctx, struct my_llama_hparams * hparams, const char * expected_arch) { + std::string arch; + + GGUF_GET_KEY(ctx, arch, gguf_get_val_str, GGUF_TYPE_STRING, true, LLM_KV_GENERAL_ARCHITECTURE); + if (expected_arch != NULL) { + if (arch != expected_arch) { + printf("%s: arch=%s expected_arch=%s\n", __func__, arch.c_str(), expected_arch); + } + GGML_ASSERT(arch == expected_arch); + } + + std::vector keybuf; + keybuf.resize(512); + auto kv = [&arch, &keybuf](const char * key) -> const char * { + snprintf(keybuf.data(), keybuf.size(), key, arch.c_str()); + return keybuf.data(); + }; + + GGUF_GET_KEY(ctx, hparams->n_embd, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_EMBEDDING_LENGTH)); + GGUF_GET_KEY(ctx, hparams->n_ctx, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_CONTEXT_LENGTH)); + GGUF_GET_KEY(ctx, hparams->n_ff, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_FEED_FORWARD_LENGTH)); + GGUF_GET_KEY(ctx, hparams->n_head, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_ATTENTION_HEAD_COUNT)); + GGUF_GET_KEY(ctx, hparams->n_layer, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_BLOCK_COUNT)); + + // n_head_kv is optional, default to n_head + hparams->n_head_kv = hparams->n_head; + GGUF_GET_KEY(ctx, hparams->n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV)); + + float rope_freq_scale = 1.0f; + GGUF_GET_KEY(ctx, hparams->f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS)); + GGUF_GET_KEY(ctx, hparams->rope_freq_base, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE)); + GGUF_GET_KEY(ctx, rope_freq_scale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR)); + if (rope_freq_scale != 1.0f) { + hparams->rope_freq_scale = 1.0f / rope_freq_scale; + } +} + +static void init_model(struct llama_model * input, struct my_llama_model * model, const char * fn_model, uint32_t n_ctx) { + auto & hparams = model->hparams; + + std::vector tn_buf; + tn_buf.resize(GGML_MAX_NAME); + auto tn = [&tn_buf](const char * key) -> const char * { + snprintf(tn_buf.data(), tn_buf.size(), "%s.weight", key); + return tn_buf.data(); + }; + auto tni = [&tn_buf](const char * key, int bid) -> const char * { + snprintf(tn_buf.data(), tn_buf.size(), key, bid); + std::string s = tn_buf.data(); + snprintf(tn_buf.data(), tn_buf.size(), "%s.weight", s.c_str()); + return tn_buf.data(); + }; + + + // get parameters directly from gguf file + { + struct gguf_init_params params = { + /*.no_alloc = */ false, + /*.ctx = */ NULL, + }; + struct gguf_context * mctx = gguf_init_from_file(fn_model, params); + + load_model_hparams_gguf(mctx, &hparams, "llama"); + + gguf_free(mctx); + } + hparams.n_vocab = llama_n_vocab(input); + hparams.n_ctx = n_ctx; + + // get tensors from llama_model (possibly mmapped) + model->tok_embeddings = llama_get_model_tensor(input, tn(LLM_TENSOR_TOKEN_EMBD)); + model->norm = llama_get_model_tensor(input, tn(LLM_TENSOR_OUTPUT_NORM)); + model->output = llama_get_model_tensor(input, tn(LLM_TENSOR_OUTPUT)); + + assert_shape_2d(model->tok_embeddings, hparams.n_embd, hparams.n_vocab); + assert_shape_1d(model->norm, hparams.n_embd); + assert_shape_2d(model->output, hparams.n_embd, hparams.n_vocab); + + model->layers.resize(hparams.n_layer); + for (uint32_t i = 0; i < hparams.n_layer; ++i) { + auto & layer = model->layers[i]; + + layer.attention_norm = llama_get_model_tensor(input, tni(LLM_TENSOR_ATTN_NORM, i)); + layer.wq = llama_get_model_tensor(input, tni(LLM_TENSOR_ATTN_Q, i)); + layer.wk = llama_get_model_tensor(input, tni(LLM_TENSOR_ATTN_K, i)); + layer.wv = llama_get_model_tensor(input, tni(LLM_TENSOR_ATTN_V, i)); + layer.wo = llama_get_model_tensor(input, tni(LLM_TENSOR_ATTN_OUT, i)); + layer.ffn_norm = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_NORM, i)); + layer.w1 = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_GATE, i)); + layer.w2 = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_DOWN, i)); + layer.w3 = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_UP, i)); + + assert_shape_1d(layer.attention_norm, hparams.n_embd); + assert_shape_2d(layer.wq, hparams.n_embd, hparams.n_embd); + assert_shape_2d(layer.wk, hparams.n_embd, hparams.n_embd); + assert_shape_2d(layer.wv, hparams.n_embd, hparams.n_embd); + assert_shape_2d(layer.wo, hparams.n_embd, hparams.n_embd); + assert_shape_1d(layer.ffn_norm, hparams.n_embd); + assert_shape_2d(layer.w1, hparams.n_embd, hparams.n_ff); + assert_shape_2d(layer.w2, hparams.n_ff, hparams.n_embd); + assert_shape_2d(layer.w3, hparams.n_embd, hparams.n_ff); + } +} + +static void set_param_lora(struct my_llama_lora * lora) { + const uint32_t n_layer = lora->layers.size(); + + struct ggml_context* ctx = lora->ctx; + + ggml_set_param(ctx, lora->tok_embeddings_a); + ggml_set_param(ctx, lora->tok_embeddings_b); + ggml_set_param(ctx, lora->norm_a); + ggml_set_param(ctx, lora->norm_b); + ggml_set_param(ctx, lora->output_a); + ggml_set_param(ctx, lora->output_b); + + for (uint32_t i = 0; i < n_layer; ++i) { + auto & layer = lora->layers[i]; + + ggml_set_param(ctx, layer.attention_norm_a); + ggml_set_param(ctx, layer.attention_norm_b); + ggml_set_param(ctx, layer.wq_a); + ggml_set_param(ctx, layer.wq_b); + ggml_set_param(ctx, layer.wk_a); + ggml_set_param(ctx, layer.wk_b); + ggml_set_param(ctx, layer.wv_a); + ggml_set_param(ctx, layer.wv_b); + ggml_set_param(ctx, layer.wo_a); + ggml_set_param(ctx, layer.wo_b); + ggml_set_param(ctx, layer.ffn_norm_a); + ggml_set_param(ctx, layer.ffn_norm_b); + ggml_set_param(ctx, layer.w1_a); + ggml_set_param(ctx, layer.w1_b); + ggml_set_param(ctx, layer.w2_a); + ggml_set_param(ctx, layer.w2_b); + ggml_set_param(ctx, layer.w3_a); + ggml_set_param(ctx, layer.w3_b); + } +} + +static void alloc_lora(struct ggml_allocr * alloc, struct my_llama_lora * lora) { + ggml_allocr_alloc(alloc, lora->tok_embeddings_a); + ggml_allocr_alloc(alloc, lora->tok_embeddings_b); + ggml_allocr_alloc(alloc, lora->norm_a); + ggml_allocr_alloc(alloc, lora->norm_b); + ggml_allocr_alloc(alloc, lora->output_a); + ggml_allocr_alloc(alloc, lora->output_b); + for (uint32_t i = 0; i < lora->layers.size(); ++i) { + auto & layer = lora->layers[i]; + ggml_allocr_alloc(alloc, layer.attention_norm_a); + ggml_allocr_alloc(alloc, layer.attention_norm_b); + ggml_allocr_alloc(alloc, layer.wq_a); + ggml_allocr_alloc(alloc, layer.wq_b); + ggml_allocr_alloc(alloc, layer.wk_a); + ggml_allocr_alloc(alloc, layer.wk_b); + ggml_allocr_alloc(alloc, layer.wv_a); + ggml_allocr_alloc(alloc, layer.wv_b); + ggml_allocr_alloc(alloc, layer.wo_a); + ggml_allocr_alloc(alloc, layer.wo_b); + ggml_allocr_alloc(alloc, layer.ffn_norm_a); + ggml_allocr_alloc(alloc, layer.ffn_norm_b); + ggml_allocr_alloc(alloc, layer.w1_a); + ggml_allocr_alloc(alloc, layer.w1_b); + ggml_allocr_alloc(alloc, layer.w2_a); + ggml_allocr_alloc(alloc, layer.w2_b); + ggml_allocr_alloc(alloc, layer.w3_a); + ggml_allocr_alloc(alloc, layer.w3_b); + } + ggml_allocr_alloc(alloc, lora->tok_embeddings_a->grad); + ggml_allocr_alloc(alloc, lora->tok_embeddings_b->grad); + ggml_allocr_alloc(alloc, lora->norm_a->grad); + ggml_allocr_alloc(alloc, lora->norm_b->grad); + ggml_allocr_alloc(alloc, lora->output_a->grad); + ggml_allocr_alloc(alloc, lora->output_b->grad); + for (uint32_t i = 0; i < lora->layers.size(); ++i) { + auto & layer = lora->layers[i]; + ggml_allocr_alloc(alloc, layer.attention_norm_a->grad); + ggml_allocr_alloc(alloc, layer.attention_norm_b->grad); + ggml_allocr_alloc(alloc, layer.wq_a->grad); + ggml_allocr_alloc(alloc, layer.wq_b->grad); + ggml_allocr_alloc(alloc, layer.wk_a->grad); + ggml_allocr_alloc(alloc, layer.wk_b->grad); + ggml_allocr_alloc(alloc, layer.wv_a->grad); + ggml_allocr_alloc(alloc, layer.wv_b->grad); + ggml_allocr_alloc(alloc, layer.wo_a->grad); + ggml_allocr_alloc(alloc, layer.wo_b->grad); + ggml_allocr_alloc(alloc, layer.ffn_norm_a->grad); + ggml_allocr_alloc(alloc, layer.ffn_norm_b->grad); + ggml_allocr_alloc(alloc, layer.w1_a->grad); + ggml_allocr_alloc(alloc, layer.w1_b->grad); + ggml_allocr_alloc(alloc, layer.w2_a->grad); + ggml_allocr_alloc(alloc, layer.w2_b->grad); + ggml_allocr_alloc(alloc, layer.w3_a->grad); + ggml_allocr_alloc(alloc, layer.w3_b->grad); + } +} + +static void init_lora(const struct my_llama_model * model, struct my_llama_lora * lora) { + const auto & lparams = lora->hparams; + + const uint32_t n_embd = model->hparams.n_embd; + const uint32_t n_embd_gqa = model->hparams.n_embd_gqa(); + const uint32_t n_layer = model->hparams.n_layer; + const uint32_t n_vocab = model->hparams.n_vocab; + const uint32_t n_ff = model->hparams.n_ff; + + std::vector tn_buf; + tn_buf.resize(GGML_MAX_NAME); + auto tn = [&tn_buf](const char * key, const char * suffix) -> const char * { + snprintf(tn_buf.data(), tn_buf.size(), "%s%s", key, suffix); + return tn_buf.data(); + }; + auto tni = [&tn_buf](const char * key, const char * suffix, int bid) -> const char * { + snprintf(tn_buf.data(), tn_buf.size(), key, bid); + std::string s = tn_buf.data(); + snprintf(tn_buf.data(), tn_buf.size(), "%s%s", s.c_str(), suffix); + return tn_buf.data(); + }; + + // context for lora tensors without their data + struct ggml_init_params ctx_lora_params; + ctx_lora_params.mem_size = ggml_tensor_overhead()*2*(6 + n_layer*18); + ctx_lora_params.mem_buffer = NULL; + ctx_lora_params.no_alloc = true; + + struct ggml_context * ctx = ggml_init(ctx_lora_params); + lora->ctx = ctx; + + lora->tok_embeddings_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_tok_embeddings, n_embd); + lora->tok_embeddings_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_tok_embeddings, n_vocab); + lora->norm_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_norm, n_embd); + lora->norm_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_norm, 1); + lora->output_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_output, n_embd); + lora->output_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_output, n_vocab); + + ggml_set_name(lora->tok_embeddings_a, tn(LLM_TENSOR_TOKEN_EMBD, ".weight.lora_a")); + ggml_set_name(lora->tok_embeddings_b, tn(LLM_TENSOR_TOKEN_EMBD, ".weight.lora_b")); + ggml_set_name(lora->norm_a, tn(LLM_TENSOR_OUTPUT_NORM, ".weight.lora_a")); + ggml_set_name(lora->norm_b, tn(LLM_TENSOR_OUTPUT_NORM, ".weight.lora_b")); + ggml_set_name(lora->output_a, tn(LLM_TENSOR_OUTPUT, ".weight.lora_a")); + ggml_set_name(lora->output_b, tn(LLM_TENSOR_OUTPUT, ".weight.lora_b")); + + lora->layers.resize(n_layer); + for (uint32_t i = 0; i < n_layer; ++i) { + auto & layer = lora->layers[i]; + + layer.attention_norm_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_attention_norm, n_embd); + layer.attention_norm_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_attention_norm, 1); + + layer.wq_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wq, n_embd); + layer.wq_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wq, n_embd); + layer.wk_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wk, n_embd); + layer.wk_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wk, n_embd_gqa); + layer.wv_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wv, n_embd); + layer.wv_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wv, n_embd_gqa); + layer.wo_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wo, n_embd); + layer.wo_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wo, n_embd); + + layer.ffn_norm_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_norm, n_embd); + layer.ffn_norm_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_norm, 1); + + layer.w1_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w1, n_embd); + layer.w1_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w1, n_ff); + layer.w2_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w2, n_ff); + layer.w2_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w2, n_embd); + layer.w3_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w3, n_embd); + layer.w3_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w3, n_ff); + + ggml_set_name(layer.attention_norm_a, tni(LLM_TENSOR_ATTN_NORM, ".weight.lora_a", i)); + ggml_set_name(layer.attention_norm_b, tni(LLM_TENSOR_ATTN_NORM, ".weight.lora_b", i)); + ggml_set_name(layer.wq_a, tni(LLM_TENSOR_ATTN_Q, ".weight.lora_a", i)); + ggml_set_name(layer.wq_b, tni(LLM_TENSOR_ATTN_Q, ".weight.lora_b", i)); + ggml_set_name(layer.wk_a, tni(LLM_TENSOR_ATTN_K, ".weight.lora_a", i)); + ggml_set_name(layer.wk_b, tni(LLM_TENSOR_ATTN_K, ".weight.lora_b", i)); + ggml_set_name(layer.wv_a, tni(LLM_TENSOR_ATTN_V, ".weight.lora_a", i)); + ggml_set_name(layer.wv_b, tni(LLM_TENSOR_ATTN_V, ".weight.lora_b", i)); + ggml_set_name(layer.wo_a, tni(LLM_TENSOR_ATTN_OUT, ".weight.lora_a", i)); + ggml_set_name(layer.wo_b, tni(LLM_TENSOR_ATTN_OUT, ".weight.lora_b", i)); + ggml_set_name(layer.ffn_norm_a, tni(LLM_TENSOR_FFN_NORM, ".weight.lora_a", i)); + ggml_set_name(layer.ffn_norm_b, tni(LLM_TENSOR_FFN_NORM, ".weight.lora_b", i)); + ggml_set_name(layer.w1_a, tni(LLM_TENSOR_FFN_GATE, ".weight.lora_a", i)); + ggml_set_name(layer.w1_b, tni(LLM_TENSOR_FFN_GATE, ".weight.lora_b", i)); + ggml_set_name(layer.w2_a, tni(LLM_TENSOR_FFN_DOWN, ".weight.lora_a", i)); + ggml_set_name(layer.w2_b, tni(LLM_TENSOR_FFN_DOWN, ".weight.lora_b", i)); + ggml_set_name(layer.w3_a, tni(LLM_TENSOR_FFN_UP, ".weight.lora_a", i)); + ggml_set_name(layer.w3_b, tni(LLM_TENSOR_FFN_UP, ".weight.lora_b", i)); + } + + set_param_lora(lora); + + // measure data size + struct ggml_allocr * alloc = NULL; + alloc = ggml_allocr_new_measure(tensor_alignment); + alloc_lora(alloc, lora); + + // allocate data + lora->data.resize(ggml_allocr_max_size(alloc) + tensor_alignment); + ggml_allocr_free(alloc); + alloc = ggml_allocr_new(lora->data.data(), lora->data.size(), tensor_alignment); + alloc_lora(alloc, lora); + ggml_allocr_free(alloc); +} + +static void randomize_lora(struct my_llama_lora * lora, int seed, float mean, float std, float min, float max) { + const uint32_t n_layer = lora->layers.size(); + + struct random_normal_distribution * rnd = init_random_normal_distribution(seed, mean, std, min, max); + + randomize_tensor_normal(lora->tok_embeddings_a, rnd); + randomize_tensor_normal(lora->tok_embeddings_b, rnd); + randomize_tensor_normal(lora->norm_a, rnd); + randomize_tensor_normal(lora->norm_b, rnd); + randomize_tensor_normal(lora->output_a, rnd); + randomize_tensor_normal(lora->output_b, rnd); + + for (uint32_t i = 0; i < n_layer; ++i) { + auto & layer = lora->layers[i]; + randomize_tensor_normal(layer.attention_norm_a, rnd); + randomize_tensor_normal(layer.attention_norm_b, rnd); + + randomize_tensor_normal(layer.wq_a, rnd); + randomize_tensor_normal(layer.wq_b, rnd); + randomize_tensor_normal(layer.wk_a, rnd); + randomize_tensor_normal(layer.wk_b, rnd); + randomize_tensor_normal(layer.wv_a, rnd); + randomize_tensor_normal(layer.wv_b, rnd); + randomize_tensor_normal(layer.wo_a, rnd); + randomize_tensor_normal(layer.wo_b, rnd); + + randomize_tensor_normal(layer.ffn_norm_a, rnd); + randomize_tensor_normal(layer.ffn_norm_b, rnd); + + randomize_tensor_normal(layer.w1_a, rnd); + randomize_tensor_normal(layer.w1_b, rnd); + randomize_tensor_normal(layer.w2_a, rnd); + randomize_tensor_normal(layer.w2_b, rnd); + randomize_tensor_normal(layer.w3_a, rnd); + randomize_tensor_normal(layer.w3_b, rnd); + } + + free_random_normal_distribution(rnd); +} + +static struct ggml_tensor * llama_build_lora_finetune_graphs( + struct my_llama_model * model, + struct my_llama_lora * lora, + struct ggml_allocr * alloc, + struct ggml_context * ctx, + struct ggml_cgraph * gf, + struct ggml_cgraph * gb, + struct ggml_cgraph * gb_tmp, + struct ggml_tensor * * logits, + struct ggml_tensor * tokens_input, + struct ggml_tensor * targets, + const int n_tokens, + const int n_batch, + const bool enable_flash_attn, + const bool enable_checkpointing) { + + ggml_set_scratch(ctx, { 0, 0, nullptr, }); + const int n_past = 0; + const int N = n_tokens; + const auto & hparams = model->hparams; + const int n_ctx = hparams.n_ctx; + const int n_vocab = hparams.n_vocab; + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_head = hparams.n_head; + const int n_head_kv = hparams.n_head_kv; + const int n_ff = hparams.n_ff; + const int n_rot = hparams.n_embd_head(); + const int n_embd_head = hparams.n_embd_head(); + const int n_embd_gqa = hparams.n_embd_gqa(); + const float rms_norm_eps = hparams.f_norm_rms_eps; + const float rope_freq_base = hparams.rope_freq_base; + const float rope_freq_scale = hparams.rope_freq_scale; + + GGML_ASSERT((size_t) n_layer == lora->layers.size()); + + auto set_name = [](struct ggml_tensor * t, const char * n) { + ggml_set_name(t, n); + if (t->grad) { + ggml_format_name(t->grad, "%s->grad", n); + } + }; + + // KQ_pos - contains the positions + struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, N); + ggml_allocr_alloc(alloc, KQ_pos); + if (!ggml_allocr_is_measure(alloc)) { + int * data = (int *) KQ_pos->data; + for (int i = 0; i < N; ++i) { + data[i] = n_past + i; + } + } + + // rope has so much parameters that we make a custom function for it + auto rope = [ctx, KQ_pos, n_rot, n_ctx, rope_freq_base, rope_freq_scale] + (struct ggml_tensor * t) -> struct ggml_tensor * { + // not capturing these, to silcence warnings + const int rope_mode = 0; + + return ggml_rope_custom(ctx, + t, KQ_pos, n_rot, rope_mode, n_ctx, + rope_freq_base, rope_freq_scale); + }; + + set_name(tokens_input, "tokens_input"); + set_name(targets, "targets"); + + GGML_ASSERT(tokens_input->type == GGML_TYPE_I32); + + auto add_to_f32 = [] (struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) { + if (ggml_is_quantized(a->type)) { + return ggml_add_cast(ctx, a, b, GGML_TYPE_F32); + } else if (a->type == GGML_TYPE_F32) { + return ggml_add(ctx, a, b); + } else { + die_fmt("%s: Finetuning on tensors with type '%s' is not yet supported.\n", + __func__, ggml_type_name(a->type)); + } + }; + + struct ggml_tensor * tok_embeddings = add_to_f32(ctx, model->tok_embeddings, ggml_mul_mat(ctx, lora->tok_embeddings_a, lora->tok_embeddings_b)); + struct ggml_tensor * norm = add_to_f32(ctx, model->norm, ggml_mul_mat(ctx, lora->norm_a, lora->norm_b)); + struct ggml_tensor * output = add_to_f32(ctx, model->output, ggml_mul_mat(ctx, lora->output_a, lora->output_b)); + + struct ggml_tensor * t00 = ggml_reshape_1d(ctx, tokens_input, N*n_batch); set_name(t00, "t00"); assert_shape_1d(t00, N*n_batch); + struct ggml_tensor * t01 = ggml_get_rows(ctx, tok_embeddings, t00); set_name(t01, "t01"); assert_shape_2d(t01, n_embd, N*n_batch); + + struct ggml_tensor * cur = t01; + + std::vector checkpoints; + if (enable_checkpointing) { + checkpoints.push_back(tokens_input); + checkpoints.push_back(targets); + checkpoints.push_back(t00); + checkpoints.push_back(t01); + } + + struct ggml_tensor * kv_scale = NULL; + if (!enable_flash_attn) { + kv_scale = ggml_new_f32(ctx, 1.0f/sqrtf(float(n_embd)/n_head)); + } + + for (int il = 0; il < n_layer; ++il) { + struct my_llama_layer & layer = model->layers[il]; + struct my_llama_lora_layer & llayer = lora->layers[il]; + + struct ggml_tensor * attention_norm = add_to_f32(ctx, layer.attention_norm, ggml_mul_mat(ctx, llayer.attention_norm_a, llayer.attention_norm_b)); + struct ggml_tensor * ffn_norm = add_to_f32(ctx, layer.ffn_norm, ggml_mul_mat(ctx, llayer.ffn_norm_a, llayer.ffn_norm_b)); + struct ggml_tensor * wq = add_to_f32(ctx, layer.wq, ggml_mul_mat(ctx, llayer.wq_a, llayer.wq_b)); + struct ggml_tensor * wk = add_to_f32(ctx, layer.wk, ggml_mul_mat(ctx, llayer.wk_a, llayer.wk_b)); + struct ggml_tensor * wv = add_to_f32(ctx, layer.wv, ggml_mul_mat(ctx, llayer.wv_a, llayer.wv_b)); + struct ggml_tensor * wo = add_to_f32(ctx, layer.wo, ggml_mul_mat(ctx, llayer.wo_a, llayer.wo_b)); + struct ggml_tensor * w1 = add_to_f32(ctx, layer.w1, ggml_mul_mat(ctx, llayer.w1_a, llayer.w1_b)); + struct ggml_tensor * w2 = add_to_f32(ctx, layer.w2, ggml_mul_mat(ctx, llayer.w2_a, llayer.w2_b)); + struct ggml_tensor * w3 = add_to_f32(ctx, layer.w3, ggml_mul_mat(ctx, llayer.w3_a, llayer.w3_b)); + + struct ggml_tensor * t02 = ggml_rms_norm (ctx, cur, rms_norm_eps); set_name(t02, "t02"); assert_shape_2d(t02, n_embd, N*n_batch); + struct ggml_tensor * t03 = ggml_repeat (ctx, attention_norm, t02); set_name(t03, "t03"); assert_shape_2d(t03, n_embd, N*n_batch); + struct ggml_tensor * t04 = ggml_mul (ctx, t03, t02); set_name(t04, "t04"); assert_shape_2d(t04, n_embd, N*n_batch); + struct ggml_tensor * t05 = ggml_mul_mat (ctx, wq, t04); set_name(t05, "t05"); assert_shape_2d(t05, n_embd, N*n_batch); + struct ggml_tensor * t06 = ggml_reshape_4d (ctx, t05, n_embd_head, n_head, N, n_batch); set_name(t06, "t06"); assert_shape_4d(t06, n_embd_head, n_head, N, n_batch); + struct ggml_tensor * t07 = rope (t06); set_name(t07, "t07"); assert_shape_4d(t07, n_embd_head, n_head, N, n_batch); + struct ggml_tensor * t08 = ggml_mul_mat (ctx, wk, t04); set_name(t08, "t08"); assert_shape_2d(t08, n_embd_gqa, N*n_batch); + struct ggml_tensor * t09 = ggml_reshape_4d (ctx, t08, n_embd_head, n_head_kv, N, n_batch); set_name(t09, "t09"); assert_shape_4d(t09, n_embd_head, n_head_kv, N, n_batch); + struct ggml_tensor * t10 = rope (t09); set_name(t10, "t10"); assert_shape_4d(t10, n_embd_head, n_head_kv, N, n_batch); + + struct ggml_tensor * t11; + if (ggml_is_quantized(wv->type)) { + struct ggml_tensor * t11_1 = ggml_mul_mat (ctx, wv, t04); set_name(t11_1, "t11_1"); assert_shape_2d(t11_1, n_embd_gqa, N*n_batch); + struct ggml_tensor * t11_2 = ggml_transpose(ctx, t11_1); set_name(t11_2, "t11_2"); assert_shape_2d(t11_2, N*n_batch, n_embd_gqa); + t11 = ggml_cont (ctx, t11_2); set_name(t11, "t11"); assert_shape_2d(t11, N*n_batch, n_embd_gqa); + } else { + t11 = ggml_mul_mat (ctx, t04, wv); set_name(t11, "t11"); assert_shape_2d(t11, N*n_batch, n_embd_gqa); + } + + struct ggml_tensor * t12 = ggml_reshape_4d (ctx, t11, N, n_batch, n_embd_head, n_head_kv); set_name(t12, "t12"); assert_shape_4d(t12, N, n_batch, n_embd_head, n_head_kv); + struct ggml_tensor * t13 = ggml_permute (ctx, t07, 0, 2, 1, 3); set_name(t13, "t13"); assert_shape_4d(t13, n_embd_head, N, n_head, n_batch); + struct ggml_tensor * t14 = ggml_permute (ctx, t10, 0, 2, 1, 3); set_name(t14, "t14"); assert_shape_4d(t14, n_embd_head, N, n_head_kv, n_batch); + struct ggml_tensor * t15 = ggml_permute (ctx, t12, 0, 3, 1, 2); set_name(t15, "t15"); assert_shape_4d(t15, N, n_embd_head, n_head_kv, n_batch); + struct ggml_tensor * t16; + if (enable_flash_attn) { + t16 = ggml_flash_attn(ctx, t13, t14, t15, true); set_name(t16, "t16"); assert_shape_4d(t16, n_embd_head, N, n_head, n_batch); + } else { + struct ggml_tensor * t16_0 = ggml_mul_mat (ctx, t14, t13); set_name(t16_0, "t16_0"); assert_shape_4d(t16_0, N, N, n_head, n_batch); + struct ggml_tensor * t16_1 = ggml_scale_inplace (ctx, t16_0, kv_scale); set_name(t16_1, "t16_1"); assert_shape_4d(t16_1, N, N, n_head, n_batch); + struct ggml_tensor * t16_2 = ggml_diag_mask_inf_inplace(ctx, t16_1, n_past); set_name(t16_2, "t16_2"); assert_shape_4d(t16_2, N, N, n_head, n_batch); + struct ggml_tensor * t16_3 = ggml_soft_max_inplace (ctx, t16_2); set_name(t16_3, "t16_3"); assert_shape_4d(t16_3, N, N, n_head, n_batch); + t16 = ggml_mul_mat(ctx, t15, t16_3); set_name(t16, "t16"); assert_shape_4d(t16, n_embd_head, N, n_head, n_batch); + } + struct ggml_tensor * t17 = ggml_permute (ctx, t16, 0, 2, 1, 3); set_name(t17, "t17"); assert_shape_4d(t17, n_embd_head, n_head, N, n_batch); + struct ggml_tensor * t18 = ggml_cont (ctx, t17); set_name(t18, "t18"); assert_shape_4d(t18, n_embd_head, n_head, N, n_batch); + struct ggml_tensor * t19 = ggml_reshape_2d (ctx, t18, n_embd, N*n_batch); set_name(t19, "t19"); assert_shape_2d(t19, n_embd, N*n_batch); + struct ggml_tensor * t20 = ggml_mul_mat (ctx, wo, t19); set_name(t20, "t20"); assert_shape_2d(t20, n_embd, N*n_batch); + struct ggml_tensor * t21 = ggml_add (ctx, t20, cur); set_name(t21, "t21"); assert_shape_2d(t21, n_embd, N*n_batch); + struct ggml_tensor * t22 = ggml_rms_norm (ctx, t21, rms_norm_eps); set_name(t22, "t22"); assert_shape_2d(t22, n_embd, N*n_batch); + struct ggml_tensor * t23 = ggml_repeat (ctx, ffn_norm, t22); set_name(t23, "t23"); assert_shape_2d(t23, n_embd, N*n_batch); + struct ggml_tensor * t24 = ggml_mul (ctx, t23, t22); set_name(t24, "t24"); assert_shape_2d(t24, n_embd, N*n_batch); + struct ggml_tensor * t25 = ggml_mul_mat (ctx, w3, t24); set_name(t25, "t25"); assert_shape_2d(t25, n_ff, N*n_batch); + struct ggml_tensor * t26 = ggml_mul_mat (ctx, w1, t24); set_name(t26, "t26"); assert_shape_2d(t26, n_ff, N*n_batch); + struct ggml_tensor * t27 = ggml_silu (ctx, t26); set_name(t27, "t27"); assert_shape_2d(t27, n_ff, N*n_batch); + struct ggml_tensor * t28 = ggml_mul (ctx, t27, t25); set_name(t28, "t28"); assert_shape_2d(t28, n_ff, N*n_batch); + struct ggml_tensor * t29 = ggml_mul_mat (ctx, w2, t28); set_name(t29, "t29"); assert_shape_2d(t29, n_embd, N*n_batch); + struct ggml_tensor * t30 = ggml_add (ctx, t29, t21); set_name(t30, "t30"); assert_shape_2d(t30, n_embd, N*n_batch); + cur = t30; + if (enable_checkpointing) { + checkpoints.push_back(cur); + } + } + struct ggml_tensor * t31 = ggml_rms_norm (ctx, cur, rms_norm_eps); set_name(t31, "t31"); assert_shape_2d(t31, n_embd, N*n_batch); + struct ggml_tensor * t32 = ggml_repeat (ctx, norm, t31); set_name(t32, "t32"); assert_shape_2d(t32, n_embd, N*n_batch); + struct ggml_tensor * t33 = ggml_mul (ctx, t32, t31); set_name(t33, "t33"); assert_shape_2d(t33, n_embd, N*n_batch); + struct ggml_tensor * t34 = ggml_mul_mat (ctx, output, t33); set_name(t34, "t34"); assert_shape_2d(t34, n_vocab, N*n_batch); + struct ggml_tensor * t35 = ggml_reshape_3d (ctx, t34, n_vocab, N, n_batch); set_name(t35, "t35"); assert_shape_3d(t35, n_vocab, N, n_batch); + struct ggml_tensor * t36 = ggml_cross_entropy_loss(ctx, t35, targets); set_name(t36, "t36"); assert_shape_1d(t36, 1); + + if (enable_checkpointing) { + checkpoints.push_back(t31); + checkpoints.push_back(t32); + checkpoints.push_back(t33); + checkpoints.push_back(t34); + checkpoints.push_back(t35); + checkpoints.push_back(t36); + } + + ggml_build_forward_expand(gf, t36); + + if (enable_checkpointing) { + ggml_build_backward_gradient_checkpointing(ctx, gf, gb, gb_tmp, checkpoints.data(), (int) checkpoints.size()); + } else { + *gb = *gf; + ggml_build_backward_expand(ctx, gf, gb, true); + } + + GGML_ASSERT(alloc != NULL); + + // make sure some tensors are not reallocated by inserting new temporary nodes depending on them + int n_leafs_before = gb->n_leafs; + int n_nodes_before = gb->n_nodes; + struct ggml_tensor * one = ggml_new_f32(ctx, 1.0f); + // output tensors + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t35, one)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36, one)); + // input gradient + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36->grad, one)); + GGML_ASSERT(t36->grad->data == NULL && t36->grad->view_src == NULL); + ggml_allocr_alloc(alloc, t36->grad); + // KQ_pos + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, KQ_pos, one)); + + // make sure base model tensors data cannot be used in viewable operations + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->tok_embeddings, one)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->norm, one)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->output, one)); + for (int il = 0; il < n_layer; ++il) { + struct my_llama_layer & layer = model->layers[il]; + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.attention_norm, one)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_norm, one)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wq, one)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wk, one)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wv, one)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wo, one)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w1, one)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w2, one)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w3, one)); + } + + // allocating checkpoints in one block to reduce memory fragmentation + // note: they will be freed in reverse order + for (unsigned int i = 0; i < checkpoints.size(); ++i) { + if (checkpoints[i]->data == NULL && checkpoints[i]->view_src == NULL) { + ggml_allocr_alloc(alloc, checkpoints[i]); + } + } + + ggml_allocr_alloc_graph(alloc, gb); + + // remove the additional nodes and leafs + for (int i = n_leafs_before; i < gb->n_leafs; ++i) { + gb->leafs[i] = NULL; + } + for (int i = n_nodes_before; i < gb->n_nodes; ++i) { + gb->nodes[i] = NULL; + } + gb->n_leafs = n_leafs_before; + gb->n_nodes = n_nodes_before; + + *logits = t35; + return t36; +} + +static void load_llama_lora_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct my_llama_model * model, struct my_llama_lora * lora) { + // NOTE: gguf_context must be initialized with f_ggml_ctx and no_alloc=false, otherwise tensor data can not be read + + std::string arch; + + std::vector keybuf; + keybuf.resize(512); + + GGUF_GET_KEY(fctx, arch, gguf_get_val_str, GGUF_TYPE_STRING, true, LLM_KV_GENERAL_ARCHITECTURE); + GGML_ASSERT(arch == "llama"); + + uint32_t ftype_u; + GGUF_GET_KEY(fctx, ftype_u, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_GENERAL_FILE_TYPE); + GGML_ASSERT((enum llama_ftype) ftype_u == LLAMA_FTYPE_ALL_F32); + + struct my_llama_hparams hparams; + load_model_hparams_gguf(fctx, &hparams, arch.c_str()); + + // parameters that define tensor shapes must match + GGML_ASSERT(hparams.n_embd == model->hparams.n_embd); + GGML_ASSERT(hparams.n_ff == model->hparams.n_ff); + GGML_ASSERT(hparams.n_head == model->hparams.n_head); + GGML_ASSERT(hparams.n_head_kv == model->hparams.n_head_kv); + GGML_ASSERT(hparams.n_layer == model->hparams.n_layer); + + GGUF_GET_KEY(fctx, lora->hparams.n_rank_tok_embeddings, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_TOKEN_EMBD); + GGUF_GET_KEY(fctx, lora->hparams.n_rank_norm, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_OUTPUT_NORM); + GGUF_GET_KEY(fctx, lora->hparams.n_rank_output, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_OUTPUT); + GGUF_GET_KEY(fctx, lora->hparams.n_rank_attention_norm, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_ATTN_NORM); + GGUF_GET_KEY(fctx, lora->hparams.n_rank_wq, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_ATTN_Q); + GGUF_GET_KEY(fctx, lora->hparams.n_rank_wk, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_ATTN_K); + GGUF_GET_KEY(fctx, lora->hparams.n_rank_wv, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_ATTN_V); + GGUF_GET_KEY(fctx, lora->hparams.n_rank_wo, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_ATTN_OUT); + GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_norm, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_NORM); + GGUF_GET_KEY(fctx, lora->hparams.n_rank_w1, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_GATE); + GGUF_GET_KEY(fctx, lora->hparams.n_rank_w2, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_DOWN); + GGUF_GET_KEY(fctx, lora->hparams.n_rank_w3, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_UP); + + init_lora(model, lora); + + copy_tensor_by_name(lora->tok_embeddings_a, f_ggml_ctx, ggml_get_name(lora->tok_embeddings_a)); + copy_tensor_by_name(lora->tok_embeddings_b, f_ggml_ctx, ggml_get_name(lora->tok_embeddings_b)); + copy_tensor_by_name(lora->norm_a, f_ggml_ctx, ggml_get_name(lora->norm_a)); + copy_tensor_by_name(lora->norm_b, f_ggml_ctx, ggml_get_name(lora->norm_b)); + copy_tensor_by_name(lora->output_a, f_ggml_ctx, ggml_get_name(lora->output_a)); + copy_tensor_by_name(lora->output_b, f_ggml_ctx, ggml_get_name(lora->output_b)); + + for (uint32_t i = 0; i < lora->layers.size(); ++i) { + auto & layer = lora->layers[i]; + copy_tensor_by_name(layer.attention_norm_a, f_ggml_ctx, ggml_get_name(layer.attention_norm_a)); + copy_tensor_by_name(layer.attention_norm_b, f_ggml_ctx, ggml_get_name(layer.attention_norm_b)); + copy_tensor_by_name(layer.wq_a, f_ggml_ctx, ggml_get_name(layer.wq_a)); + copy_tensor_by_name(layer.wq_b, f_ggml_ctx, ggml_get_name(layer.wq_b)); + copy_tensor_by_name(layer.wk_a, f_ggml_ctx, ggml_get_name(layer.wk_a)); + copy_tensor_by_name(layer.wk_b, f_ggml_ctx, ggml_get_name(layer.wk_b)); + copy_tensor_by_name(layer.wv_a, f_ggml_ctx, ggml_get_name(layer.wv_a)); + copy_tensor_by_name(layer.wv_b, f_ggml_ctx, ggml_get_name(layer.wv_b)); + copy_tensor_by_name(layer.wo_a, f_ggml_ctx, ggml_get_name(layer.wo_a)); + copy_tensor_by_name(layer.wo_b, f_ggml_ctx, ggml_get_name(layer.wo_b)); + copy_tensor_by_name(layer.ffn_norm_a, f_ggml_ctx, ggml_get_name(layer.ffn_norm_a)); + copy_tensor_by_name(layer.ffn_norm_b, f_ggml_ctx, ggml_get_name(layer.ffn_norm_b)); + copy_tensor_by_name(layer.w1_a, f_ggml_ctx, ggml_get_name(layer.w1_a)); + copy_tensor_by_name(layer.w1_b, f_ggml_ctx, ggml_get_name(layer.w1_b)); + copy_tensor_by_name(layer.w2_a, f_ggml_ctx, ggml_get_name(layer.w2_a)); + copy_tensor_by_name(layer.w2_b, f_ggml_ctx, ggml_get_name(layer.w2_b)); + copy_tensor_by_name(layer.w3_a, f_ggml_ctx, ggml_get_name(layer.w3_a)); + copy_tensor_by_name(layer.w3_b, f_ggml_ctx, ggml_get_name(layer.w3_b)); + } +} + +static void save_llama_lora_gguf(struct gguf_context * fctx, struct my_llama_model * model, struct my_llama_lora * lora) { + const char * arch = "llama"; + enum llama_ftype ftype = LLAMA_FTYPE_ALL_F32; + + std::vector keybuf; + keybuf.resize(512); + auto kv = [arch, &keybuf](const char * key) -> const char * { + snprintf(keybuf.data(), keybuf.size(), key, arch); + return keybuf.data(); + }; + + gguf_set_val_str(fctx, LLM_KV_GENERAL_ARCHITECTURE, arch); + gguf_set_val_u32(fctx, LLM_KV_GENERAL_FILE_TYPE, ftype); + + gguf_set_val_u32(fctx, kv(LLM_KV_CONTEXT_LENGTH), model->hparams.n_ctx); + gguf_set_val_u32(fctx, kv(LLM_KV_EMBEDDING_LENGTH), model->hparams.n_embd); + gguf_set_val_u32(fctx, kv(LLM_KV_FEED_FORWARD_LENGTH), model->hparams.n_ff); + gguf_set_val_u32(fctx, kv(LLM_KV_ATTENTION_HEAD_COUNT), model->hparams.n_head); + gguf_set_val_u32(fctx, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV), model->hparams.n_head_kv); + gguf_set_val_u32(fctx, kv(LLM_KV_BLOCK_COUNT), model->hparams.n_layer); + gguf_set_val_u32(fctx, kv(LLM_KV_ROPE_DIMENSION_COUNT), model->hparams.n_embd_head()); + gguf_set_val_f32(fctx, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS), model->hparams.f_norm_rms_eps); + gguf_set_val_f32(fctx, kv(LLM_KV_ROPE_FREQ_BASE), model->hparams.rope_freq_base); + gguf_set_val_f32(fctx, kv(LLM_KV_ROPE_SCALE_LINEAR), model->hparams.rope_freq_scale); + + gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_TOKEN_EMBD, lora->hparams.n_rank_tok_embeddings); + gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_OUTPUT_NORM, lora->hparams.n_rank_norm); + gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_OUTPUT, lora->hparams.n_rank_output); + gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_ATTN_NORM, lora->hparams.n_rank_attention_norm); + gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_ATTN_Q, lora->hparams.n_rank_wq); + gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_ATTN_K, lora->hparams.n_rank_wk); + gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_ATTN_V, lora->hparams.n_rank_wv); + gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_ATTN_OUT, lora->hparams.n_rank_wo); + gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_NORM, lora->hparams.n_rank_ffn_norm); + gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_GATE, lora->hparams.n_rank_w1); + gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_DOWN, lora->hparams.n_rank_w2); + gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_UP, lora->hparams.n_rank_w3); + + gguf_add_tensor(fctx, lora->tok_embeddings_a); + gguf_add_tensor(fctx, lora->tok_embeddings_b); + gguf_add_tensor(fctx, lora->norm_a); + gguf_add_tensor(fctx, lora->norm_b); + gguf_add_tensor(fctx, lora->output_a); + gguf_add_tensor(fctx, lora->output_b); + + for (uint32_t i = 0; i < lora->layers.size(); ++i) { + auto & layer = lora->layers[i]; + + gguf_add_tensor(fctx, layer.attention_norm_a); + gguf_add_tensor(fctx, layer.attention_norm_b); + gguf_add_tensor(fctx, layer.wq_a); + gguf_add_tensor(fctx, layer.wq_b); + gguf_add_tensor(fctx, layer.wk_a); + gguf_add_tensor(fctx, layer.wk_b); + gguf_add_tensor(fctx, layer.wv_a); + gguf_add_tensor(fctx, layer.wv_b); + gguf_add_tensor(fctx, layer.wo_a); + gguf_add_tensor(fctx, layer.wo_b); + gguf_add_tensor(fctx, layer.ffn_norm_a); + gguf_add_tensor(fctx, layer.ffn_norm_b); + gguf_add_tensor(fctx, layer.w1_a); + gguf_add_tensor(fctx, layer.w1_b); + gguf_add_tensor(fctx, layer.w2_a); + gguf_add_tensor(fctx, layer.w2_b); + gguf_add_tensor(fctx, layer.w3_a); + gguf_add_tensor(fctx, layer.w3_b); + } +} + +static void load_checkpoint_lora_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct my_llama_model * model, struct my_llama_lora * lora, struct train_state * train) { + std::string train_type = LLM_KV_TRAINING_TYPE_FINETUNE_LORA; + GGUF_GET_KEY(fctx, train_type, gguf_get_val_str, GGUF_TYPE_STRING, false, LLM_KV_TRAINING_TYPE); + GGML_ASSERT(train_type == LLM_KV_TRAINING_TYPE_FINETUNE_LORA); + + load_train_state_gguf(fctx, f_ggml_ctx, train); + load_llama_lora_gguf(fctx, f_ggml_ctx, model, lora); +} + +static void save_checkpoint_lora_gguf(struct gguf_context * fctx, struct my_llama_model * model, struct my_llama_lora * lora, struct train_state * train) { + gguf_set_val_str(fctx, LLM_KV_TRAINING_TYPE, LLM_KV_TRAINING_TYPE_FINETUNE_LORA); + save_llama_lora_gguf(fctx, model, lora); + save_train_state_gguf(fctx, train); +} + +static bool load_checkpoint_lora_file(const char * filename, struct my_llama_model * model, struct my_llama_lora * lora, struct train_state * train) { + struct ggml_context * f_ggml_ctx; + struct gguf_init_params params; + params.no_alloc = false; + params.ctx = &f_ggml_ctx; + struct gguf_context * fctx = gguf_init_from_file(filename, params); + if (fctx == NULL) { + return false; + } + + load_checkpoint_lora_gguf(fctx, f_ggml_ctx, model, lora, train); + + gguf_free(fctx); + return true; +} + +static void save_checkpoint_lora_file(const char * filename, struct my_llama_model * model, struct my_llama_lora * lora, struct train_state * train) { + printf("%s: saving to %s\n", __func__, filename); + struct gguf_context * fctx = gguf_init_empty(); + + save_checkpoint_lora_gguf(fctx, model, lora, train); + + // write file + const bool only_meta = false; + gguf_write_to_file(fctx, filename, only_meta); + gguf_free(fctx); +} + +struct llama_file { + // use FILE * so we don't have to re-open the file to mmap + FILE * fp; + size_t size; + + llama_file(const char * fname, const char * mode) { + fp = std::fopen(fname, mode); + if (fp == NULL) { + size = 0; + } else { + seek(0, SEEK_END); + size = tell(); + seek(0, SEEK_SET); + } + } + + size_t tell() const { +#ifdef _WIN32 + __int64 ret = _ftelli64(fp); +#else + long ret = std::ftell(fp); +#endif + GGML_ASSERT(ret != -1); // this really shouldn't fail + return (size_t) ret; + } + + void seek(size_t offset, int whence) { +#ifdef _WIN32 + int ret = _fseeki64(fp, (__int64) offset, whence); +#else + int ret = std::fseek(fp, (long) offset, whence); +#endif + GGML_ASSERT(ret == 0); // same + } + + void read_raw(void * ptr, size_t size) { + if (size == 0) { + return; + } + errno = 0; + std::size_t ret = std::fread(ptr, size, 1, fp); + if (ferror(fp)) { + die_fmt("read error: %s", strerror(errno)); + } + if (ret != 1) { + die("unexpectedly reached end of file"); + } + } + + std::uint32_t read_u32() { + std::uint32_t ret; + read_raw(&ret, sizeof(ret)); + return ret; + } + + std::string read_string(std::uint32_t len) { + std::vector chars(len); + read_raw(chars.data(), len); + return std::string(chars.data(), len); + } + + void write_raw(const void * ptr, size_t size) { + if (size == 0) { + return; + } + errno = 0; + size_t ret = std::fwrite(ptr, size, 1, fp); + if (ret != 1) { + die_fmt("write error: %s", strerror(errno)); + } + } + + void write_u32(std::uint32_t val) { + write_raw(&val, sizeof(val)); + } + + ~llama_file() { + if (fp) { + std::fclose(fp); + } + } +}; + +static void write_tensor(struct llama_file * file, struct ggml_tensor * tensor, const char * name) { + if (tensor == NULL) { + file->write_u32(0); + file->write_u32(0); + file->write_u32(GGML_TYPE_F32); + file->seek((0-file->tell()) & 31, SEEK_CUR); + return; + } + if (name == NULL) { + name = ggml_get_name(tensor); + } + uint32_t name_len = strlen(name); + uint32_t nd = tensor->n_dims; + uint32_t ne[4] = { (uint32_t)tensor->ne[0], + (uint32_t)tensor->ne[1], + (uint32_t)tensor->ne[2], + (uint32_t)tensor->ne[3] }; + file->write_u32(nd); + file->write_u32(name_len); + file->write_u32(tensor->type); + file->write_raw(ne, sizeof(ne[0]) * nd); + file->write_raw(name, name_len); + file->seek((0-file->tell()) & 31, SEEK_CUR); + file->write_raw(tensor->data, ggml_nbytes(tensor)); +} + +static void save_as_llama_lora(const char * filename, struct my_llama_lora * lora) { + printf("%s: saving to %s\n", __func__, filename); + struct llama_file file(filename, "wb"); + if (file.fp == NULL) { + return; + } + + std::vector tn_buf; + tn_buf.resize(GGML_MAX_NAME); + + auto tn = [&tn_buf](const char * key, const char * suffix) -> const char * { + snprintf(tn_buf.data(), tn_buf.size(), "%s%s", key, suffix); + return tn_buf.data(); + }; + + auto tni = [&tn_buf](const char * key, int bid, const char * suffix) -> const char * { + snprintf(tn_buf.data(), tn_buf.size(), key, bid); + std::string s = tn_buf.data(); + snprintf(tn_buf.data(), tn_buf.size(), "%s%s", s.c_str(), suffix); + return tn_buf.data(); + }; + + uint32_t LLAMA_FILE_MAGIC_LORA = 0x67676C61; // 'ggla' + // write_magic + file.write_u32(LLAMA_FILE_MAGIC_LORA); // magic + file.write_u32(1); // version + // write_hparams + file.write_u32(lora->hparams.lora_r); + file.write_u32(lora->hparams.lora_alpha); + // write tensors + write_tensor(&file, lora->tok_embeddings_a, tn(LLM_TENSOR_TOKEN_EMBD, ".weight.loraA")); + write_tensor(&file, lora->tok_embeddings_b, tn(LLM_TENSOR_TOKEN_EMBD, ".weight.loraB")); + write_tensor(&file, lora->norm_a, tn(LLM_TENSOR_OUTPUT_NORM, ".weight.loraA")); + write_tensor(&file, lora->norm_b, tn(LLM_TENSOR_OUTPUT_NORM, ".weight.loraB")); + write_tensor(&file, lora->output_a, tn(LLM_TENSOR_OUTPUT, ".weight.loraA")); + write_tensor(&file, lora->output_b, tn(LLM_TENSOR_OUTPUT, ".weight.loraB")); + for (uint32_t i = 0; i < lora->layers.size(); ++i) { + auto & layer = lora->layers[i]; + write_tensor(&file, layer.attention_norm_a, tni(LLM_TENSOR_ATTN_NORM, i, ".weight.loraA")); + write_tensor(&file, layer.attention_norm_b, tni(LLM_TENSOR_ATTN_NORM, i, ".weight.loraB")); + write_tensor(&file, layer.wq_a, tni(LLM_TENSOR_ATTN_Q, i, ".weight.loraA")); + write_tensor(&file, layer.wq_b, tni(LLM_TENSOR_ATTN_Q, i, ".weight.loraB")); + write_tensor(&file, layer.wk_a, tni(LLM_TENSOR_ATTN_K, i, ".weight.loraA")); + write_tensor(&file, layer.wk_b, tni(LLM_TENSOR_ATTN_K, i, ".weight.loraB")); + write_tensor(&file, layer.wv_a, tni(LLM_TENSOR_ATTN_V, i, ".weight.loraA")); + write_tensor(&file, layer.wv_b, tni(LLM_TENSOR_ATTN_V, i, ".weight.loraB")); + write_tensor(&file, layer.wo_a, tni(LLM_TENSOR_ATTN_OUT, i, ".weight.loraA")); + write_tensor(&file, layer.wo_b, tni(LLM_TENSOR_ATTN_OUT, i, ".weight.loraB")); + write_tensor(&file, layer.ffn_norm_a, tni(LLM_TENSOR_FFN_NORM, i, ".weight.loraA")); + write_tensor(&file, layer.ffn_norm_b, tni(LLM_TENSOR_FFN_NORM, i, ".weight.loraB")); + write_tensor(&file, layer.w1_a, tni(LLM_TENSOR_FFN_GATE, i, ".weight.loraA")); + write_tensor(&file, layer.w1_b, tni(LLM_TENSOR_FFN_GATE, i, ".weight.loraB")); + write_tensor(&file, layer.w2_a, tni(LLM_TENSOR_FFN_DOWN, i, ".weight.loraA")); + write_tensor(&file, layer.w2_b, tni(LLM_TENSOR_FFN_DOWN, i, ".weight.loraB")); + write_tensor(&file, layer.w3_a, tni(LLM_TENSOR_FFN_UP, i, ".weight.loraA")); + write_tensor(&file, layer.w3_b, tni(LLM_TENSOR_FFN_UP, i, ".weight.loraB")); + } +} + +struct train_params { + struct train_params_common common; + + const char * fn_model_base; + const char * fn_lora_out; + + bool only_write_lora; + + float f_norm_rms_eps; + float rope_freq_base; + float rope_freq_scale; + + bool custom_f_norm_rms_eps; + bool custom_rope_freq_base; + bool custom_rope_freq_scale; + + int32_t lora_r; + int32_t lora_alpha; + bool custom_lora_alpha; + + uint32_t n_rank_attention_norm; + uint32_t n_rank_wq; + uint32_t n_rank_wk; + uint32_t n_rank_wv; + uint32_t n_rank_wo; + uint32_t n_rank_ffn_norm; + uint32_t n_rank_w1; + uint32_t n_rank_w2; + uint32_t n_rank_w3; + uint32_t n_rank_tok_embeddings; + uint32_t n_rank_norm; + uint32_t n_rank_output; + + bool custom_n_rank_attention_norm; + bool custom_n_rank_wq; + bool custom_n_rank_wk; + bool custom_n_rank_wv; + bool custom_n_rank_wo; + bool custom_n_rank_ffn_norm; + bool custom_n_rank_w1; + bool custom_n_rank_w2; + bool custom_n_rank_w3; + bool custom_n_rank_tok_embeddings; + bool custom_n_rank_norm; + bool custom_n_rank_output; +}; + +static struct train_params get_default_train_params() { + struct train_params params; + params.common = get_default_train_params_common(); + params.fn_model_base = ""; + params.fn_lora_out = "ggml-lora-ITERATION-f32.gguf"; + + params.only_write_lora = false; + + params.f_norm_rms_eps = 1e-5f; + params.rope_freq_base = 10000.0f; + params.rope_freq_scale = 1.0f; + + params.custom_f_norm_rms_eps = false; + params.custom_rope_freq_base = false; + params.custom_rope_freq_scale = false; + + params.lora_r = 4; + params.lora_alpha = 4; + params.custom_lora_alpha = false; + + params.n_rank_attention_norm = 1; + params.n_rank_wq = 4; + params.n_rank_wk = 4; + params.n_rank_wv = 4; + params.n_rank_wo = 4; + params.n_rank_ffn_norm = 1; + params.n_rank_w1 = 4; + params.n_rank_w2 = 4; + params.n_rank_w3 = 4; + params.n_rank_tok_embeddings = 4; + params.n_rank_norm = 1; + params.n_rank_output = 4; + + params.custom_n_rank_attention_norm = false; + params.custom_n_rank_wq = false; + params.custom_n_rank_wk = false; + params.custom_n_rank_wv = false; + params.custom_n_rank_wo = false; + params.custom_n_rank_ffn_norm = false; + params.custom_n_rank_w1 = false; + params.custom_n_rank_w2 = false; + params.custom_n_rank_w3 = false; + params.custom_n_rank_tok_embeddings = false; + params.custom_n_rank_norm = false; + params.custom_n_rank_output = false; + + return params; +} + +static void train_print_usage(int argc, char ** argv, const struct train_params * params) { + fprintf(stderr, "usage: %s [options]\n", argv[0]); + fprintf(stderr, "\n"); + fprintf(stderr, "options:\n"); + fprintf(stderr, " -h, --help show this help message and exit\n"); + + fprintf(stderr, " --model-base FNAME model path from which to load base model (default '%s')\n", params->fn_model_base); + fprintf(stderr, " --lora-out FNAME path to save llama lora (default '%s')\n", params->fn_lora_out); + fprintf(stderr, " --only-write-lora only save llama lora, don't do any training. use this if you only want to convert a checkpoint to a lora adapter.\n"); + fprintf(stderr, " --norm-rms-eps F RMS-Norm epsilon value (default %f)\n", params->f_norm_rms_eps); + fprintf(stderr, " --rope-freq-base F Frequency base for ROPE (default %f)\n", params->rope_freq_base); + fprintf(stderr, " --rope-freq-scale F Frequency scale for ROPE (default %f)\n", params->rope_freq_scale); + fprintf(stderr, " --lora-alpha N LORA alpha : resulting LORA scaling is alpha/r. (default %d)\n", params->lora_alpha); + fprintf(stderr, " --lora-r N LORA r: default rank. Also specifies resulting scaling together with lora-alpha. (default %d)\n", params->lora_r); + fprintf(stderr, " --rank-att-norm N LORA rank for attention norm tensor, overrides default rank. Norm tensors should generally have rank 1.\n"); + fprintf(stderr, " --rank-ffn-norm N LORA rank for feed-forward norm tensor, overrides default rank. Norm tensors should generally have rank 1.\n"); + fprintf(stderr, " --rank-out-norm N LORA rank for output norm tensor, overrides default rank. Norm tensors should generally have rank 1.\n"); + fprintf(stderr, " --rank-tok-embd N LORA rank for token embeddings tensor, overrides default rank.\n"); + fprintf(stderr, " --rank-out N LORA rank for output tensor, overrides default rank.\n"); + fprintf(stderr, " --rank-wq N LORA rank for wq tensor, overrides default rank.\n"); + fprintf(stderr, " --rank-wk N LORA rank for wk tensor, overrides default rank.\n"); + fprintf(stderr, " --rank-wv N LORA rank for wv tensor, overrides default rank.\n"); + fprintf(stderr, " --rank-wo N LORA rank for wo tensor, overrides default rank.\n"); + fprintf(stderr, " --rank-w1 N LORA rank for w1 tensor, overrides default rank.\n"); + fprintf(stderr, " --rank-w2 N LORA rank for w2 tensor, overrides default rank.\n"); + fprintf(stderr, " --rank-w3 N LORA rank for w3 tensor, overrides default rank.\n"); + + print_common_train_usage(argc, argv, ¶ms->common); +} + +static bool train_params_parse(int argc, char ** argv, struct train_params * params) { + bool invalid_param = false; + std::string arg; + struct train_params default_params = get_default_train_params(); + const std::string arg_prefix = "--"; + + for (int i = 1; i < argc; i++) { + arg = argv[i]; + if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) { + std::replace(arg.begin(), arg.end(), '_', '-'); + } + + if (consume_common_train_arg(argc, argv, &i, ¶ms->common, &invalid_param)) { + if (invalid_param) { + break; + } else if (params->common.print_usage) { + train_print_usage(argc, argv, &default_params); + exit(0); + } + } else if (arg == "--model-base") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->fn_model_base = argv[i]; + } else if (arg == "--lora-out") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->fn_lora_out = argv[i]; + } else if (arg == "--only-write-lora") { + params->only_write_lora = true; + } else if (arg == "--norm-rms-eps") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->f_norm_rms_eps = std::stof(argv[i]); + params->custom_f_norm_rms_eps = true; + } else if (arg == "--rope-freq-base") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->rope_freq_base = std::stof(argv[i]); + params->custom_rope_freq_base = true; + } else if (arg == "--rope-freq-scale") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->rope_freq_scale = std::stof(argv[i]); + params->custom_rope_freq_scale = true; + } else if (arg == "--lora-alpha") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->lora_alpha = std::stoi(argv[i]); + params->custom_lora_alpha = true; + } else if (arg == "--lora-r") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->lora_r = std::stoi(argv[i]); + } else if (arg == "--rank-att-norm") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_rank_attention_norm = std::stoi(argv[i]); + params->custom_n_rank_attention_norm = true; + } else if (arg == "--rank-ffn-norm") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_rank_ffn_norm = std::stoi(argv[i]); + params->custom_n_rank_ffn_norm = true; + } else if (arg == "--rank-out-norm") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_rank_norm = std::stoi(argv[i]); + params->custom_n_rank_norm = true; + } else if (arg == "--rank-tok-embd") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_rank_tok_embeddings = std::stoi(argv[i]); + params->custom_n_rank_tok_embeddings = true; + } else if (arg == "--rank-out") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_rank_output = std::stoi(argv[i]); + params->custom_n_rank_output = true; + } else if (arg == "--rank-wq") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_rank_wq = std::stoi(argv[i]); + params->custom_n_rank_wq = true; + } else if (arg == "--rank-wk") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_rank_wk = std::stoi(argv[i]); + params->custom_n_rank_wk = true; + } else if (arg == "--rank-wv") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_rank_wv = std::stoi(argv[i]); + params->custom_n_rank_wv = true; + } else if (arg == "--rank-wo") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_rank_wo = std::stoi(argv[i]); + params->custom_n_rank_wo = true; + } else if (arg == "--rank-w1") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_rank_w1 = std::stoi(argv[i]); + params->custom_n_rank_w1 = true; + } else if (arg == "--rank-w2") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_rank_w2 = std::stoi(argv[i]); + params->custom_n_rank_w2 = true; + } else if (arg == "--rank-w3") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_rank_w3 = std::stoi(argv[i]); + params->custom_n_rank_w3 = true; + } else { + fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); + train_print_usage(argc, argv, &default_params); + exit(1); + } + } + if (invalid_param) { + fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); + train_print_usage(argc, argv, &default_params); + exit(1); + } + finish_processing_train_args(¶ms->common); + return true; +} + +struct save_train_files_data { + const char * fn_checkpoint_out; + const char * fn_lora_out; + const char * pattern_fn_it; + const char * fn_latest; + struct my_llama_model * model; + struct my_llama_lora * lora; +}; + +static void save_train_files(void * vdata, struct train_state * train) { + struct save_train_files_data * data = (struct save_train_files_data *) vdata; + + int64_t iter = train->opt->iter; + + if (strlen(data->fn_checkpoint_out) > 0) { + save_checkpoint_lora_file(get_train_filename(data->fn_checkpoint_out, data->pattern_fn_it, data->fn_latest, iter).c_str(), data->model, data->lora, train); + save_checkpoint_lora_file(get_train_filename(data->fn_checkpoint_out, data->pattern_fn_it, data->fn_latest, -1 ).c_str(), data->model, data->lora, train); + } + if (strlen(data->fn_lora_out) > 0) { + save_as_llama_lora(get_train_filename(data->fn_lora_out, data->pattern_fn_it, data->fn_latest, iter).c_str(), data->lora); + save_as_llama_lora(get_train_filename(data->fn_lora_out, data->pattern_fn_it, data->fn_latest, -1 ).c_str(), data->lora); + } +} + +static int64_t get_parameter_count(struct my_llama_lora* lora) { + int64_t nx = 0; + nx += ggml_nelements(lora->tok_embeddings_a); + nx += ggml_nelements(lora->tok_embeddings_b); + nx += ggml_nelements(lora->norm_a); + nx += ggml_nelements(lora->norm_b); + nx += ggml_nelements(lora->output_a); + nx += ggml_nelements(lora->output_b); + + for (uint32_t i = 0; i < lora->layers.size(); ++i) { + auto & layer = lora->layers[i]; + nx += ggml_nelements(layer.attention_norm_a); + nx += ggml_nelements(layer.attention_norm_b); + nx += ggml_nelements(layer.wq_a); + nx += ggml_nelements(layer.wq_b); + nx += ggml_nelements(layer.wk_a); + nx += ggml_nelements(layer.wk_b); + nx += ggml_nelements(layer.wv_a); + nx += ggml_nelements(layer.wv_b); + nx += ggml_nelements(layer.wo_a); + nx += ggml_nelements(layer.wo_b); + nx += ggml_nelements(layer.ffn_norm_a); + nx += ggml_nelements(layer.ffn_norm_b); + nx += ggml_nelements(layer.w1_a); + nx += ggml_nelements(layer.w1_b); + nx += ggml_nelements(layer.w2_a); + nx += ggml_nelements(layer.w2_b); + nx += ggml_nelements(layer.w3_a); + nx += ggml_nelements(layer.w3_b); + } + return nx; +} + +int main(int argc, char ** argv) { + struct train_params params = get_default_train_params(); + + if (!train_params_parse(argc, argv, ¶ms)) { + return 1; + } + + if (params.common.seed == LLAMA_DEFAULT_SEED) { + params.common.seed = time(NULL); + } + printf("%s: seed: %u\n", __func__, params.common.seed); + srand(params.common.seed); + + struct llama_model_params llama_mparams = llama_model_default_params(); + llama_mparams.vocab_only = false; + + printf("%s: model base = '%s'\n", __func__, params.fn_model_base); + struct llama_model * lmodel = llama_load_model_from_file(params.fn_model_base, llama_mparams); + + struct llama_context_params llama_cparams = llama_context_default_params(); + struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_cparams); + + struct my_llama_model model; + init_model(lmodel, &model, params.fn_model_base, params.common.n_ctx); + + struct my_llama_lora lora; + + struct train_state * train = init_train_state(); + struct ggml_opt_context * opt = train->opt; + + // set params from command line + if (params.custom_f_norm_rms_eps) { + model.hparams.f_norm_rms_eps = params.f_norm_rms_eps; + } + if (params.custom_rope_freq_base) { + model.hparams.rope_freq_base = params.rope_freq_base; + } + if (params.custom_rope_freq_scale) { + model.hparams.rope_freq_scale = params.rope_freq_scale; + } + lora.hparams.lora_r = params.lora_r; + lora.hparams.lora_alpha = params.custom_lora_alpha ? params.lora_alpha : params.lora_r; + uint32_t n_rank_attention_norm = params.custom_n_rank_attention_norm ? params.n_rank_attention_norm : 1; + uint32_t n_rank_wq = params.custom_n_rank_wq ? params.n_rank_wq : params.lora_r; + uint32_t n_rank_wk = params.custom_n_rank_wk ? params.n_rank_wk : params.lora_r; + uint32_t n_rank_wv = params.custom_n_rank_wv ? params.n_rank_wv : params.lora_r; + uint32_t n_rank_wo = params.custom_n_rank_wo ? params.n_rank_wo : params.lora_r; + uint32_t n_rank_ffn_norm = params.custom_n_rank_ffn_norm ? params.n_rank_ffn_norm : 1; + uint32_t n_rank_w1 = params.custom_n_rank_w1 ? params.n_rank_w1 : params.lora_r; + uint32_t n_rank_w2 = params.custom_n_rank_w2 ? params.n_rank_w2 : params.lora_r; + uint32_t n_rank_w3 = params.custom_n_rank_w3 ? params.n_rank_w3 : params.lora_r; + uint32_t n_rank_tok_embeddings = params.custom_n_rank_tok_embeddings ? params.n_rank_tok_embeddings : params.lora_r; + uint32_t n_rank_norm = params.custom_n_rank_norm ? params.n_rank_norm : 1; + uint32_t n_rank_output = params.custom_n_rank_output ? params.n_rank_output : params.lora_r; + lora.hparams.n_rank_attention_norm = n_rank_attention_norm; + lora.hparams.n_rank_wq = n_rank_wq; + lora.hparams.n_rank_wk = n_rank_wk; + lora.hparams.n_rank_wv = n_rank_wv; + lora.hparams.n_rank_wo = n_rank_wo; + lora.hparams.n_rank_ffn_norm = n_rank_ffn_norm; + lora.hparams.n_rank_w1 = n_rank_w1; + lora.hparams.n_rank_w2 = n_rank_w2; + lora.hparams.n_rank_w3 = n_rank_w3; + lora.hparams.n_rank_tok_embeddings = n_rank_tok_embeddings; + lora.hparams.n_rank_norm = n_rank_norm; + lora.hparams.n_rank_output = n_rank_output; + + // set opt params from command line + opt->params = ggml_opt_default_params(GGML_OPT_ADAM); + opt->params.print_forward_graph = false; + opt->params.print_backward_graph = false; + opt->params.n_threads = params.common.n_threads; + opt->params.past = params.common.opt_past; + opt->params.delta = params.common.opt_delta; + opt->params.max_no_improvement = params.common.opt_max_no_improvement; + opt->params.n_gradient_accumulation = params.common.n_gradient_accumulation; + opt->params.adam.n_iter = params.common.adam_n_iter; + opt->params.adam.sched = 1.0f; + opt->params.adam.alpha = params.common.adam_alpha; + opt->params.adam.decay = params.common.adam_decay; + opt->params.adam.decay_min_ndim = params.common.adam_decay_min_ndim; + opt->params.adam.beta1 = params.common.adam_beta1; + opt->params.adam.beta2 = params.common.adam_beta2; + opt->params.adam.gclip = params.common.adam_gclip; + opt->params.adam.eps_f = params.common.adam_eps_f; + + ggml_allocr * alloc = NULL; + + printf("%s: init model\n", __func__); + bool existed = load_checkpoint_lora_file(params.common.fn_checkpoint_in, &model, &lora, train); + + if (existed) { + // overwrite last n_ctx with user provided n_ctx + if (params.common.custom_n_ctx) { + model.hparams.n_ctx = params.common.n_ctx; + } + + const bool opt_param_count_changed = ( + (lora.hparams.n_rank_attention_norm != n_rank_attention_norm) + || (lora.hparams.n_rank_wq != n_rank_wq) + || (lora.hparams.n_rank_wk != n_rank_wk) + || (lora.hparams.n_rank_wv != n_rank_wv) + || (lora.hparams.n_rank_wo != n_rank_wo) + || (lora.hparams.n_rank_ffn_norm != n_rank_ffn_norm) + || (lora.hparams.n_rank_w1 != n_rank_w1) + || (lora.hparams.n_rank_w2 != n_rank_w2) + || (lora.hparams.n_rank_w3 != n_rank_w3) + || (lora.hparams.n_rank_tok_embeddings != n_rank_tok_embeddings) + || (lora.hparams.n_rank_norm != n_rank_norm) + || (lora.hparams.n_rank_output != n_rank_output) + ); + + const bool opt_past_changed = opt->params.past != params.common.opt_past; + + if (opt_param_count_changed) { + print_lora_params(&lora.hparams); + die("Provided rank differs from checkpoint file. To use different rank start finetune from scratch with empty input checkpoint, e.g --checkpoint-in ''. Aborting."); + // need to discard previous optimizer gradient statistics and opt_init with new shapes + // TODO + } + if (opt_past_changed) { + die("Optimizer parameter '--opt-past N' differs from checkpoint file. To use different value finetune from scratch with empty input checkpoint, e.g --checkpoint-in ''. Aborting"); + // need to discard previous optimizer past function value statistics and opt_init with new shapes + // TODO + } + } else { // existed == false + init_lora(&model, &lora); + randomize_lora(&lora, params.common.seed, 0.0f, 1.0f, -1.0f, +1.0f); + if (!params.only_write_lora) { + ggml_opt_init(opt->ctx, opt, opt->params, get_parameter_count(&lora)); + } + } + opt->iter = train->train_its; + + print_params(&model.hparams); + print_lora_params(&lora.hparams); + printf("%s: total train_iterations %llu\n", __func__, (long long unsigned) train->train_its); + printf("%s: seen train_samples %llu\n", __func__, (long long unsigned) train->train_samples); + printf("%s: seen train_tokens %llu\n", __func__, (long long unsigned) train->train_tokens); + printf("%s: completed train_epochs %llu\n", __func__, (long long unsigned) train->train_epochs); + printf("%s: lora_size = %zu bytes (%.1f MB)\n", __func__, (ggml_used_mem(lora.ctx) + lora.data.size()), (float) (ggml_used_mem(lora.ctx) + lora.data.size()) / (1024.0f*1024.0f)); + + if (params.only_write_lora) { + save_train_files_data save_data; + save_data.fn_checkpoint_out = ""; + save_data.fn_lora_out = params.fn_lora_out; + save_data.pattern_fn_it = params.common.pattern_fn_it; + save_data.fn_latest = params.common.fn_latest; + save_data.model = &model; + save_data.lora = &lora; + + save_train_files(&save_data, train); + + free_train_state(train); + ggml_free(lora.ctx); + llama_free(lctx); + llama_free_model(lmodel); + return 0; + } + + printf("%s: opt_size = %zu bytes (%.1f MB)\n", __func__, ggml_get_mem_size(opt->ctx), (float) ggml_get_mem_size(opt->ctx) / (1024.0f*1024.0f)); + printf("%s: opt iter %d\n", __func__, opt->iter); + + int n_tokens = model.hparams.n_ctx; + int n_vocab = model.hparams.n_vocab; + int n_batch = params.common.n_batch; + + + std::vector mem_input_data; + std::vector mem_compute_data; + + // context for input tensors without their data + struct ggml_init_params ctx_input_params = { + ggml_tensor_overhead() * 2, // mem_size + NULL, // mem_buffer + true, // no_alloc + }; + struct ggml_context * ctx_input = ggml_init(ctx_input_params); + + // the input tensors + struct ggml_tensor * tokens_input = ggml_new_tensor_2d(ctx_input, GGML_TYPE_I32, n_tokens, n_batch); + struct ggml_tensor * target_probs = ggml_new_tensor_3d(ctx_input, GGML_TYPE_F32, n_vocab, n_tokens, n_batch); + + // measure required memory for input tensors + alloc = ggml_allocr_new_measure(tensor_alignment); + ggml_allocr_alloc(alloc, tokens_input); + ggml_allocr_alloc(alloc, target_probs); + size_t max_input_size = ggml_allocr_max_size(alloc) + tensor_alignment; + ggml_allocr_free(alloc); + printf("%s: input_size = %zu bytes (%.1f MB)\n", __func__, max_input_size, (float) max_input_size / (1024.0f*1024.0f)); + + // allocate input tensors + mem_input_data.resize(max_input_size); + alloc = ggml_allocr_new(mem_input_data.data(), mem_input_data.size(), tensor_alignment); + ggml_allocr_alloc(alloc, tokens_input); + ggml_allocr_alloc(alloc, target_probs); + ggml_allocr_free(alloc); + + // context for compute tensors without their data + size_t estimated_compute_size_wo_data = ( + ggml_tensor_overhead()*GGML_MAX_NODES*2 + + (GGML_OBJECT_SIZE+GGML_GRAPH_SIZE)*( + params.common.use_checkpointing ? 3 : 2 + ) + ); + struct ggml_init_params ctx_compute_params = { + estimated_compute_size_wo_data, // mem_size + NULL, // mem_buffer + true, // no_alloc + }; + struct ggml_context * ctx_compute = NULL; + + struct ggml_tensor * loss = NULL; + struct ggml_tensor * logits = NULL; + + struct ggml_cgraph * gf = NULL; + struct ggml_cgraph * gb = NULL; + struct ggml_cgraph * gb_tmp = NULL; + + // measure required memory for compute tensors + size_t best_compute_size = SIZE_MAX; + enum ggml_cgraph_eval_order best_order = GGML_CGRAPH_EVAL_ORDER_COUNT; + // find best evaluation order + for (unsigned order = 0; order < (unsigned) GGML_CGRAPH_EVAL_ORDER_COUNT; ++order) { + ctx_compute = ggml_init(ctx_compute_params); + alloc = ggml_allocr_new_measure(tensor_alignment); + gf = ggml_new_graph(ctx_compute); + gf->order = (enum ggml_cgraph_eval_order) order; + gb = ggml_new_graph(ctx_compute); + gb_tmp = params.common.use_checkpointing + ? ggml_new_graph(ctx_compute) + : NULL; + loss = llama_build_lora_finetune_graphs( + &model, &lora, alloc, ctx_compute, + gf, gb, gb_tmp, + &logits, tokens_input, target_probs, + n_tokens, n_batch, + params.common.use_flash, + params.common.use_checkpointing + ); + size_t max_compute_size = ggml_allocr_max_size(alloc) + tensor_alignment; + if (max_compute_size < best_compute_size) { + best_compute_size = max_compute_size; + best_order = gf->order; + } + ggml_allocr_free(alloc); + ggml_free(ctx_compute); + } + size_t max_compute_size = best_compute_size; + printf("%s: compute_size = %zu bytes (%.1f MB)\n", __func__, max_compute_size, (float) max_compute_size / (1024.0f*1024.0f)); + printf("%s: evaluation order = %s\n", __func__, + (best_order == GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT) ? "LEFT_TO_RIGHT" : + (best_order == GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT) ? "RIGHT_TO_LEFT" : + "invalid"); + + // allocate compute tensors + mem_compute_data.resize(max_compute_size); + ctx_compute = ggml_init(ctx_compute_params); + alloc = ggml_allocr_new(mem_compute_data.data(), mem_compute_data.size(), tensor_alignment); + gf = ggml_new_graph(ctx_compute); + gf->order = best_order; + gb = ggml_new_graph(ctx_compute); + gb_tmp = params.common.use_checkpointing + ? ggml_new_graph(ctx_compute) + : NULL; + loss = llama_build_lora_finetune_graphs( + &model, &lora, alloc, ctx_compute, + gf, gb, gb_tmp, + &logits, tokens_input, target_probs, + n_tokens, n_batch, + params.common.use_flash, + params.common.use_checkpointing + ); + ggml_allocr_free(alloc); + + // tokenize data + std::vector train_tokens; + std::vector train_samples_begin; + std::vector train_samples_size; + printf("%s: tokenize training data\n", __func__); + tokenize_file(lctx, + params.common.fn_train_data, + params.common.sample_start, + params.common.include_sample_start, + params.common.overlapping_samples, + n_tokens, + train_tokens, + train_samples_begin, + train_samples_size); + GGML_ASSERT(train_samples_begin.size() == train_samples_size.size()); + + printf("%s: number of training tokens: %zu\n", __func__, train_tokens.size()); + + std::vector token_noccurs; + token_noccurs.resize(model.hparams.n_vocab, 0); + for (unsigned int i = 0; i < train_tokens.size(); ++i) { + ++token_noccurs[train_tokens[i]]; + } + int n_unique_tokens = 0; + for (unsigned int i = 0; i < token_noccurs.size(); ++i) { + if (token_noccurs[i] == 0) continue; + ++n_unique_tokens; + } + printf("%s: number of unique tokens: %d\n", __func__, n_unique_tokens); + + size_t shuffle_samples_hash = compute_samples_hash(params.common.fn_train_data, train_samples_begin.data(), train_samples_size.data(), train_samples_size.size()); + const bool changed_train_data = (shuffle_samples_hash != train->shuffle_samples_hash) || (train->shuffle_sample_count != train_samples_size.size()); + if (changed_train_data) { + printf("%s: train data seems to have changed. restarting shuffled epoch.\n", __func__); + } + if (params.common.force_reshuffle) { + printf("%s: forced reshuffling of data. restarting with newly shuffled epoch.\n", __func__); + } + if ((train->shuffle_rng_state_current == "") || changed_train_data || params.common.force_reshuffle) { + train->shuffle_rng_state_current = mt19937_seed_to_state(params.common.seed); + train->shuffle_sample_count = train_samples_size.size(); + train->shuffle_next_sample = 0; + train->shuffle_samples_hash = shuffle_samples_hash; + } + std::vector train_shuffled_samples_offs; + std::vector train_shuffled_samples_begin; + std::vector train_shuffled_samples_size; + train_shuffled_samples_offs.resize(train_samples_begin.size()); + train_shuffled_samples_begin.resize(train_samples_begin.size()); + train_shuffled_samples_size.resize(train_samples_size.size()); + train->shuffle_rng_state_next = shuffle_samples( + train->shuffle_rng_state_current, + train_shuffled_samples_offs.data(), + train_shuffled_samples_begin.data(), + train_shuffled_samples_size.data(), + train_samples_begin.data(), + train_samples_size.data(), + train_samples_size.size()); + + printf("%s: begin training\n", __func__); + + save_train_files_data save_data; + save_data.fn_checkpoint_out = params.common.fn_checkpoint_out; + save_data.fn_lora_out = params.fn_lora_out; + save_data.pattern_fn_it = params.common.pattern_fn_it; + save_data.fn_latest = params.common.fn_latest; + save_data.model = &model; + save_data.lora = &lora; + + struct train_opt_callback_data opt_cb_data; + opt_cb_data.params = ¶ms.common; + opt_cb_data.train = train; + opt_cb_data.save_cb = &save_train_files; + opt_cb_data.save_data = &save_data; + opt_cb_data.lctx = lctx; + opt_cb_data.last_save_iter = opt->iter; + opt_cb_data.tokens_data = train_tokens.data(); + opt_cb_data.tokens_size = train_tokens.size(); + opt_cb_data.samples_begin = train_samples_begin.data(); + opt_cb_data.samples_size = train_samples_size.data(); + opt_cb_data.shuffled_samples_offs = train_shuffled_samples_offs.data(); + opt_cb_data.shuffled_samples_begin = train_shuffled_samples_begin.data(); + opt_cb_data.shuffled_samples_size = train_shuffled_samples_size.data(); + opt_cb_data.samples_count = train_samples_size.size(); + opt_cb_data.tokens_input = tokens_input; + opt_cb_data.target_probs = target_probs; + opt_cb_data.first_iter = opt->iter; + opt_cb_data.first_epoch = train->train_epochs; + opt_cb_data.iter_at_last_epoch = -1; + opt_cb_data.last_time = ggml_time_ms(); + opt_cb_data.millis_per_iter = 0.0; + + // measure required memory for work buffer + size_t max_work_size = ggml_graph_plan(gb, params.common.n_threads).work_size + GGML_OBJECT_SIZE; + printf("%s: work_size = %zu bytes (%.1f MB)\n", __func__, max_work_size, (float) max_work_size / (1024.0f*1024.0f)); + + // context for work buffer + struct ggml_init_params ctx_work_params = { + max_work_size, // mem_size + NULL, // mem_buffer + false, // no_alloc + }; + struct ggml_context * ctx_work = ggml_init(ctx_work_params); + + int64_t t0 = ggml_time_ms(); + + ggml_opt_resume_g(ctx_work, opt, loss, gf, gb, &train_opt_callback, (void *) &opt_cb_data); + + ggml_free(ctx_work); + ggml_free(ctx_compute); + ggml_free(ctx_input); + + int64_t t1 = ggml_time_ms(); + printf("%s: total training time: ", __func__); + print_duration((double) (t1 - t0)); + printf("\n"); + + int new_iters = opt->iter - opt_cb_data.last_save_iter; + if (new_iters > 0) { + train->train_its += new_iters; + train->train_tokens += new_iters * opt->params.n_gradient_accumulation * n_batch * n_tokens; + + save_train_files(&save_data, train); + opt_cb_data.last_save_iter = opt->iter; + } + + ggml_free(opt->ctx); + free_train_state(train); + ggml_free(lora.ctx); + llama_free(lctx); + llama_free_model(lmodel); + return 0; +} diff --git a/examples/gptneox-wip/falcon-main.cpp b/examples/gptneox-wip/falcon-main.cpp index 7f9a1620b60bf7e70136233c6befc8021116f2b9..e9197f6b51b418b771dff6d8c5ed7c476741bfe1 100644 --- a/examples/gptneox-wip/falcon-main.cpp +++ b/examples/gptneox-wip/falcon-main.cpp @@ -367,10 +367,10 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt2bpe_ keyidx = gguf_find_key(ggufctx, "general.architecture"); if (keyidx != -1) { printf("%s: model architecture = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } keyidx = gguf_find_key(ggufctx, "general.file_type"); - if (keyidx != -1) { printf("%s: model file type = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } + if (keyidx != -1) { printf("%s: model file type = %" PRIu32 "\n", __func__, gguf_get_val_u32(ggufctx, keyidx)); } keyidx = gguf_find_key(ggufctx, "gptneox.tensor_data_layout"); if (keyidx != -1) { printf("%s: model data layout = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } - keyidx = gguf_find_key(ggufctx, "general.source.hugginface.repository"); + keyidx = gguf_find_key(ggufctx, "general.source.huggingface.repository"); if (keyidx != -1) { printf("%s: model source HF repo = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } } diff --git a/examples/gptneox-wip/gptneox-main.cpp b/examples/gptneox-wip/gptneox-main.cpp index 55eba0cdcfdfb78885c7ef7852d8fdcb215d324c..b76bafaa8c51f311010f1dcc0a31da3dc2ff8cf3 100644 --- a/examples/gptneox-wip/gptneox-main.cpp +++ b/examples/gptneox-wip/gptneox-main.cpp @@ -380,10 +380,10 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2 keyidx = gguf_find_key(ggufctx, "general.architecture"); if (keyidx != -1) { printf("%s: model architecture = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } keyidx = gguf_find_key(ggufctx, "general.file_type"); - if (keyidx != -1) { printf("%s: model file type = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } + if (keyidx != -1) { printf("%s: model file type = %" PRIu32 "\n", __func__, gguf_get_val_u32(ggufctx, keyidx)); } keyidx = gguf_find_key(ggufctx, "gptneox.tensor_data_layout"); if (keyidx != -1) { printf("%s: model data layout = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } - keyidx = gguf_find_key(ggufctx, "general.source.hugginface.repository"); + keyidx = gguf_find_key(ggufctx, "general.source.huggingface.repository"); if (keyidx != -1) { printf("%s: model source HF repo = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } } diff --git a/examples/llama-bench/README.md b/examples/llama-bench/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d02824bfa8d2fceeb032364cb2de3b1725150e41 --- /dev/null +++ b/examples/llama-bench/README.md @@ -0,0 +1,271 @@ +# llama.cpp/example/llama-bench + +Performance testing tool for llama.cpp. + +## Table of contents + +1. [Syntax](#syntax) +2. [Examples](#examples) + 1. [Text generation with different models](#text-generation-with-different-models) + 2. [Prompt processing with different batch sizes](#prompt-processing-with-different-batch-sizes) + 3. [Different numbers of threads](#different-numbers-of-threads) + 4. [Different numbers of layers offloaded to the GPU](#different-numbers-of-layers-offloaded-to-the-gpu) +3. [Output formats](#output-formats) + 1. [Markdown](#markdown) + 2. [CSV](#csv) + 3. [JSON](#json) + 4. [SQL](#sql) + +## Syntax + +``` +usage: ./llama-bench [options] + +options: + -h, --help + -m, --model (default: models/7B/ggml-model-q4_0.gguf) + -p, --n-prompt (default: 512) + -n, --n-gen (default: 128) + -b, --batch-size (default: 512) + --memory-f32 <0|1> (default: 0) + -t, --threads (default: 16) + -ngl N, --n-gpu-layers (default: 99) + -mg i, --main-gpu (default: 0) + -mmq, --mul-mat-q <0|1> (default: 1) + -ts, --tensor_split + -r, --repetitions (default: 5) + -o, --output (default: md) + -v, --verbose (default: 0) + +Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times. +``` + +llama-bench can perform two types of tests: + +- Prompt processing (pp): processing a prompt in batches (`-p`) +- Text generation (tg): generating a sequence of tokens (`-n`) + +With the exception of `-r`, `-o` and `-v`, all options can be specified multiple times to run multiple tests. Each pp and tg test is run with all combinations of the specified options. To specify multiple values for an option, the values can be separated by commas (e.g. `-n 16,32`), or the option can be specified multiple times (e.g. `-n 16 -n 32`). + +Each test is repeated the number of times given by `-r`, and the results are averaged. The results are given in average tokens per second (t/s) and standard deviation. Some output formats (e.g. json) also include the individual results of each repetition. + +For a description of the other options, see the [main example](../main/README.md). + +## Examples + +### Text generation with different models + +```sh +$ ./llama-bench -m models/7B/ggml-model-q4_0.gguf -m models/13B/ggml-model-q4_0.gguf -p 0 -n 128,256,512 +``` + +| model | size | params | backend | ngl | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---------- | ---------------: | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 128 | 132.19 ± 0.55 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 256 | 129.37 ± 0.54 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 512 | 123.83 ± 0.25 | +| llama 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | 99 | tg 128 | 82.17 ± 0.31 | +| llama 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | 99 | tg 256 | 80.74 ± 0.23 | +| llama 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | 99 | tg 512 | 78.08 ± 0.07 | + +### Prompt processing with different batch sizes + +```sh +$ ./llama-bench -n 0 -p 1024 -b 128,256,512,1024 +``` + +| model | size | params | backend | ngl | n_batch | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---------: | ---------- | ---------------: | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 128 | pp 1024 | 1436.51 ± 3.66 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 256 | pp 1024 | 1932.43 ± 23.48 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 512 | pp 1024 | 2254.45 ± 15.59 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 1024 | pp 1024 | 2498.61 ± 13.58 | + +### Different numbers of threads + +```sh +$ ./llama-bench -n 0 -n 16 -p 64 -t 1,2,4,8,16,32 +``` + +| model | size | params | backend | threads | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | ---------: | ---------- | ---------------: | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 1 | pp 64 | 6.17 ± 0.07 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 1 | tg 16 | 4.05 ± 0.02 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 2 | pp 64 | 12.31 ± 0.13 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 2 | tg 16 | 7.80 ± 0.07 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 4 | pp 64 | 23.18 ± 0.06 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 4 | tg 16 | 12.22 ± 0.07 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 8 | pp 64 | 32.29 ± 1.21 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 8 | tg 16 | 16.71 ± 0.66 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 16 | pp 64 | 33.52 ± 0.03 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 16 | tg 16 | 15.32 ± 0.05 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 32 | pp 64 | 59.00 ± 1.11 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 32 | tg 16 | 16.41 ± 0.79 || + +### Different numbers of layers offloaded to the GPU + +```sh +$ ./llama-bench -ngl 10,20,30,31,32,33,34,35 +``` + +| model | size | params | backend | ngl | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---------- | ---------------: | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 10 | pp 512 | 373.36 ± 2.25 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 10 | tg 128 | 13.45 ± 0.93 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 20 | pp 512 | 472.65 ± 1.25 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 20 | tg 128 | 21.36 ± 1.94 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 30 | pp 512 | 631.87 ± 11.25 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 30 | tg 128 | 40.04 ± 1.82 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 31 | pp 512 | 657.89 ± 5.08 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 31 | tg 128 | 48.19 ± 0.81 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 32 | pp 512 | 688.26 ± 3.29 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 32 | tg 128 | 54.78 ± 0.65 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 33 | pp 512 | 704.27 ± 2.24 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 33 | tg 128 | 60.62 ± 1.76 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 34 | pp 512 | 881.34 ± 5.40 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 34 | tg 128 | 71.76 ± 0.23 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 35 | pp 512 | 2400.01 ± 7.72 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 35 | tg 128 | 131.66 ± 0.49 | + +## Output formats + +By default, llama-bench outputs the results in markdown format. The results can be output in other formats by using the `-o` option. + +### Markdown + +```sh +$ ./llama-bench -o md +``` + +| model | size | params | backend | ngl | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---------- | ---------------: | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | pp 512 | 2368.80 ± 93.24 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 128 | 131.42 ± 0.59 | + +### CSV + +```sh +$ ./llama-bench -o csv +``` + +```csv +build_commit,build_number,cuda,opencl,metal,gpu_blas,blas,cpu_info,gpu_info,model_filename,model_type,model_size,model_n_params,n_batch,n_threads,f16_kv,n_gpu_layers,main_gpu,mul_mat_q,tensor_split,n_prompt,n_gen,test_time,avg_ns,stddev_ns,avg_ts,stddev_ts +"3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","llama 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","512","0","2023-09-23T12:09:01Z","212155977","732372","2413.341687","8.305961" +"3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","llama 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","0","128","2023-09-23T12:09:02Z","969320879","2728399","132.052051","0.371342" +``` + +### JSON + +```sh +$ ./llama-bench -o json +``` + +```json +[ + { + "build_commit": "3469684", + "build_number": 1275, + "cuda": true, + "opencl": false, + "metal": false, + "gpu_blas": true, + "blas": true, + "cpu_info": "13th Gen Intel(R) Core(TM) i9-13900K", + "gpu_info": "NVIDIA GeForce RTX 3090 Ti", + "model_filename": "models/7B/ggml-model-q4_0.gguf", + "model_type": "llama 7B mostly Q4_0", + "model_size": 3825065984, + "model_n_params": 6738415616, + "n_batch": 512, + "n_threads": 16, + "f16_kv": true, + "n_gpu_layers": 99, + "main_gpu": 0, + "mul_mat_q": true, + "tensor_split": "0.00", + "n_prompt": 512, + "n_gen": 0, + "test_time": "2023-09-23T12:09:57Z", + "avg_ns": 212365953, + "stddev_ns": 985423, + "avg_ts": 2410.974041, + "stddev_ts": 11.163766, + "samples_ns": [ 213837238, 211635853, 212328053, 211329715, 212698907 ], + "samples_ts": [ 2394.34, 2419.25, 2411.36, 2422.75, 2407.16 ] + }, + { + "build_commit": "3469684", + "build_number": 1275, + "cuda": true, + "opencl": false, + "metal": false, + "gpu_blas": true, + "blas": true, + "cpu_info": "13th Gen Intel(R) Core(TM) i9-13900K", + "gpu_info": "NVIDIA GeForce RTX 3090 Ti", + "model_filename": "models/7B/ggml-model-q4_0.gguf", + "model_type": "llama 7B mostly Q4_0", + "model_size": 3825065984, + "model_n_params": 6738415616, + "n_batch": 512, + "n_threads": 16, + "f16_kv": true, + "n_gpu_layers": 99, + "main_gpu": 0, + "mul_mat_q": true, + "tensor_split": "0.00", + "n_prompt": 0, + "n_gen": 128, + "test_time": "2023-09-23T12:09:59Z", + "avg_ns": 977425219, + "stddev_ns": 9268593, + "avg_ts": 130.965708, + "stddev_ts": 1.238924, + "samples_ns": [ 984472709, 974901233, 989474741, 970729355, 967548060 ], + "samples_ts": [ 130.019, 131.295, 129.362, 131.86, 132.293 ] + } +] +``` + +### SQL + +SQL output is suitable for importing into a SQLite database. The output can be piped into the `sqlite3` command line tool to add the results to a database. + +```sh +$ ./llama-bench -o sql +``` + +```sql +CREATE TABLE IF NOT EXISTS test ( + build_commit TEXT, + build_number INTEGER, + cuda INTEGER, + opencl INTEGER, + metal INTEGER, + gpu_blas INTEGER, + blas INTEGER, + cpu_info TEXT, + gpu_info TEXT, + model_filename TEXT, + model_type TEXT, + model_size INTEGER, + model_n_params INTEGER, + n_batch INTEGER, + n_threads INTEGER, + f16_kv INTEGER, + n_gpu_layers INTEGER, + main_gpu INTEGER, + mul_mat_q INTEGER, + tensor_split TEXT, + n_prompt INTEGER, + n_gen INTEGER, + test_time TEXT, + avg_ns INTEGER, + stddev_ns INTEGER, + avg_ts REAL, + stddev_ts REAL +); + +INSERT INTO test (build_commit, build_number, cuda, opencl, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '512', '0', '2023-09-23T12:10:30Z', '212693772', '743623', '2407.240204', '8.409634'); +INSERT INTO test (build_commit, build_number, cuda, opencl, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '0', '128', '2023-09-23T12:10:31Z', '977925003', '4037361', '130.891159', '0.537692'); +``` diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 34ddfde39d29575fa581575fafbf2a2bfd90c6f6..a04115c962655ac70a3de0cd721537ecf40d3f58 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -132,7 +132,6 @@ struct cmd_params { std::vector n_gpu_layers; std::vector main_gpu; std::vector mul_mat_q; - std::vector low_vram; std::vector> tensor_split; int reps; bool verbose; @@ -149,7 +148,6 @@ static const cmd_params cmd_params_defaults = { /* n_gpu_layers */ {99}, /* main_gpu */ {0}, /* mul_mat_q */ {true}, - /* low_vram */ {false}, /* tensor_split */ {{}}, /* reps */ 5, /* verbose */ false, @@ -167,9 +165,8 @@ static void print_usage(int /* argc */, char ** argv) { printf(" -b, --batch-size (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str()); printf(" --memory-f32 <0|1> (default: %s)\n", join(cmd_params_defaults.f32_kv, ",").c_str()); printf(" -t, --threads (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str()); - printf(" -ngl N, --n-gpu-layers (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str()); - printf(" -mg i, --main-gpu (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str()); - printf(" -lv, --low-vram <0|1> (default: %s)\n", join(cmd_params_defaults.low_vram, ",").c_str()); + printf(" -ngl, --n-gpu-layers (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str()); + printf(" -mg, --main-gpu (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str()); printf(" -mmq, --mul-mat-q <0|1> (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str()); printf(" -ts, --tensor_split \n"); printf(" -r, --repetitions (default: %d)\n", cmd_params_defaults.reps); @@ -255,13 +252,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { break; } params.main_gpu = split(argv[i], split_delim); - } else if (arg == "-lv" || arg == "--low-vram") { - if (++i >= argc) { - invalid_param = true; - break; - } - auto p = split(argv[i], split_delim); - params.low_vram.insert(params.low_vram.end(), p.begin(), p.end()); } else if (arg == "-mmq" || arg == "--mul-mat-q") { if (++i >= argc) { invalid_param = true; @@ -336,7 +326,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; } if (params.main_gpu.empty()) { params.main_gpu = cmd_params_defaults.main_gpu; } if (params.mul_mat_q.empty()) { params.mul_mat_q = cmd_params_defaults.mul_mat_q; } - if (params.low_vram.empty()) { params.low_vram = cmd_params_defaults.low_vram; } if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; } if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; } @@ -353,21 +342,34 @@ struct cmd_params_instance { int n_gpu_layers; int main_gpu; bool mul_mat_q; - bool low_vram; std::array tensor_split; - llama_context_params to_llama_params() const { - llama_context_params lparams = llama_context_default_params(); - lparams.n_ctx = n_prompt + n_gen; - lparams.n_batch = n_batch; - lparams.f16_kv = !f32_kv; - lparams.n_gpu_layers = n_gpu_layers; - lparams.main_gpu = main_gpu; - lparams.mul_mat_q = mul_mat_q; - lparams.low_vram = low_vram; - lparams.tensor_split = tensor_split.data(); + llama_model_params to_llama_mparams() const { + llama_model_params mparams = llama_model_default_params(); + + mparams.n_gpu_layers = n_gpu_layers; + mparams.main_gpu = main_gpu; + mparams.tensor_split = tensor_split.data(); + + return mparams; + } + + bool equal_mparams(const cmd_params_instance & other) const { + return model == other.model && + n_gpu_layers == other.n_gpu_layers && + main_gpu == other.main_gpu && + tensor_split == other.tensor_split; + } + + llama_context_params to_llama_cparams() const { + llama_context_params cparams = llama_context_default_params(); + + cparams.n_ctx = n_prompt + n_gen; + cparams.n_batch = n_batch; + cparams.f16_kv = !f32_kv; + cparams.mul_mat_q = mul_mat_q; - return lparams; + return cparams; } }; @@ -375,13 +377,12 @@ static std::vector get_cmd_params_instances_int(const cmd_p std::vector instances; for (const auto & m : params.model) - for (const auto & nb : params.n_batch) - for (const auto & fk : params.f32_kv) for (const auto & nl : params.n_gpu_layers) for (const auto & mg : params.main_gpu) - for (const auto & mmq : params.mul_mat_q) - for (const auto & lv : params.low_vram) for (const auto & ts : params.tensor_split) + for (const auto & nb : params.n_batch) + for (const auto & fk : params.f32_kv) + for (const auto & mmq : params.mul_mat_q) for (const auto & nt : params.n_threads) { cmd_params_instance instance = { /* .model = */ m, @@ -393,7 +394,6 @@ static std::vector get_cmd_params_instances_int(const cmd_p /* .n_gpu_layers = */ nl, /* .main_gpu = */ mg, /* .mul_mat_q = */ mmq, - /* .low_vram = */ lv, /* .tensor_split = */ ts, }; instances.push_back(instance); @@ -404,6 +404,56 @@ static std::vector get_cmd_params_instances_int(const cmd_p static std::vector get_cmd_params_instances(const cmd_params & params) { std::vector instances; +#if 1 + // this ordering minimizes the number of times that each model needs to be reloaded + for (const auto & m : params.model) + for (const auto & nl : params.n_gpu_layers) + for (const auto & mg : params.main_gpu) + for (const auto & ts : params.tensor_split) + for (const auto & nb : params.n_batch) + for (const auto & fk : params.f32_kv) + for (const auto & mmq : params.mul_mat_q) + for (const auto & nt : params.n_threads) { + for (const auto & n_prompt : params.n_prompt) { + if (n_prompt == 0) { + continue; + } + cmd_params_instance instance = { + /* .model = */ m, + /* .n_prompt = */ n_prompt, + /* .n_gen = */ 0, + /* .n_batch = */ nb, + /* .f32_kv = */ fk, + /* .n_threads = */ nt, + /* .n_gpu_layers = */ nl, + /* .main_gpu = */ mg, + /* .mul_mat_q = */ mmq, + /* .tensor_split = */ ts, + }; + instances.push_back(instance); + } + + for (const auto & n_gen : params.n_gen) { + if (n_gen == 0) { + continue; + } + cmd_params_instance instance = { + /* .model = */ m, + /* .n_prompt = */ 0, + /* .n_gen = */ n_gen, + /* .n_batch = */ nb, + /* .f32_kv = */ fk, + /* .n_threads = */ nt, + /* .n_gpu_layers = */ nl, + /* .main_gpu = */ mg, + /* .mul_mat_q = */ mmq, + /* .tensor_split = */ ts, + }; + instances.push_back(instance); + } + } +#else + // this ordering separates the prompt and generation tests for (const auto & n_prompt : params.n_prompt) { if (n_prompt == 0) { continue; @@ -419,6 +469,7 @@ static std::vector get_cmd_params_instances(const cmd_param auto instances_gen = get_cmd_params_instances_int(params, n_gen, 0); instances.insert(instances.end(), instances_gen.begin(), instances_gen.end()); } +#endif return instances; } @@ -443,7 +494,6 @@ struct test { int n_gpu_layers; int main_gpu; bool mul_mat_q; - bool low_vram; std::array tensor_split; int n_prompt; int n_gen; @@ -463,7 +513,6 @@ struct test { n_gpu_layers = inst.n_gpu_layers; main_gpu = inst.main_gpu; mul_mat_q = inst.mul_mat_q; - low_vram = inst.low_vram; tensor_split = inst.tensor_split; n_prompt = inst.n_prompt; n_gen = inst.n_gen; @@ -524,7 +573,7 @@ struct test { "cpu_info", "gpu_info", "model_filename", "model_type", "model_size", "model_n_params", "n_batch", "n_threads", "f16_kv", - "n_gpu_layers", "main_gpu", "mul_mat_q", "low_vram", "tensor_split", + "n_gpu_layers", "main_gpu", "mul_mat_q", "tensor_split", "n_prompt", "n_gen", "test_time", "avg_ns", "stddev_ns", "avg_ts", "stddev_ts" @@ -543,7 +592,7 @@ struct test { return INT; } if (field == "cuda" || field == "opencl" || field == "metal" || field == "gpu_blas" || field == "blas" || - field == "f16_kv" || field == "mul_mat_q" || field == "low_vram") { + field == "f16_kv" || field == "mul_mat_q") { return BOOL; } if (field == "avg_ts" || field == "stddev_ts") { @@ -574,7 +623,7 @@ struct test { cpu_info, gpu_info, model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params), std::to_string(n_batch), std::to_string(n_threads), std::to_string(!f32_kv), - std::to_string(n_gpu_layers), std::to_string(main_gpu), std::to_string(mul_mat_q), std::to_string(low_vram), tensor_split_str, + std::to_string(n_gpu_layers), std::to_string(main_gpu), std::to_string(mul_mat_q), tensor_split_str, std::to_string(n_prompt), std::to_string(n_gen), test_time, std::to_string(avg_ns()), std::to_string(stdev_ns()), std::to_string(avg_ts()), std::to_string(stdev_ts()) @@ -606,9 +655,9 @@ struct printer { virtual ~printer() {} FILE * fout; - virtual void print_header(const cmd_params & params) { (void) params; }; + virtual void print_header(const cmd_params & params) { (void) params; } virtual void print_test(const test & t) = 0; - virtual void print_footer() { }; + virtual void print_footer() { } }; struct csv_printer : public printer { @@ -766,9 +815,6 @@ struct markdown_printer : public printer { if (params.mul_mat_q.size() > 1 || params.mul_mat_q != cmd_params_defaults.mul_mat_q) { fields.push_back("mul_mat_q"); } - if (params.low_vram.size() > 1 || params.low_vram != cmd_params_defaults.low_vram) { - fields.push_back("low_vram"); - } if (params.tensor_split.size() > 1 || params.tensor_split != cmd_params_defaults.tensor_split) { fields.push_back("tensor_split"); } @@ -889,21 +935,27 @@ struct sql_printer : public printer { static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_batch, int n_threads) { std::vector tokens(n_batch, llama_token_bos(ctx)); int n_processed = 0; + + llama_set_n_threads(ctx, n_threads, n_threads); + while (n_processed < n_prompt) { int n_tokens = std::min(n_prompt - n_processed, n_batch); - llama_eval(ctx, tokens.data(), n_tokens, n_past + n_processed, n_threads); + llama_decode(ctx, llama_batch_get_one(tokens.data(), n_tokens, n_past + n_processed, 0)); n_processed += n_tokens; } } static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads) { llama_token token = llama_token_bos(ctx); + + llama_set_n_threads(ctx, n_threads, n_threads); + for (int i = 0; i < n_gen; i++) { - llama_eval(ctx, &token, 1, n_past + i, n_threads); + llama_decode(ctx, llama_batch_get_one(&token, 1, n_past + i, 0)); } } -static void llama_null_log_callback(enum llama_log_level level, const char * text, void * user_data) { +static void llama_null_log_callback(enum ggml_log_level level, const char * text, void * user_data) { (void) level; (void) text; (void) user_data; @@ -958,17 +1010,25 @@ int main(int argc, char ** argv) { std::vector params_instances = get_cmd_params_instances(params); + llama_model * lmodel = nullptr; + const cmd_params_instance * prev_inst = nullptr; + for (const auto & inst : params_instances) { - // TODO: keep the model between tests when possible - llama_context_params lparams = inst.to_llama_params(); + // keep the same model between tests when possible + if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) { + if (lmodel) { + llama_free_model(lmodel); + } - llama_model * lmodel = llama_load_model_from_file(inst.model.c_str(), lparams); - if (lmodel == NULL) { - fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, inst.model.c_str()); - return 1; + lmodel = llama_load_model_from_file(inst.model.c_str(), inst.to_llama_mparams()); + if (lmodel == NULL) { + fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, inst.model.c_str()); + return 1; + } + prev_inst = &inst; } - llama_context * ctx = llama_new_context_with_model(lmodel, lparams); + llama_context * ctx = llama_new_context_with_model(lmodel, inst.to_llama_cparams()); if (ctx == NULL) { fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, inst.model.c_str()); llama_free_model(lmodel); @@ -977,6 +1037,8 @@ int main(int argc, char ** argv) { test t(inst, lmodel, ctx); + llama_kv_cache_tokens_rm(ctx, -1, -1); + // warmup run if (t.n_prompt > 0) { test_prompt(ctx, std::min(2, t.n_batch), 0, t.n_batch, t.n_threads); @@ -986,6 +1048,8 @@ int main(int argc, char ** argv) { } for (int i = 0; i < params.reps; i++) { + llama_kv_cache_tokens_rm(ctx, -1, -1); + uint64_t t_start = get_time_ns(); if (t.n_prompt > 0) { test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads); @@ -1002,9 +1066,10 @@ int main(int argc, char ** argv) { llama_print_timings(ctx); llama_free(ctx); - llama_free_model(lmodel); } + llama_free_model(lmodel); + p->print_footer(); llama_backend_free(); diff --git a/examples/main/README.md b/examples/main/README.md index 26e1e28dd08c179a8726ee8eff8b8d61ac56af8d..a9561c383c0cba7873808626cc4114e25dc1865d 100644 --- a/examples/main/README.md +++ b/examples/main/README.md @@ -262,7 +262,8 @@ These options help improve the performance and memory usage of the LLaMA models. ### Number of Threads -- `-t N, --threads N`: Set the number of threads to use during computation. For optimal performance, it is recommended to set this value to the number of physical CPU cores your system has (as opposed to the logical number of cores). Using the correct number of threads can greatly improve performance. +- `-t N, --threads N`: Set the number of threads to use during generation. For optimal performance, it is recommended to set this value to the number of physical CPU cores your system has (as opposed to the logical number of cores). Using the correct number of threads can greatly improve performance. +- `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. In some systems, it is beneficial to use a higher number of threads during batch processing than during generation. If not specified, the number of threads used for batch processing will be the same as the number of threads used for generation. ### Mlock @@ -305,6 +306,5 @@ These options provide extra functionality and customization when running the LLa - `-ngl N, --n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance. - `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS. - `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS. -- `-lv, --low-vram`: Do not allocate a VRAM scratch buffer for holding temporary results. Reduces VRAM usage at the cost of performance, particularly prompt processing speed. Requires cuBLAS. - `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains. - `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation. diff --git a/examples/main/main.cpp b/examples/main/main.cpp index d78112260de08ef5d5cd613114120b057c3bd5f0..3a4ed3f7814f8ef5de86853f7a33cda4bfcae8dc 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -124,7 +124,7 @@ int main(int argc, char ** argv) { console::init(params.simple_io, params.use_color); atexit([]() { console::cleanup(); }); - if (params.perplexity) { + if (params.logits_all) { printf("\n************\n"); printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__); printf("************\n\n"); @@ -140,12 +140,17 @@ int main(int argc, char ** argv) { return 0; } - if (params.rope_freq_base != 10000.0) { - LOG_TEE("%s: warning: changing RoPE frequency base to %g (default 10000.0)\n", __func__, params.rope_freq_base); + if (params.n_ctx != 0 && params.n_ctx < 8) { + LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__); + params.n_ctx = 8; + } + + if (params.rope_freq_base != 0.0) { + LOG_TEE("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base); } - if (params.rope_freq_scale != 1.0) { - LOG_TEE("%s: warning: scaling RoPE frequency by %g (default 1.0)\n", __func__, params.rope_freq_scale); + if (params.rope_freq_scale != 0.0) { + LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale); } LOG_TEE("%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); @@ -184,29 +189,19 @@ int main(int argc, char ** argv) { return 1; } - const int n_ctx_train = llama_n_ctx_train(ctx); - if (params.n_ctx > n_ctx_train) { + const int n_ctx_train = llama_n_ctx_train(model); + const int n_ctx = llama_n_ctx(ctx); + LOG("n_ctx: %d\n", n_ctx); + + if (n_ctx > n_ctx_train) { LOG_TEE("%s: warning: model was trained on only %d context tokens (%d specified)\n", - __func__, n_ctx_train, params.n_ctx); - } else if (params.n_ctx < 8) { - LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__); - params.n_ctx = 8; + __func__, n_ctx_train, n_ctx); } // print system information { LOG_TEE("\n"); - LOG_TEE("system_info: n_threads = %d / %d | %s\n", - params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info()); - } - - // export the cgraph and exit - if (params.export_cgraph) { - llama_eval_export(ctx, "llama.ggml"); - llama_free(ctx); - llama_free_model(model); - - return 0; + LOG_TEE("%s\n", get_system_info(params).c_str()); } std::string path_session = params.path_prompt_cache; @@ -220,7 +215,7 @@ int main(int argc, char ** argv) { if (fp != NULL) { std::fclose(fp); - session_tokens.resize(params.n_ctx); + session_tokens.resize(n_ctx); size_t n_token_count_out = 0; if (!llama_load_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) { LOG_TEE("%s: error: failed to load session file '%s'\n", __func__, path_session.c_str()); @@ -235,7 +230,7 @@ int main(int argc, char ** argv) { } } - const bool add_bos = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM; + const bool add_bos = llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM; LOG("add_bos: %d\n", add_bos); std::vector embd_inp; @@ -276,9 +271,6 @@ int main(int argc, char ** argv) { LOG("guidance_offset: %s", log_tostr(guidance_offset)); } - const int n_ctx = llama_n_ctx(ctx); - LOG("n_ctx: %d\n", n_ctx); - if ((int) embd_inp.size() > n_ctx - 4) { LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4); return 1; @@ -475,7 +467,7 @@ int main(int argc, char ** argv) { std::vector embd; std::vector embd_guidance; - const int n_vocab = llama_n_vocab(ctx); + const int n_vocab = llama_n_vocab(model); std::vector candidates; candidates.reserve(n_vocab); @@ -508,17 +500,22 @@ int main(int argc, char ** argv) { break; } - const int n_left = n_past - params.n_keep; - LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d\n", n_past, n_left, n_ctx, params.n_keep); + const int n_left = n_past - params.n_keep - 1; + const int n_discard = n_left/2; - // always keep the first token - BOS - n_past = std::max(1, params.n_keep); - n_past_guidance = std::max(1, params.n_keep + guidance_offset); + LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", + n_past, n_left, n_ctx, params.n_keep, n_discard); - LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance); + llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1); + llama_kv_cache_seq_shift(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard); + + n_past -= n_discard; - // insert n_left/2 tokens at the start of embd from last_tokens - embd.insert(embd.begin(), last_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_tokens.end() - embd.size()); + if (ctx_guidance) { + n_past_guidance -= n_discard; + } + + LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance); LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd)); @@ -580,7 +577,7 @@ int main(int argc, char ** argv) { for (int i = 0; i < input_size; i += params.n_batch) { int n_eval = std::min(input_size - i, params.n_batch); - if (llama_eval(ctx_guidance, input_buf + i, n_eval, n_past_guidance, params.n_threads)) { + if (llama_decode(ctx_guidance, llama_batch_get_one(input_buf + i, n_eval, n_past_guidance, 0))) { LOG_TEE("%s : failed to eval\n", __func__); return 1; } @@ -597,7 +594,7 @@ int main(int argc, char ** argv) { LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd)); - if (llama_eval(ctx, &embd[i], n_eval, n_past, params.n_threads)) { + if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) { LOG_TEE("%s : failed to eval\n", __func__); return 1; } @@ -855,7 +852,7 @@ int main(int argc, char ** argv) { llama_backend_free(); #ifndef LOG_DISABLE_LOGS - LOG_TEE("Log end\n") + LOG_TEE("Log end\n"); #endif // LOG_DISABLE_LOGS return 0; diff --git a/examples/make-ggml.py b/examples/make-ggml.py index 6a34eeac53faaa149d288582435d3b82d3df7280..c73485ebf1eff956e3dd4d0fb29cbd4dfdce1bf1 100644 --- a/examples/make-ggml.py +++ b/examples/make-ggml.py @@ -1,22 +1,25 @@ #!/usr/bin/env python3 """ -This script converts Hugging Face llama models to GGML and quantizes them. +This script converts Hugging Face Llama, StarCoder, Falcon, Baichuan, and GPT-NeoX models to GGUF and quantizes them. Usage: -python make-ggml.py --model {model_dir_or_hf_repo_name} [--outname {output_name} (Optional)] [--outdir {output_directory} (Optional)] [--quants {quant_types} (Optional)] [--keep_fp16 (Optional)] +python make-ggml.py {model_dir_or_hf_repo_name} --model_type {model_type} [--outname {output_name} (Optional)] [--outdir {output_directory} (Optional)] [--quants {quant_types} (Optional)] [--keep_fp16 (Optional)] Arguments: -- --model: (Required) The directory of the downloaded Hugging Face model or the name of the Hugging Face model repository. If the model directory does not exist, it will be downloaded from the Hugging Face model hub. +- model: (Required) The directory of the downloaded Hugging Face model or the name of the Hugging Face model repository. If the model directory does not exist, it will be downloaded from the Hugging Face model hub. +- --model_type: (Required) The type of the model to be converted. Choose from llama, starcoder, falcon, baichuan, or gptneox. - --outname: (Optional) The name of the output model. If not specified, the last part of the model directory path or the Hugging Face model repo name will be used. - --outdir: (Optional) The directory where the output model(s) will be stored. If not specified, '../models/{outname}' will be used. - --quants: (Optional) The types of quantization to apply. This should be a space-separated list. The default is 'Q4_K_M Q5_K_S'. - --keep_fp16: (Optional) If specified, the FP16 model will not be deleted after the quantized models are created. -Quant types: +Old quant types (some base model types require these): - Q4_0: small, very high quality loss - legacy, prefer using Q3_K_M - Q4_1: small, substantial quality loss - legacy, prefer using Q3_K_L - Q5_0: medium, balanced quality - legacy, prefer using Q4_K_M - Q5_1: medium, low quality loss - legacy, prefer using Q5_K_M + +New quant types (recommended): - Q2_K: smallest, extreme quality loss - not recommended - Q3_K: alias for Q3_K_M - Q3_K_S: very small, very high quality loss @@ -40,9 +43,7 @@ import argparse import os from huggingface_hub import snapshot_download -def main(model, outname, outdir, quants, keep_fp16): - ggml_version = "v3" - +def main(model, model_type, outname, outdir, quants, keep_fp16): if not os.path.isdir(model): print(f"Model not found at {model}. Downloading...") try: @@ -63,17 +64,20 @@ def main(model, outname, outdir, quants, keep_fp16): print("Building llama.cpp") subprocess.run(f"cd .. && make quantize", shell=True, check=True) - fp16 = f"{outdir}/{outname}.ggml{ggml_version}.fp16.bin" + fp16 = f"{outdir}/{outname}.gguf.fp16.bin" - print(f"Making unquantised GGML at {fp16}") + print(f"Making unquantised GGUF at {fp16}") if not os.path.isfile(fp16): - subprocess.run(f"python3 ../convert.py {model} --outtype f16 --outfile {fp16}", shell=True, check=True) + if model_type != "llama": + subprocess.run(f"python3 ../convert-{model_type}-hf-to-gguf.py {model} 1 --outfile {fp16}", shell=True, check=True) + else: + subprocess.run(f"python3 ../convert.py {model} --outtype f16 --outfile {fp16}", shell=True, check=True) else: print(f"Unquantised GGML already exists at: {fp16}") print("Making quants") for type in quants: - outfile = f"{outdir}/{outname}.ggml{ggml_version}.{type}.bin" + outfile = f"{outdir}/{outname}.gguf.{type}.bin" print(f"Making {type} : {outfile}") subprocess.run(f"../quantize {fp16} {outfile} {type}", shell=True, check=True) @@ -81,8 +85,9 @@ def main(model, outname, outdir, quants, keep_fp16): os.remove(fp16) if __name__ == "__main__": - parser = argparse.ArgumentParser(description='Convert/Quantize HF to GGML. If you have the HF model downloaded already, pass the path to the model dir. Otherwise, pass the Hugging Face model repo name. You need to be in the /examples folder for it to work.') - parser.add_argument('--model', required=True, help='Downloaded model dir or Hugging Face model repo name') + parser = argparse.ArgumentParser(description='Convert/Quantize HF models to GGUF. If you have the HF model downloaded already, pass the path to the model dir. Otherwise, pass the Hugging Face model repo name. You need to be in the /examples folder for it to work.') + parser.add_argument('model', help='Downloaded model dir or Hugging Face model repo name') + parser.add_argument('--model_type', required=True, choices=['llama', 'starcoder', 'falcon', 'baichuan', 'gptneox'], help='Type of the model to be converted. Choose from llama, starcoder, falcon, baichuan, or gptneox.') parser.add_argument('--outname', default=None, help='Output model(s) name') parser.add_argument('--outdir', default=None, help='Output directory') parser.add_argument('--quants', nargs='*', default=["Q4_K_M", "Q5_K_S"], help='Quant types') @@ -90,4 +95,4 @@ if __name__ == "__main__": args = parser.parse_args() - main(args.model, args.outname, args.outdir, args.quants, args.keep_fp16) + main(args.model, args.model_type, args.outname, args.outdir, args.quants, args.keep_fp16) diff --git a/examples/parallel/CMakeLists.txt b/examples/parallel/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..0bbf89eaefce6f28db83af04307353358ff6b69e --- /dev/null +++ b/examples/parallel/CMakeLists.txt @@ -0,0 +1,8 @@ +set(TARGET parallel) +add_executable(${TARGET} parallel.cpp) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) +if(TARGET BUILD_INFO) + add_dependencies(${TARGET} BUILD_INFO) +endif() diff --git a/examples/parallel/README.md b/examples/parallel/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4d0fe5cef12fa7e2d28e18571ba94d290e47999b --- /dev/null +++ b/examples/parallel/README.md @@ -0,0 +1,3 @@ +# llama.cpp/example/parallel + +Simplified simluation for serving incoming requests in parallel diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0434ded234b183cbd22fb23089e403664dc91518 --- /dev/null +++ b/examples/parallel/parallel.cpp @@ -0,0 +1,380 @@ +// A basic application simulating a server with multiple clients. +// The clients submite requests to the server and they are processed in parallel. + +#include "build-info.h" + +#include "common.h" +#include "llama.h" + +#include +#include +#include +#include + +// trim whitespace from the beginning and end of a string +static std::string trim(const std::string & str) { + size_t start = 0; + size_t end = str.size(); + + while (start < end && isspace(str[start])) { + start += 1; + } + + while (end > start && isspace(str[end - 1])) { + end -= 1; + } + + return str.substr(start, end - start); +} + +static std::string k_system = +R"(Transcript of a never ending dialog, where the User interacts with an Assistant. +The Assistant is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision. + +User: Recommend a nice restaurant in the area. +Assistant: I recommend the restaurant "The Golden Duck". It is a 5 star restaurant with a great view of the city. The food is delicious and the service is excellent. The prices are reasonable and the portions are generous. The restaurant is located at 123 Main Street, New York, NY 10001. The phone number is (212) 555-1234. The hours are Monday through Friday from 11:00 am to 10:00 pm. The restaurant is closed on Saturdays and Sundays. +User: Who is Richard Feynman? +Assistant: Richard Feynman was an American physicist who is best known for his work in quantum mechanics and particle physics. He was awarded the Nobel Prize in Physics in 1965 for his contributions to the development of quantum electrodynamics. He was a popular lecturer and author, and he wrote several books, including "Surely You're Joking, Mr. Feynman!" and "What Do You Care What Other People Think?". +User:)"; + +static std::vector k_prompts = { + "What is the meaning of life?", + "Tell me an interesting fact about llamas.", + "What is the best way to cook a steak?", + "Are you familiar with the Special Theory of Relativity and can you explain it to me?", + "Recommend some interesting books to read.", + "What is the best way to learn a new language?", + "How to get a job at Google?", + "If you could have any superpower, what would it be?", + "I want to learn how to play the piano.", +}; + +struct client { + int32_t id = 0; + + llama_seq_id seq_id = -1; + + llama_token sampled; + + int64_t t_start_prompt; + int64_t t_start_gen; + + int32_t n_prompt = 0; + int32_t n_decoded = 0; + int32_t i_batch = -1; + + std::string input; + std::string prompt; + std::string response; + + std::vector tokens_prev; +}; + +int main(int argc, char ** argv) { + srand(1234); + + gpt_params params; + + if (gpt_params_parse(argc, argv, params) == false) { + return 1; + } + + // number of simultaneous "clients" to simulate + const int32_t n_clients = params.n_parallel; + + // requests to simulate + const int32_t n_seq = params.n_sequences; + + // insert new requests as soon as the previous one is done + const bool cont_batching = params.cont_batching; + +#ifndef LOG_DISABLE_LOGS + log_set_target(log_filename_generator("parallel", "log")); + LOG_TEE("Log start\n"); + log_dump_cmdline(argc, argv); +#endif // LOG_DISABLE_LOGS + + // init llama.cpp + llama_backend_init(params.numa); + + llama_model * model = NULL; + llama_context * ctx = NULL; + + // load the target model + params.logits_all = true; + std::tie(model, ctx) = llama_init_from_gpt_params(params); + + fprintf(stderr, "\n\n"); + fflush(stderr); + + const int n_ctx = llama_n_ctx(ctx); + const int n_vocab = llama_n_vocab(model); + + std::vector clients(n_clients); + for (size_t i = 0; i < clients.size(); ++i) { + auto & client = clients[i]; + client.id = i; + client.tokens_prev.resize(std::max(256, params.n_predict)); + std::fill(client.tokens_prev.begin(), client.tokens_prev.end(), 0); + } + + std::vector candidates; + candidates.reserve(n_vocab); + + std::vector tokens_system; + tokens_system = ::llama_tokenize(ctx, k_system, true); + const int32_t n_tokens_system = tokens_system.size(); + + llama_seq_id g_seq_id = 0; + + // the max batch size is as large as the context to handle cases where we get very long input prompt from multiple + // users. regardless of the size, the main loop will chunk the batch into a maximum of params.n_batch tokens at a time + llama_batch batch = llama_batch_init(params.n_ctx, 0); + + int32_t n_total_prompt = 0; + int32_t n_total_gen = 0; + int32_t n_cache_miss = 0; + + const auto t_main_start = ggml_time_us(); + + LOG_TEE("%s: Simulating parallel requests from clients:\n", __func__); + LOG_TEE("%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system); + LOG_TEE("\n"); + + { + LOG_TEE("%s: Evaluating the system prompt ...\n", __func__); + + batch.n_tokens = n_tokens_system; + + for (int32_t i = 0; i < batch.n_tokens; ++i) { + batch.token[i] = tokens_system[i]; + batch.pos[i] = i; + batch.seq_id[i] = 0; + batch.logits[i] = false; + } + + if (llama_decode(ctx, batch) != 0) { + LOG_TEE("%s: llama_decode() failed\n", __func__); + return 1; + } + + // assign the system KV cache to all parallel sequences + for (int32_t i = 1; i < n_clients; ++i) { + llama_kv_cache_seq_cp(ctx, 0, i, 0, n_tokens_system); + } + + LOG_TEE("\n"); + } + + LOG_TEE("Processing requests ...\n\n"); + + while (true) { + batch.n_tokens = 0; + + // decode any currently ongoing sequences + for (auto & client : clients) { + if (client.seq_id == -1) { + continue; + } + + batch.token [batch.n_tokens] = client.sampled; + batch.pos [batch.n_tokens] = n_tokens_system + client.n_prompt + client.n_decoded; + batch.seq_id[batch.n_tokens] = client.id; + batch.logits[batch.n_tokens] = true; + + client.n_decoded += 1; + client.i_batch = batch.n_tokens; + + batch.n_tokens += 1; + } + + if (batch.n_tokens == 0) { + // all sequences have ended - clear the entire KV cache + for (int i = 0; i < n_clients; ++i) { + llama_kv_cache_seq_rm(ctx, i, n_tokens_system, -1); + } + + LOG_TEE("%s: clearing the KV cache\n", __func__); + } + + // insert new sequences for decoding + if (cont_batching || batch.n_tokens == 0) { + for (auto & client : clients) { + if (client.seq_id == -1 && g_seq_id < n_seq) { + client.seq_id = g_seq_id; + + client.t_start_prompt = ggml_time_us(); + client.t_start_gen = 0; + + client.input = k_prompts[rand() % k_prompts.size()]; + client.prompt = client.input + "\nAssistant:"; + client.response = ""; + + std::fill(client.tokens_prev.begin(), client.tokens_prev.end(), 0); + + // do not prepend BOS because we have a system prompt! + std::vector tokens_prompt; + tokens_prompt = ::llama_tokenize(ctx, client.prompt, false); + + for (size_t i = 0; i < tokens_prompt.size(); ++i) { + batch.token [batch.n_tokens] = tokens_prompt[i]; + batch.pos [batch.n_tokens] = i + n_tokens_system; + batch.seq_id[batch.n_tokens] = client.id; + batch.logits[batch.n_tokens] = false; + batch.n_tokens += 1; + } + + // extract the logits only for the last token + if (batch.n_tokens > 0) { + batch.logits[batch.n_tokens - 1] = true; + } + + client.n_prompt = tokens_prompt.size(); + client.n_decoded = 0; + client.i_batch = batch.n_tokens - 1; + + LOG_TEE("\033[1mClient %3d, seq %4d, started decoding ...\033[0m\n", client.id, client.seq_id); + + g_seq_id += 1; + + // insert new requests one-by-one + //if (cont_batching) { + // break; + //} + } + } + } + + if (batch.n_tokens == 0) { + break; + } + + // process in chunks of params.n_batch + int32_t n_batch = params.n_batch; + + for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) { + // experiment: process in powers of 2 + //if (i + n_batch > (int32_t) batch.n_tokens && n_batch > 32) { + // n_batch /= 2; + // i -= n_batch; + // continue; + //} + + const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i)); + + llama_batch batch_view = { + n_tokens, + batch.token + i, + nullptr, + batch.pos + i, + batch.seq_id + i, + batch.logits + i, + 0, 0, 0, // unused + }; + + const int ret = llama_decode(ctx, batch_view); + if (ret != 0) { + if (n_batch == 1 || ret < 0) { + // if you get here, it means the KV cache is full - try increasing it via the context size + LOG_TEE("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret); + return 1; + } + + LOG("%s : failed to decode the batch, retrying with n_batch = %d\n", __func__, n_batch / 2); + + n_cache_miss += 1; + + // retry with half the batch size to try to find a free slot in the KV cache + n_batch /= 2; + i -= n_batch; + + continue; + } + + LOG("%s : decoded batch of %d tokens\n", __func__, n_tokens); + + for (auto & client : clients) { + if (client.i_batch < (int) i || client.i_batch >= (int) (i + n_tokens)) { + continue; + } + + //printf("client %d, seq %d, token %d, pos %d, batch %d\n", + // client.id, client.seq_id, client.sampled, client.n_decoded, client.i_batch); + + const llama_token id = llama_sample_token(ctx, NULL, NULL, params, client.tokens_prev, candidates, client.i_batch - i); + + if (client.n_decoded == 1) { + // start measuring generation time after the first token to make sure all concurrent clients + // have their prompt already processed + client.t_start_gen = ggml_time_us(); + } + + // remember which tokens were sampled - used for repetition penalties during sampling + client.tokens_prev.erase(client.tokens_prev.begin()); + client.tokens_prev.push_back(id); + + const std::string token_str = llama_token_to_piece(ctx, id); + client.response += token_str; + client.sampled = id; + + //printf("client %d, seq %d, token %d, pos %d, batch %d: %s\n", + // client.id, client.seq_id, id, client.n_decoded, client.i_batch, token_str.c_str()); + + if (client.n_decoded > 2 && + (id == llama_token_eos(ctx) || + (params.n_predict > 0 && client.n_decoded + client.n_prompt >= params.n_predict) || + client.response.find("User:") != std::string::npos || + client.response.find('\n') != std::string::npos)) { + // basic reverse prompt + const size_t pos = client.response.find("User:"); + if (pos != std::string::npos) { + client.response = client.response.substr(0, pos); + } + + // delete only the generated part of the sequence, i.e. keep the system prompt in the cache + llama_kv_cache_seq_rm(ctx, client.id, n_tokens_system, n_ctx); + + const auto t_main_end = ggml_time_us(); + + LOG_TEE("\033[1mClient %3d, seq %4d, prompt %4d t, response %4d t, time %5.2f s, speed %5.2f t/s, cache miss %d \033[0m \n\nInput: %s\nResponse: %s\n\n", + client.id, client.seq_id, client.n_prompt, client.n_decoded, + (t_main_end - client.t_start_prompt) / 1e6, + (double) (client.n_prompt + client.n_decoded) / (t_main_end - client.t_start_prompt) * 1e6, + n_cache_miss, + ::trim(client.input).c_str(), + ::trim(client.response).c_str()); + + n_total_prompt += client.n_prompt; + n_total_gen += client.n_decoded; + + client.seq_id = -1; + } + + client.i_batch = -1; + } + } + } + + const auto t_main_end = ggml_time_us(); + + LOG_TEE("\n\n"); + LOG_TEE("Total prompt tokens: %6d, speed: %5.2f t/s\n", n_total_prompt, (double) (n_total_prompt ) / (t_main_end - t_main_start) * 1e6); + LOG_TEE("Total gen tokens: %6d, speed: %5.2f t/s\n", n_total_gen, (double) (n_total_gen ) / (t_main_end - t_main_start) * 1e6); + LOG_TEE("Total speed (AVG): %6s speed: %5.2f t/s\n", "", (double) (n_total_prompt + n_total_gen) / (t_main_end - t_main_start) * 1e6); + LOG_TEE("Cache misses: %6d\n", n_cache_miss); + + LOG_TEE("\n\n"); + + llama_print_timings(ctx); + + llama_batch_free(batch); + + llama_free(ctx); + llama_free_model(model); + + llama_backend_free(); + + fprintf(stderr, "\n\n"); + + return 0; +} diff --git a/examples/perplexity/README.md b/examples/perplexity/README.md index eacfb17c67fb2e8a476d9bc58b84a34c9942b24c..50e1af0111dd64f915fc7be75bdd92258c2084dc 100644 --- a/examples/perplexity/README.md +++ b/examples/perplexity/README.md @@ -1,3 +1,21 @@ # perplexity TODO + +## Llama 2 70B Scorechart +Quantization | Model size (GiB) | Perplexity | Delta to fp16 +-- | -- | -- | -- +Q4_0 | 36.20 | 3.5550 | 3.61% +Q4_1 | 40.20 | 3.5125 | 2.37% +Q5_0 | 44.20 | 3.4744 | 1.26% +Q2_K | 27.27 | 3.7339 | 8.82% +Q3_K_S | 27.86 | 3.7019 | 7.89% +Q3_K_M | 30.83 | 3.5932 | 4.72% +Q3_K_L | 33.67 | 3.5617 | 3.80% +Q4_K_S | 36.39 | 3.4852 | 1.57% +Q4_K_M | 38.54 | 3.4725 | 1.20% +Q5_K_S | 44.20 | 3.4483 | 0.50% +Q5_K_M | 45.41 | 3.4451 | 0.40% +Q6_K | 52.70 | 3.4367 | 0.16% +fp16 | 128.5 | 3.4313 | - + diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 2b375e34e7234685f70240a4c889dfc9dcf6cad7..7d0038bd4075735d73bbf7b19b997fdb45dea940 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -80,7 +80,9 @@ static void write_logfile( static std::vector softmax(const std::vector& logits) { std::vector probs(logits.size()); float max_logit = logits[0]; - for (float v : logits) max_logit = std::max(max_logit, v); + for (float v : logits) { + max_logit = std::max(max_logit, v); + } double sum_exp = 0.0; for (size_t i = 0; i < logits.size(); i++) { // Subtract the maximum logit value from the current logit value for numerical stability @@ -89,15 +91,21 @@ static std::vector softmax(const std::vector& logits) { sum_exp += exp_logit; probs[i] = exp_logit; } - for (size_t i = 0; i < probs.size(); i++) probs[i] /= sum_exp; + for (size_t i = 0; i < probs.size(); i++) { + probs[i] /= sum_exp; + } return probs; } static results_log_softmax log_softmax(int n_vocab, const float * logits, int tok) { float max_logit = logits[0]; - for (int i = 1; i < n_vocab; ++i) max_logit = std::max(max_logit, logits[i]); + for (int i = 1; i < n_vocab; ++i) { + max_logit = std::max(max_logit, logits[i]); + } double sum_exp = 0.0; - for (int i = 0; i < n_vocab; ++i) sum_exp += expf(logits[i] - max_logit); + for (int i = 0; i < n_vocab; ++i) { + sum_exp += expf(logits[i] - max_logit); + } return {logits[tok] - max_logit - log(sum_exp), logits[tok], expf(logits[tok] - max_logit) / (float) sum_exp}; } @@ -108,7 +116,8 @@ static void process_logits( std::mutex mutex; int counter = 0; auto compute = [&mutex, &counter, &nll, &nll2, logit_history, prob_history, n_vocab, logits, tokens, n_token] () { - double local_nll = 0, local_nll2 = 0; + double local_nll = 0; + double local_nll2 = 0; while (true) { std::unique_lock lock(mutex); int i = counter++; @@ -126,10 +135,13 @@ static void process_logits( prob_history[i] = results.prob; } }; - for (auto & w : workers) w = std::thread(compute); + for (auto & w : workers) { + w = std::thread(compute); + } compute(); - for (auto & w : workers) w.join(); - + for (auto & w : workers) { + w.join(); + } } static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & params) { @@ -138,22 +150,24 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & // Output: `perplexity: 13.5106 [114/114]` // BOS tokens will be added for each chunk before eval - const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM; + const bool is_spm = llama_vocab_type(llama_get_model(ctx)) == LLAMA_VOCAB_TYPE_SPM; const bool add_bos = is_spm; fprintf(stderr, "%s: tokenizing the input ..\n", __func__); std::vector tokens = ::llama_tokenize(ctx, params.prompt, add_bos); - if (int(tokens.size()) < 2*params.n_ctx) { - fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*params.n_ctx, - params.n_ctx); + const int n_ctx = llama_n_ctx(ctx); + + if (int(tokens.size()) < 2*n_ctx) { + fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx, + n_ctx); fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size()); return {std::move(tokens), 0., {}, {}}; } - std::vector logit_history; - std::vector prob_history; + std::vector logit_history; + std::vector prob_history; logit_history.resize(tokens.size()); prob_history.resize(tokens.size()); @@ -163,20 +177,20 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & return {tokens, -1, logit_history, prob_history}; } - const int calc_chunk = params.n_ctx; + const int calc_chunk = n_ctx; fprintf(stderr, "%s: have %zu tokens. Calculation chunk = %d\n", __func__, tokens.size(), calc_chunk); if (int(tokens.size()) <= calc_chunk) { fprintf(stderr, "%s: there are only %zu tokens, this is not enough for a context size of %d and stride %d\n",__func__, - tokens.size(), params.n_ctx, params.ppl_stride); + tokens.size(), n_ctx, params.ppl_stride); return {tokens, -1, logit_history, prob_history}; } const int n_chunk_max = (tokens.size() - calc_chunk + params.ppl_stride - 1) / params.ppl_stride; const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max); - const int n_vocab = llama_n_vocab(ctx); + const int n_vocab = llama_n_vocab(llama_get_model(ctx)); const int n_batch = params.n_batch; int count = 0; @@ -195,12 +209,15 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & const auto t_start = std::chrono::high_resolution_clock::now(); + // clear the KV cache + llama_kv_cache_tokens_rm(ctx, -1, -1); + for (int j = 0; j < num_batches; ++j) { const int batch_start = start + j * n_batch; const int batch_size = std::min(end - batch_start, n_batch); //fprintf(stderr, " Batch %d: starts at %d, size is %d, n_past is %d\n",j,batch_start,batch_size,j * n_batch); - if (llama_eval(ctx, tokens.data() + batch_start, batch_size, j * n_batch, params.n_threads)) { + if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) { //fprintf(stderr, "%s : failed to eval\n", __func__); return {tokens, -1, logit_history, prob_history}; } @@ -235,7 +252,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & } //fprintf(stderr, "%s: using tokens %d...%d\n",__func__,params.n_ctx - params.ppl_stride + start, params.n_ctx + start); - for (int j = params.n_ctx - params.ppl_stride - 1; j < params.n_ctx - 1; ++j) { + for (int j = n_ctx - params.ppl_stride - 1; j < n_ctx - 1; ++j) { // Calculate probability of next token, given the previous ones. const std::vector tok_logits( @@ -272,8 +289,9 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par // Output: `perplexity: 13.5106 [114/114]` // BOS tokens will be added for each chunk before eval - const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM; + const bool is_spm = llama_vocab_type(llama_get_model(ctx)) == LLAMA_VOCAB_TYPE_SPM; const bool add_bos = is_spm; + const int n_ctx = llama_n_ctx(ctx); auto tim1 = std::chrono::high_resolution_clock::now(); fprintf(stderr, "%s: tokenizing the input ..\n", __func__); @@ -283,9 +301,9 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par auto tim2 = std::chrono::high_resolution_clock::now(); fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast(tim2-tim1).count()); - if (int(tokens.size()) < 2*params.n_ctx) { - fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*params.n_ctx, - params.n_ctx); + if (int(tokens.size()) < 2*n_ctx) { + fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx, + n_ctx); fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size()); return {std::move(tokens), 0., {}, {}}; } @@ -296,10 +314,10 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par std::vector prob_history; prob_history.resize(tokens.size()); - const int n_chunk_max = tokens.size() / params.n_ctx; + const int n_chunk_max = tokens.size() / n_ctx; const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max); - const int n_vocab = llama_n_vocab(ctx); + const int n_vocab = llama_n_vocab(llama_get_model(ctx)); const int n_batch = params.n_batch; int count = 0; @@ -311,15 +329,18 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par std::vector workers(std::thread::hardware_concurrency() - 1); for (int i = 0; i < n_chunk; ++i) { - const int start = i * params.n_ctx; - const int end = start + params.n_ctx; + const int start = i * n_ctx; + const int end = start + n_ctx; - const int num_batches = (params.n_ctx + n_batch - 1) / n_batch; + const int num_batches = (n_ctx + n_batch - 1) / n_batch; std::vector logits; const auto t_start = std::chrono::high_resolution_clock::now(); + // clear the KV cache + llama_kv_cache_tokens_rm(ctx, -1, -1); + for (int j = 0; j < num_batches; ++j) { const int batch_start = start + j * n_batch; const int batch_size = std::min(end - batch_start, n_batch); @@ -332,7 +353,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par tokens[batch_start] = llama_token_bos(ctx); } - if (llama_eval(ctx, tokens.data() + batch_start, batch_size, j * n_batch, params.n_threads)) { + if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) { fprintf(stderr, "%s : failed to eval\n", __func__); return {tokens, -1, logit_history, prob_history}; } @@ -340,7 +361,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par // restore the original token in case it was set to BOS tokens[batch_start] = token_org; - const auto batch_logits = llama_get_logits(ctx); + const auto * batch_logits = llama_get_logits(ctx); logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab); } @@ -369,10 +390,10 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par // Example, we have a context window of 512, we will compute perplexity for each of the // last 256 tokens. Then, we split the input up into context window size chunks to // process the entire prompt. - const int first = params.n_ctx/2; - process_logits(n_vocab, logits.data() + first*n_vocab, tokens.data() + start + first, params.n_ctx - 1 - first, + const int first = n_ctx/2; + process_logits(n_vocab, logits.data() + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first, workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first); - count += params.n_ctx - first - 1; + count += n_ctx - first - 1; // perplexity is e^(average negative log-likelihood) if (params.ppl_output_type == 0) { @@ -381,7 +402,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par double av = nll/count; double av2 = nll2/count - av*av; if (av2 > 0) av2 = sqrt(av2/(count-1)); - printf("%8d %.4lf %4lf %4lf\n", i*params.n_ctx, std::exp(nll / count), av, av2); + printf("%8d %.4lf %4lf %4lf\n", i*n_ctx, std::exp(nll / count), av, av2); } fflush(stdout); } @@ -402,7 +423,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par } static std::vector hellaswag_evaluate_tokens( - llama_context * ctx, const std::vector& tokens, int n_past, int n_batch, int n_vocab, int n_thread + llama_context * ctx, std::vector & tokens, int n_past, int n_batch, int n_vocab ) { std::vector result; result.reserve(tokens.size() * n_vocab); @@ -410,7 +431,7 @@ static std::vector hellaswag_evaluate_tokens( for (size_t i_chunk = 0; i_chunk < n_chunk; ++i_chunk) { size_t n_tokens = tokens.size() - i_chunk * n_batch; n_tokens = std::min(n_tokens, size_t(n_batch)); - if (llama_eval(ctx, tokens.data() + i_chunk * n_batch, n_tokens, n_past, n_thread)) { + if (llama_decode(ctx, llama_batch_get_one(tokens.data() + i_chunk * n_batch, n_tokens, n_past, 0))) { fprintf(stderr, "%s : failed to eval\n", __func__); return {}; } @@ -457,7 +478,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) { size_t hs_task_count = prompt_lines.size()/6; fprintf(stderr, "%s : loaded %zu tasks from prompt.\n", __func__, hs_task_count); - const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM; + const bool is_spm = llama_vocab_type(llama_get_model(ctx)) == LLAMA_VOCAB_TYPE_SPM; fprintf(stderr, "================================= is_spm = %d\n", is_spm); // This is needed as usual for LLaMA models @@ -512,7 +533,8 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) { printf("\ntask\tacc_norm\n"); double acc = 0.0f; - const int n_vocab = llama_n_vocab(ctx); + const int n_vocab = llama_n_vocab(llama_get_model(ctx)); + const int n_ctx = llama_n_ctx(ctx); std::vector> ending_tokens(4); @@ -540,7 +562,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) { auto query_size = query_embd.size(); // Stop if query wont fit the ctx window - if (query_size > (size_t)params.n_ctx) { + if (query_size > (size_t)n_ctx) { fprintf(stderr, "%s : number of tokens in query %zu > n_ctxl\n", __func__, query_size); return; } @@ -550,7 +572,10 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) { query_embd.resize(32); } - auto logits = hellaswag_evaluate_tokens(ctx, query_embd, 0, params.n_batch, n_vocab, params.n_threads); + // clear the KV cache + llama_kv_cache_tokens_rm(ctx, -1, -1); + + auto logits = hellaswag_evaluate_tokens(ctx, query_embd, 0, params.n_batch, n_vocab); if (logits.empty()) { fprintf(stderr, "%s : failed to eval\n", __func__); return; @@ -587,7 +612,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) { query_size = query_embd.size(); // Stop if query wont fit the ctx window - if (context_size + query_size > (size_t)params.n_ctx) { + if (context_size + query_size > (size_t)n_ctx) { fprintf(stderr, "%s : number of tokens in query %zu > n_ctxl\n", __func__, query_size); return; } @@ -599,7 +624,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) { //} // Evaluate the query - logits = hellaswag_evaluate_tokens(ctx, query_embd, context_size, params.n_batch, n_vocab, params.n_threads); + logits = hellaswag_evaluate_tokens(ctx, query_embd, context_size, params.n_batch, n_vocab); if (logits.empty()) { fprintf(stderr, "%s : failed to eval\n", __func__); return; @@ -661,7 +686,7 @@ int main(int argc, char ** argv) { return 1; } - params.perplexity = true; + params.logits_all = true; params.n_batch = std::min(params.n_batch, params.n_ctx); if (params.ppl_stride > 0) { @@ -695,7 +720,7 @@ int main(int argc, char ** argv) { return 1; } - const int n_ctx_train = llama_n_ctx_train(ctx); + const int n_ctx_train = llama_n_ctx_train(model); if (params.n_ctx > n_ctx_train) { fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, params.n_ctx); @@ -704,8 +729,7 @@ int main(int argc, char ** argv) { // print system information { fprintf(stderr, "\n"); - fprintf(stderr, "system_info: n_threads = %d / %d | %s\n", - params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info()); + fprintf(stderr, "%s\n", get_system_info(params).c_str()); } struct results_perplexity results; diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp index 94edb94d958d06d14e18436423a977f04b86b49f..dd76b1ceef134d2cdafe01c9c458a6bd32ee2abb 100644 --- a/examples/quantize-stats/quantize-stats.cpp +++ b/examples/quantize-stats/quantize-stats.cpp @@ -309,21 +309,22 @@ int main(int argc, char ** argv) { llama_context * ctx; { - auto lparams = llama_context_default_params(); + auto mparams = llama_model_default_params(); + mparams.use_mlock = false; - lparams.n_ctx = 256; - lparams.seed = 1; - lparams.f16_kv = false; - lparams.use_mlock = false; - - model = llama_load_model_from_file(params.model.c_str(), lparams); + model = llama_load_model_from_file(params.model.c_str(), mparams); if (model == NULL) { fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str()); return 1; } - ctx = llama_new_context_with_model(model, lparams); + auto cparams = llama_context_default_params(); + cparams.n_ctx = 256; + cparams.seed = 1; + cparams.f16_kv = false; + + ctx = llama_new_context_with_model(model, cparams); if (ctx == NULL) { fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str()); diff --git a/examples/quantize/README.md b/examples/quantize/README.md index f349e913e3d10dc84ca1e045048e92ede8a3d5d6..c8b9a27a0b04e47aed9706045d7539b4aff670f1 100644 --- a/examples/quantize/README.md +++ b/examples/quantize/README.md @@ -1,3 +1,44 @@ # quantize TODO + +## Llama 2 7B + +Quantization | Bits per Weight (BPW) +-- | -- +Q2_K | 3.35 +Q3_K_S | 3.50 +Q3_K_M | 3.91 +Q3_K_L | 4.27 +Q4_K_S | 4.58 +Q4_K_M | 4.84 +Q5_K_S | 5.52 +Q5_K_M | 5.68 +Q6_K | 6.56 + +## Llama 2 13B +Quantization | Bits per Weight (BPW) +-- | -- +Q2_K | 3.34 +Q3_K_S | 3.48 +Q3_K_M | 3.89 +Q3_K_L | 4.26 +Q4_K_S | 4.56 +Q4_K_M | 4.83 +Q5_K_S | 5.51 +Q5_K_M | 5.67 +Q6_K | 6.56 + +# Llama 2 70B + +Quantization | Bits per Weight (BPW) +-- | -- +Q2_K | 3.40 +Q3_K_S | 3.47 +Q3_K_M | 3.85 +Q3_K_L | 4.19 +Q4_K_S | 4.53 +Q4_K_M | 4.80 +Q5_K_S | 5.50 +Q5_K_M | 5.65 +Q6_K | 6.56 diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 1c1d957e63d5b33430ef327ba388d7478071ecfa..c7dd0d894634cbb242a5dcdbb2fbe45cbe8f104e 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -72,6 +72,7 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp // usage: // ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads] // +[[noreturn]] static void usage(const char * executable) { printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable); printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n"); diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp index 95527bb863524f80e58fccff86c967a70b89995b..acc6dbdfd07d05f7a7462078daf8cbd54dd0965b 100644 --- a/examples/save-load-state/save-load-state.cpp +++ b/examples/save-load-state/save-load-state.cpp @@ -23,23 +23,17 @@ int main(int argc, char ** argv) { params.n_predict = 16; } - auto lparams = llama_context_default_params(); - - lparams.n_ctx = params.n_ctx; - lparams.seed = params.seed; - lparams.f16_kv = params.memory_f16; - lparams.use_mmap = params.use_mmap; - lparams.use_mlock = params.use_mlock; - auto n_past = 0; auto last_n_tokens_data = std::vector(params.repeat_last_n, 0); // init - auto model = llama_load_model_from_file(params.model.c_str(), lparams); + llama_model * model; + llama_context * ctx; + + std::tie(model, ctx) = llama_init_from_gpt_params( params ); if (model == nullptr) { return 1; } - auto ctx = llama_new_context_with_model(model, lparams); if (ctx == nullptr) { llama_free_model(model); return 1; @@ -54,7 +48,7 @@ int main(int argc, char ** argv) { } // evaluate prompt - llama_eval(ctx, tokens.data(), n_prompt_tokens, n_past, params.n_threads); + llama_decode(ctx, llama_batch_get_one(tokens.data(), n_prompt_tokens, n_past, 0)); last_n_tokens_data.insert(last_n_tokens_data.end(), tokens.data(), tokens.data() + n_prompt_tokens); n_past += n_prompt_tokens; @@ -78,8 +72,8 @@ int main(int argc, char ** argv) { printf("\n%s", params.prompt.c_str()); for (auto i = 0; i < params.n_predict; i++) { - auto logits = llama_get_logits(ctx); - auto n_vocab = llama_n_vocab(ctx); + auto * logits = llama_get_logits(ctx); + auto n_vocab = llama_n_vocab(model); std::vector candidates; candidates.reserve(n_vocab); for (llama_token token_id = 0; token_id < n_vocab; token_id++) { @@ -91,7 +85,7 @@ int main(int argc, char ** argv) { last_n_tokens_data.push_back(next_token); printf("%s", next_token_str.c_str()); - if (llama_eval(ctx, &next_token, 1, n_past, params.n_threads)) { + if (llama_decode(ctx, llama_batch_get_one(&next_token, 1, n_past, 0))) { fprintf(stderr, "\n%s : failed to evaluate\n", __func__); llama_free(ctx); llama_free_model(model); @@ -106,7 +100,7 @@ int main(int argc, char ** argv) { llama_free(ctx); // make new context - auto ctx2 = llama_new_context_with_model(model, lparams); + auto * ctx2 = llama_new_context_with_model(model, llama_context_params_from_gpt_params(params)); // Load state (rng, logits, embedding and kv_cache) from file { @@ -138,8 +132,8 @@ int main(int argc, char ** argv) { // second run for (auto i = 0; i < params.n_predict; i++) { - auto logits = llama_get_logits(ctx2); - auto n_vocab = llama_n_vocab(ctx2); + auto * logits = llama_get_logits(ctx2); + auto n_vocab = llama_n_vocab(model); std::vector candidates; candidates.reserve(n_vocab); for (llama_token token_id = 0; token_id < n_vocab; token_id++) { @@ -151,7 +145,7 @@ int main(int argc, char ** argv) { last_n_tokens_data.push_back(next_token); printf("%s", next_token_str.c_str()); - if (llama_eval(ctx2, &next_token, 1, n_past, params.n_threads)) { + if (llama_decode(ctx, llama_batch_get_one(&next_token, 1, n_past, 0))) { fprintf(stderr, "\n%s : failed to evaluate\n", __func__); llama_free(ctx2); llama_free_model(model); diff --git a/examples/server/README.md b/examples/server/README.md index 5176080463839618087ff6dd90bfd6ef7d6ba907..d409e8408f192df6c5ce7f331a73103e3bdf06fe 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -4,14 +4,14 @@ This example demonstrates a simple HTTP API server and a simple web front end to Command line options: -- `--threads N`, `-t N`: Set the number of threads to use during computation. +- `--threads N`, `-t N`: Set the number of threads to use during generation. +- `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation. - `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`). - `-m ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses. - `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of 4096. - `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance. - `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS. - `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS. -- `-lv, --low-vram`: Do not allocate a VRAM scratch buffer for holding temporary results. Reduces VRAM usage at the cost of performance, particularly prompt processing speed. Requires cuBLAS. - `-b N`, `--batch-size N`: Set the batch size for prompt processing. Default: `512`. - `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. Not recommended. - `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped. diff --git a/examples/server/server.cpp b/examples/server/server.cpp index ebd7f2fc579e992245574b4f58e2b7abe24db4f7..fe9a4255e768006e5abd036edbb529205e190cf5 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -200,6 +200,7 @@ struct llama_server_context llama_model *model = nullptr; llama_context *ctx = nullptr; gpt_params params; + int n_ctx; grammar_parser::parse_state parsed_grammar; llama_grammar *grammar = nullptr; @@ -239,7 +240,7 @@ struct llama_server_context num_prompt_tokens = 0; num_tokens_predicted = 0; generated_text = ""; - generated_text.reserve(params.n_ctx); + generated_text.reserve(n_ctx); generated_token_probs.clear(); truncated = false; stopped_eos = false; @@ -265,8 +266,8 @@ struct llama_server_context LOG_ERROR("unable to load model", {{"model", params_.model}}); return false; } - - last_n_tokens.resize(params.n_ctx); + n_ctx = llama_n_ctx(ctx); + last_n_tokens.resize(n_ctx); std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0); return true; } @@ -351,19 +352,19 @@ struct llama_server_context { params.n_keep = (int)num_prompt_tokens; } - params.n_keep = std::min(params.n_ctx - 4, params.n_keep); + params.n_keep = std::min(n_ctx - 4, params.n_keep); // if input prompt is too big, truncate like normal - if (num_prompt_tokens >= (size_t)params.n_ctx) + if (num_prompt_tokens >= (size_t)n_ctx) { - const int n_left = (params.n_ctx - params.n_keep) / 2; + const int n_left = (n_ctx - params.n_keep) / 2; std::vector new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + params.n_keep); const int erased_blocks = (num_prompt_tokens - params.n_keep - n_left - 1) / n_left; new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + params.n_keep + erased_blocks * n_left, prompt_tokens.end()); - std::copy(prompt_tokens.end() - params.n_ctx, prompt_tokens.end(), last_n_tokens.begin()); + std::copy(prompt_tokens.end() - n_ctx, prompt_tokens.end(), last_n_tokens.begin()); LOG_VERBOSE("input truncated", { - {"n_ctx", params.n_ctx}, + {"n_ctx", n_ctx}, {"n_keep", params.n_keep}, {"n_left", n_left}, {"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())}, @@ -381,6 +382,10 @@ struct llama_server_context // compare the evaluated prompt with the new prompt n_past = common_part(embd, prompt_tokens); + + // since #3228 we now have to manually manage the KV cache + llama_kv_cache_seq_rm(ctx, 0, n_past, params.n_ctx); + embd = prompt_tokens; if (n_past == num_prompt_tokens) { @@ -409,21 +414,29 @@ struct llama_server_context completion_token_output result; result.tok = -1; - if (embd.size() >= (size_t)params.n_ctx) + if (embd.size() >= (size_t)n_ctx) { - // Reset context - const int n_left = (params.n_ctx - params.n_keep) / 2; + // Shift context + + const int n_left = n_past - params.n_keep - 1; + const int n_discard = n_left/2; + + llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1); + llama_kv_cache_seq_shift(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard); + + for (size_t i = params.n_keep + 1 + n_discard; i < embd.size(); i++) + { + embd[i - n_discard] = embd[i]; + } + embd.resize(embd.size() - n_discard); + + n_past -= n_discard; - std::vector new_tokens(embd.begin(), embd.begin() + params.n_keep); - new_tokens.insert(new_tokens.end(), embd.end() - n_left, embd.end()); - embd = new_tokens; - n_past = params.n_keep; truncated = true; LOG_VERBOSE("input truncated", { - {"n_ctx", params.n_ctx}, + {"n_ctx", n_ctx}, {"n_keep", params.n_keep}, {"n_left", n_left}, - {"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())}, }); } @@ -434,12 +447,12 @@ struct llama_server_context { n_eval = params.n_batch; } - if (llama_eval(ctx, &embd[n_past], n_eval, n_past, params.n_threads)) + + if (llama_decode(ctx, llama_batch_get_one(&embd[n_past], n_eval, n_past, 0))) { LOG_ERROR("failed to eval", { {"n_eval", n_eval}, {"n_past", n_past}, - {"n_threads", params.n_threads}, {"embd", tokens_to_str(ctx, embd.cbegin() + n_past, embd.cend())}, }); has_next_token = false; @@ -457,11 +470,11 @@ struct llama_server_context // out of user input, sample next token const float temp = params.temp; - const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k; + const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(model) : params.top_k; const float top_p = params.top_p; const float tfs_z = params.tfs_z; const float typical_p = params.typical_p; - const int32_t repeat_last_n = params.repeat_last_n < 0 ? params.n_ctx : params.repeat_last_n; + const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n; const float repeat_penalty = params.repeat_penalty; const float alpha_presence = params.presence_penalty; const float alpha_frequency = params.frequency_penalty; @@ -473,7 +486,7 @@ struct llama_server_context { auto *logits = llama_get_logits(ctx); - auto n_vocab = llama_n_vocab(ctx); + auto n_vocab = llama_n_vocab(model); // Apply params.logit_bias map for (const auto &it : params.logit_bias) @@ -492,7 +505,7 @@ struct llama_server_context // Apply penalties float nl_logit = logits[llama_token_nl(ctx)]; - auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), params.n_ctx); + auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx); llama_sample_repetition_penalty(ctx, &candidates_p, last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, last_n_repeat, repeat_penalty); @@ -523,13 +536,13 @@ struct llama_server_context { static float mirostat_mu = 2.0f * mirostat_tau; const int mirostat_m = 100; - llama_sample_temperature(ctx, &candidates_p, temp); + llama_sample_temp(ctx, &candidates_p, temp); result.tok = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu); } else if (mirostat == 2) { static float mirostat_mu = 2.0f * mirostat_tau; - llama_sample_temperature(ctx, &candidates_p, temp); + llama_sample_temp(ctx, &candidates_p, temp); result.tok = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu); } else @@ -540,7 +553,7 @@ struct llama_server_context llama_sample_tail_free(ctx, &candidates_p, tfs_z, min_keep); llama_sample_typical(ctx, &candidates_p, typical_p, min_keep); llama_sample_top_p(ctx, &candidates_p, top_p, min_keep); - llama_sample_temperature(ctx, &candidates_p, temp); + llama_sample_temp(ctx, &candidates_p, temp); result.tok = llama_sample_token(ctx, &candidates_p); } } @@ -677,7 +690,7 @@ struct llama_server_context std::vector getEmbedding() { - static const int n_embd = llama_n_embd(ctx); + static const int n_embd = llama_n_embd(model); if (!params.embedding) { LOG_WARNING("embedding disabled", { @@ -721,7 +734,6 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms, printf(" -ts SPLIT --tensor-split SPLIT\n"); printf(" how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n"); printf(" -mg i, --main-gpu i the GPU to use for scratch and small tensors\n"); - printf(" -lv, --low-vram don't allocate VRAM scratch buffer\n"); printf(" -nommq, --no-mul-mat-q\n"); printf(" use cuBLAS instead of custom mul_mat_q CUDA kernels.\n"); printf(" Not recommended since this is both slower and uses more VRAM.\n"); @@ -905,14 +917,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, } #else LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n", {}); -#endif // GGML_USE_CUBLAS - } - else if (arg == "--low-vram" || arg == "-lv") - { -#ifdef GGML_USE_CUBLAS - params.low_vram = true; -#else - LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n", {}); #endif // GGML_USE_CUBLAS } else if (arg == "--no-mul-mat-q" || arg == "-nommq") @@ -943,7 +947,23 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, invalid_param = true; break; } - params.lora_adapter = argv[i]; + params.lora_adapter.push_back({argv[i], 1.0f}); + params.use_mmap = false; + } + else if (arg == "--lora-scaled") + { + if (++i >= argc) + { + invalid_param = true; + break; + } + const char * lora_adapter = argv[i]; + if (++i >= argc) + { + invalid_param = true; + break; + } + params.lora_adapter.push_back({lora_adapter, std::stof(argv[i])}); params.use_mmap = false; } else if (arg == "--lora-base") @@ -1002,7 +1022,7 @@ static json format_generation_settings(llama_server_context &llama) eos_bias->second < 0.0f && std::isinf(eos_bias->second); return json{ - {"n_ctx", llama.params.n_ctx}, + {"n_ctx", llama.n_ctx}, {"model", llama.params.model_alias}, {"seed", llama.params.seed}, {"temp", llama.params.temp}, @@ -1162,7 +1182,7 @@ static void parse_options_completion(const json &body, llama_server_context &lla const auto &logit_bias = body.find("logit_bias"); if (logit_bias != body.end() && logit_bias->is_array()) { - const int n_vocab = llama_n_vocab(llama.ctx); + const int n_vocab = llama_n_vocab(llama.model); for (const auto &el : *logit_bias) { if (el.is_array() && el.size() == 2 && el[0].is_number_integer()) @@ -1295,6 +1315,7 @@ int main(int argc, char **argv) {"commit", BUILD_COMMIT}}); LOG_INFO("system info", { {"n_threads", params.n_threads}, + {"n_threads_batch", params.n_threads_batch}, {"total_threads", std::thread::hardware_concurrency()}, {"system_info", llama_print_system_info()}, }); @@ -1358,7 +1379,7 @@ int main(int argc, char **argv) if (llama.params.n_beams) { // Fill llama.generated_token_probs vector with final beam. llama_beam_search(llama.ctx, beam_search_callback, &llama, llama.params.n_beams, - llama.n_past, llama.n_remain, llama.params.n_threads); + llama.n_past, llama.n_remain); // Translate llama.generated_token_probs to llama.generated_text. append_to_generated_text_from_generated_token_probs(llama); } else { diff --git a/examples/simple/README.md b/examples/simple/README.md new file mode 100644 index 0000000000000000000000000000000000000000..5d24b1046935c087bd1afbebfde3011a4589a1ce --- /dev/null +++ b/examples/simple/README.md @@ -0,0 +1,21 @@ +# llama.cpp/example/simple + +The purpose of this example is to demonstrate a minimal usage of llama.cpp for generating text with a given prompt. + +```bash +./simple ./models/llama-7b-v2/ggml-model-f16.gguf "Hello my name is" + +... + +main: n_len = 32, n_ctx = 2048, n_parallel = 1, n_kv_req = 32 + + Hello my name is Shawn and I'm a 20 year old male from the United States. I'm a 20 year old + +main: decoded 27 tokens in 2.31 s, speed: 11.68 t/s + +llama_print_timings: load time = 579.15 ms +llama_print_timings: sample time = 0.72 ms / 28 runs ( 0.03 ms per token, 38888.89 tokens per second) +llama_print_timings: prompt eval time = 655.63 ms / 10 tokens ( 65.56 ms per token, 15.25 tokens per second) +llama_print_timings: eval time = 2180.97 ms / 27 runs ( 80.78 ms per token, 12.38 tokens per second) +llama_print_timings: total time = 2891.13 ms +``` diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index 440d22ecfb4a8917816dd7ecb4967be858ddc1a8..24fb16b78d0581e92f6b3ec04fa77b2d44ba7208 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -26,35 +26,62 @@ int main(int argc, char ** argv) { params.prompt = "Hello my name is"; } + // total length of the sequence including the prompt + const int n_len = 32; + // init LLM llama_backend_init(params.numa); - llama_context_params ctx_params = llama_context_default_params(); + // initialize the model + + llama_model_params model_params = llama_model_default_params(); - llama_model * model = llama_load_model_from_file(params.model.c_str(), ctx_params); + // model_params.n_gpu_layers = 99; // offload all layers to the GPU + + llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params); if (model == NULL) { fprintf(stderr , "%s: error: unable to load model\n" , __func__); return 1; } + // initialize the context + + llama_context_params ctx_params = llama_context_default_params(); + + ctx_params.seed = 1234; + ctx_params.n_ctx = 2048; + ctx_params.n_threads = params.n_threads; + ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch; + llama_context * ctx = llama_new_context_with_model(model, ctx_params); + if (ctx == NULL) { + fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__); + return 1; + } + // tokenize the prompt std::vector tokens_list; tokens_list = ::llama_tokenize(ctx, params.prompt, true); - const int max_context_size = llama_n_ctx(ctx); - const int max_tokens_list_size = max_context_size - 4; + const int n_ctx = llama_n_ctx(ctx); + const int n_kv_req = tokens_list.size() + (n_len - tokens_list.size()); + + LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, n_kv_req); - if ((int) tokens_list.size() > max_tokens_list_size) { - fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) tokens_list.size(), max_tokens_list_size); + // make sure the KV cache is big enough to hold all the prompt and generated tokens + if (n_kv_req > n_ctx) { + LOG_TEE("%s: error: n_kv_req > n_ctx, the required KV cache size is not big enough\n", __func__); + LOG_TEE("%s: either reduce n_parallel or increase n_ctx\n", __func__); return 1; } - fprintf(stderr, "\n\n"); + // print the prompt token-by-token + + fprintf(stderr, "\n"); for (auto id : tokens_list) { fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str()); @@ -62,63 +89,104 @@ int main(int argc, char ** argv) { fflush(stderr); - // main loop + // create a llama_batch with size 512 + // we use this object to submit token data for decoding - // The LLM keeps a contextual cache memory of previous token evaluation. - // Usually, once this cache is full, it is required to recompute a compressed context based on previous - // tokens (see "infinite text generation via context swapping" in the main example), but in this minimalist - // example, we will just stop the loop once this cache is full or once an end of stream is detected. + llama_batch batch = llama_batch_init(512, 0); - const int n_gen = std::min(32, max_context_size); + // evaluate the initial prompt + batch.n_tokens = tokens_list.size(); - while (llama_get_kv_cache_token_count(ctx) < n_gen) { - // evaluate the transformer + for (int32_t i = 0; i < batch.n_tokens; i++) { + batch.token[i] = tokens_list[i]; + batch.pos[i] = i; + batch.seq_id[i] = 0; + batch.logits[i] = false; + } - if (llama_eval(ctx, tokens_list.data(), int(tokens_list.size()), llama_get_kv_cache_token_count(ctx), params.n_threads)) { - fprintf(stderr, "%s : failed to eval\n", __func__); - return 1; - } + // llama_decode will output logits only for the last token of the prompt + batch.logits[batch.n_tokens - 1] = true; - tokens_list.clear(); + if (llama_decode(ctx, batch) != 0) { + LOG_TEE("%s: llama_decode() failed\n", __func__); + return 1; + } + // main loop + + int n_cur = batch.n_tokens; + int n_decode = 0; + + const auto t_main_start = ggml_time_us(); + + while (n_cur <= n_len) { // sample the next token + { + auto n_vocab = llama_n_vocab(model); + auto * logits = llama_get_logits_ith(ctx, batch.n_tokens - 1); - llama_token new_token_id = 0; + std::vector candidates; + candidates.reserve(n_vocab); - auto logits = llama_get_logits(ctx); - auto n_vocab = llama_n_vocab(ctx); + for (llama_token token_id = 0; token_id < n_vocab; token_id++) { + candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f }); + } - std::vector candidates; - candidates.reserve(n_vocab); + llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; - for (llama_token token_id = 0; token_id < n_vocab; token_id++) { - candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f }); - } + // sample the most likely token + const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p); + + // is it an end of stream? + if (new_token_id == llama_token_eos(ctx) || n_cur == n_len) { + LOG_TEE("\n"); + + break; + } + + LOG_TEE("%s", llama_token_to_piece(ctx, new_token_id).c_str()); + fflush(stdout); + + // prepare the next batch + batch.n_tokens = 0; - llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; + // push this new token for next evaluation + batch.token [batch.n_tokens] = new_token_id; + batch.pos [batch.n_tokens] = n_cur; + batch.seq_id[batch.n_tokens] = 0; + batch.logits[batch.n_tokens] = true; - new_token_id = llama_sample_token_greedy(ctx , &candidates_p); + batch.n_tokens += 1; - // is it an end of stream ? - if (new_token_id == llama_token_eos(ctx)) { - fprintf(stderr, " [end of text]\n"); - break; + n_decode += 1; } - // print the new token : - printf("%s", llama_token_to_piece(ctx, new_token_id).c_str()); - fflush(stdout); + n_cur += 1; - // push this new token for next evaluation - tokens_list.push_back(new_token_id); + // evaluate the current batch with the transformer model + if (llama_decode(ctx, batch)) { + fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1); + return 1; + } } + LOG_TEE("\n"); + + const auto t_main_end = ggml_time_us(); + + LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n", + __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f)); + + llama_print_timings(ctx); + + fprintf(stderr, "\n"); + + llama_batch_free(batch); + llama_free(ctx); llama_free_model(model); llama_backend_free(); - fprintf(stderr, "\n\n"); - return 0; } diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index aa904183fa2d8ebc77cd012972085d405e5ae09f..c5e5b234f0f5cc45037c041a69e81912e1c7ce4c 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -37,7 +37,7 @@ int main(int argc, char ** argv) { llama_context * ctx_dft = NULL; // load the target model - params.perplexity = true; // HACK: enable logits_all = true + params.logits_all = true; std::tie(model_tgt, ctx_tgt) = llama_init_from_gpt_params(params); // load the draft model @@ -70,16 +70,16 @@ int main(int argc, char ** argv) { const auto t_enc_start = ggml_time_us(); // eval the prompt with both models - llama_eval(ctx_tgt, inp.data(), int(inp.size() - 1), 0, params.n_threads); - llama_eval(ctx_tgt, &inp.back(), 1, inp.size() - 1, params.n_threads); - llama_eval(ctx_dft, inp.data(), int(inp.size()), 0, params.n_threads); + llama_decode(ctx_tgt, llama_batch_get_one( inp.data(), n_input - 1, 0, 0)); + llama_decode(ctx_tgt, llama_batch_get_one(&inp.back(), 1, n_input - 1, 0)); + llama_decode(ctx_dft, llama_batch_get_one( inp.data(), n_input, 0, 0)); const auto t_enc_end = ggml_time_us(); // the 2 models should have the same vocab const int n_ctx = llama_n_ctx(ctx_tgt); - const int n_vocab = llama_n_vocab(ctx_tgt); - //GGML_ASSERT(n_vocab == llama_n_vocab(ctx_dft)); + const int n_vocab = llama_n_vocab(model_tgt); + //GGML_ASSERT(n_vocab == llama_n_vocab(model_dft)); // how many tokens to draft each time int n_draft = params.n_draft; @@ -134,7 +134,7 @@ int main(int argc, char ** argv) { while (true) { // sample from the target model - const llama_token id = llama_sample_token(ctx_tgt, NULL, grammar_tgt, params, last_tokens, candidates, i_dft); + llama_token id = llama_sample_token(ctx_tgt, NULL, grammar_tgt, params, last_tokens, candidates, i_dft); // remember which tokens were sampled - used for repetition penalties during sampling last_tokens.erase(last_tokens.begin()); @@ -172,7 +172,8 @@ int main(int argc, char ** argv) { LOG("out of drafted tokens\n"); } - llama_eval(ctx_dft, &id, 1, n_past_dft, params.n_threads); + llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, n_ctx); + llama_decode(ctx_dft, llama_batch_get_one(&id, 1, n_past_dft, 0)); ++n_past_dft; // heuristic for n_draft @@ -256,7 +257,8 @@ int main(int argc, char ** argv) { } // evaluate the drafted token on the draft model - llama_eval(ctx_dft, &drafted.back(), 1, n_past_cur, params.n_threads); + llama_kv_cache_seq_rm(ctx_dft, 0, n_past_cur, n_ctx); + llama_decode(ctx_dft, llama_batch_get_one(&drafted.back(), 1, n_past_cur, 0)); ++n_past_cur; if (grammar_dft != NULL) { @@ -265,7 +267,8 @@ int main(int argc, char ** argv) { } // evaluate the target model on the drafted tokens - llama_eval(ctx_tgt, drafted.data(), drafted.size(), n_past_tgt, params.n_threads); + llama_kv_cache_seq_rm(ctx_tgt, 0, n_past_tgt, n_ctx); + llama_decode(ctx_tgt, llama_batch_get_one(drafted.data(), drafted.size(), n_past_tgt, 0)); ++n_past_tgt; // the first token is always proposed by the traget model before the speculation loop diff --git a/examples/train-text-from-scratch/README.md b/examples/train-text-from-scratch/README.md index f4ffcd9876c0c7a08c7ac883dbf0498e94280b30..1b3454069e9a38a55fb34eebd1dd25be05a05a57 100644 --- a/examples/train-text-from-scratch/README.md +++ b/examples/train-text-from-scratch/README.md @@ -10,9 +10,9 @@ wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/s ./bin/train-text-from-scratch \ --vocab-model ../models/ggml-vocab-llama.gguf \ --ctx 64 --embd 256 --head 8 --layer 16 \ - --checkpoint-in chk-shakespeare-256x16.gguf \ - --checkpoint-out chk-shakespeare-256x16.gguf \ - --model-out ggml-shakespeare-256x16-f32.gguf \ + --checkpoint-in chk-shakespeare-256x16-LATEST.gguf \ + --checkpoint-out chk-shakespeare-256x16-ITERATION.gguf \ + --model-out ggml-shakespeare-256x16-f32-ITERATION.gguf \ --train-data "shakespeare.txt" \ -t 6 -b 16 --seed 1 --adam-iter 256 \ --no-checkpointing @@ -20,3 +20,8 @@ wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/s # predict ./bin/main -m ggml-shakespeare-256x16-f32.gguf ``` + +Output files will be saved every N iterations (config with `--save-every N`). +The pattern "ITERATION" in the output filenames will be replaced with the iteration number and "LATEST" for the latest output. + +To train GGUF models just pass them to `--checkpoint-in FN`. diff --git a/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py b/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py index a527d615304b8abb88d07dc21b84c3b0381040eb..351e7bc2d2a95f2117b574b3d8002c44dfcd4ba3 100644 --- a/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py +++ b/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py @@ -47,10 +47,13 @@ LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS = "optimizer.lbfgs.memory_ys" LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S = "optimizer.lbfgs.memory_s" LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y = "optimizer.lbfgs.memory_y" -LLM_KV_TRAINING_FILE_VERSION = "training.file_version" -LLM_KV_TRAINING_ITERATION_COUNT = "training.iteration_count" -LLM_KV_TRAINING_SAMPLE_COUNT = "training.sample_count" -LLM_KV_TRAINING_TOKEN_COUNT = "training.token_count" +LLM_KV_TRAINING_TYPE_TRAIN_MODEL = "train_model" +LLM_KV_TRAINING_TYPE_FINETUNE_LORA = "finetune_lora" +LLM_KV_TRAINING_TYPE = "training.type" +LLM_KV_TRAINING_FILE_VERSION = "training.file_version" +LLM_KV_TRAINING_ITERATION_COUNT = "training.iteration_count" +LLM_KV_TRAINING_SAMPLE_COUNT = "training.sample_count" +LLM_KV_TRAINING_TOKEN_COUNT = "training.token_count" class Tensor: def __init__(self, dtype='f', ne=None): @@ -460,6 +463,7 @@ class Checkpoint: gguf_writer.add_file_type(gguf.GGMLQuantizationType.F32) gguf_writer.add_layer_norm_rms_eps(1e-5) gguf_writer.add_uint32(LLM_KV_TRAINING_FILE_VERSION, 0) + gguf_writer.add_string(LLM_KV_TRAINING_TYPE, LLM_KV_TRAINING_TYPE_TRAIN_MODEL) gguf_writer.add_uint32(LLM_KV_TRAINING_ITERATION_COUNT, self.train_its) gguf_writer.add_uint32(LLM_KV_TRAINING_SAMPLE_COUNT, self.train_samples) gguf_writer.add_uint32(LLM_KV_TRAINING_TOKEN_COUNT, self.train_tokens) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 59c90c7ba654d9c387284bb5f2df48a2d1da0bc5..be693b3ac7a43bc6bdf590b4d089378881f3373f 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1,6 +1,7 @@ #include "ggml.h" #include "ggml-alloc.h" #include "common.h" +#include "train.h" #include "llama.h" #include #include @@ -18,142 +19,7 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif -struct random_normal_distribution { - std::mt19937 gen; - std::normal_distribution rd; - float min; - float max; -}; - -struct random_uniform_distribution { - std::mt19937 gen; - std::uniform_real_distribution rd; -}; - -void init_random_normal_distribution(struct random_normal_distribution * rnd, int seed, float mean, float std, float min, float max) { - rnd->gen = std::mt19937(seed); - rnd->rd = std::normal_distribution{mean, std}; - rnd->min = min; - rnd->max = max; -} - -void init_random_uniform_distribution(struct random_uniform_distribution * rnd, int seed, float min, float max) { - rnd->gen = std::mt19937(seed); - rnd->rd = std::uniform_real_distribution{min, max}; -} - -int clamp(const int v, const int min, const int max) { - return ((v < min) ? (min) : (v > max) ? (max) : v); -} - -float fclamp(const float v, const float min, const float max) { - return ((v < min) ? (min) : (v > max) ? (max) : v); -} - -float frand() { - return (float)rand()/(float)RAND_MAX; -} - -float frand_normal(struct random_normal_distribution * rnd) { - return fclamp(rnd->rd(rnd->gen), rnd->min, rnd->max); -} - -float frand_uniform(struct random_uniform_distribution * rnd) { - return rnd->rd(rnd->gen); -} - -struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct random_normal_distribution * rnd) { - float scale = 1.0f; // xavier - switch (tensor->n_dims) { - case 1: - scale /= sqrtf(tensor->ne[0]); - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0]); - *dst = scale * frand_normal(rnd); - } - break; - case 2: - scale /= sqrtf(tensor->ne[0]+tensor->ne[1]); - for (int i1 = 0; i1 < tensor->ne[1]; i1++) { - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); - *dst = scale * frand_normal(rnd); - } - } - break; - case 3: - scale /= sqrtf(tensor->ne[0]+tensor->ne[1]); - for (int i2 = 0; i2 < tensor->ne[2]; i2++) { - for (int i1 = 0; i1 < tensor->ne[1]; i1++) { - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2]); - *dst = scale * frand_normal(rnd); - } - } - } - break; - case 4: - scale /= sqrtf(tensor->ne[0]+tensor->ne[1]); - for (int i3 = 0; i3 < tensor->ne[3]; i3++) { - for (int i2 = 0; i2 < tensor->ne[2]; i2++) { - for (int i1 = 0; i1 < tensor->ne[1]; i1++) { - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]); - *dst = scale * frand_normal(rnd); - } - } - } - } - break; - default: - assert(false); - }; - return tensor; -} - -struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struct random_uniform_distribution * rnd) { - switch (tensor->n_dims) { - case 1: - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0]); - *dst = frand_uniform(rnd); - } - break; - case 2: - for (int i1 = 0; i1 < tensor->ne[1]; i1++) { - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); - *dst = frand_uniform(rnd); - } - } - break; - case 3: - for (int i2 = 0; i2 < tensor->ne[2]; i2++) { - for (int i1 = 0; i1 < tensor->ne[1]; i1++) { - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2]); - *dst = frand_uniform(rnd); - } - } - } - break; - case 4: - for (int i3 = 0; i3 < tensor->ne[3]; i3++) { - for (int i2 = 0; i2 < tensor->ne[2]; i2++) { - for (int i1 = 0; i1 < tensor->ne[1]; i1++) { - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]); - *dst = frand_uniform(rnd); - } - } - } - } - break; - default: - assert(false); - }; - return tensor; -} +static const size_t tensor_alignment = 32; struct my_llama_hparams { uint32_t n_vocab = 32000; @@ -164,8 +30,8 @@ struct my_llama_hparams { uint32_t n_rot = 64; uint32_t n_ff = 11008; - // float f_norm_eps = 1e-5; // falcon - float f_norm_rms_eps = 1e-5; // llama + // float f_norm_eps = 1e-5f; // falcon + float f_norm_rms_eps = 1e-5f; // llama float rope_freq_base = 10000.0f; float rope_freq_scale = 1.0f; @@ -192,6 +58,7 @@ struct my_llama_layer { struct my_llama_model { struct ggml_context * ctx = NULL; + std::vector data; my_llama_hparams hparams; @@ -201,92 +68,50 @@ struct my_llama_model { struct ggml_tensor * output; std::vector layers; - - uint32_t train_its = 0; - uint32_t train_samples = 0; - uint32_t train_tokens = 0; }; -// gguf constants -const char * LLM_KV_OPTIMIZER_TYPE = "optimizer.type"; -const char * LLM_KV_OPTIMIZER_TYPE_ADAM = "adam"; -const char * LLM_KV_OPTIMIZER_TYPE_LBFGS = "lbfgs"; -const char * LLM_KV_OPTIMIZER_FILE_VERSION = "optimizer.file_version"; -const char * LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT = "optimizer.convergence_past_count"; -const char * LLM_KV_OPTIMIZER_PARAMETER_COUNT = "optimizer.parameter_count"; -const char * LLM_KV_OPTIMIZER_ITERATION_COUNT = "optimizer.iteration_count"; -const char * LLM_KV_OPTIMIZER_JUST_INITIALIZED = "optimizer.just_initialized"; -const char * LLM_KV_OPTIMIZER_ADAM_BEST_LOSS = "optimizer.adam.best_loss"; -const char * LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS = "optimizer.adam.previous_loss"; -const char * LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT = "optimizer.adam.no_improvement_count"; -const char * LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT = "optimizer.lbfgs.approx_hessian_count"; -const char * LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS = "optimizer.lbfgs.best_loss"; -const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP = "optimizer.lbfgs.line_search_step"; -const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J = "optimizer.lbfgs.line_search_j"; -const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K = "optimizer.lbfgs.line_search_k"; -const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END = "optimizer.lbfgs.line_search_end"; -const char * LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT = "optimizer.lbfgs.no_improvement_count"; - -const char * LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS = "optimizer.adam.first_moments"; -const char * LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS = "optimizer.adam.second_moments"; -const char * LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES = "optimizer.adam.past_loss_values"; - -const char * LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS = "optimizer.lbfgs.current_parameters"; -const char * LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS = "optimizer.lbfgs.previous_parameters"; -const char * LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS = "optimizer.lbfgs.current_gradients"; -const char * LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS = "optimizer.lbfgs.previous_gradients"; -const char * LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION = "optimizer.lbfgs.search_direction"; -const char * LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES = "optimizer.lbfgs.past_loss_values"; -const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA = "optimizer.lbfgs.memory_alpha"; -const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS = "optimizer.lbfgs.memory_ys"; -const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S = "optimizer.lbfgs.memory_s"; -const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y = "optimizer.lbfgs.memory_y"; - -const char * LLM_KV_TRAINING_FILE_VERSION = "training.file_version"; -const char * LLM_KV_TRAINING_ITERATION_COUNT = "training.iteration_count"; -const char * LLM_KV_TRAINING_SAMPLE_COUNT = "training.sample_count"; -const char * LLM_KV_TRAINING_TOKEN_COUNT = "training.token_count"; - // gguf constants (sync with gguf.py) - -const char * LLM_KV_GENERAL_ARCHITECTURE = "general.architecture"; -const char * LLM_KV_GENERAL_FILE_TYPE = "general.file_type"; - -const char * LLM_KV_CONTEXT_LENGTH = "%s.context_length"; -const char * LLM_KV_EMBEDDING_LENGTH = "%s.embedding_length"; -const char * LLM_KV_BLOCK_COUNT = "%s.block_count"; -const char * LLM_KV_FEED_FORWARD_LENGTH = "%s.feed_forward_length"; -const char * LLM_KV_ATTENTION_HEAD_COUNT = "%s.attention.head_count"; -const char * LLM_KV_ATTENTION_LAYERNORM_RMS_EPS = "%s.attention.layer_norm_rms_epsilon"; -const char * LLM_KV_ROPE_DIMENSION_COUNT = "%s.rope.dimension_count"; -const char * LLM_KV_ROPE_FREQ_BASE = "%s.rope.freq_base"; // TODO load in llama.cpp -const char * LLM_KV_ROPE_SCALE_LINEAR = "%s.rope.scale_linear"; - -const char * LLM_KV_TOKENIZER_MODEL = "tokenizer.ggml.model"; -const char * LLM_KV_TOKENIZER_LIST = "tokenizer.ggml.tokens"; -const char * LLM_KV_TOKENIZER_TOKEN_TYPE = "tokenizer.ggml.token_type"; -const char * LLM_KV_TOKENIZER_SCORES = "tokenizer.ggml.scores"; -const char * LLM_KV_TOKENIZER_MERGES = "tokenizer.ggml.merges"; -const char * LLM_KV_TOKENIZER_BOS_ID = "tokenizer.ggml.bos_token_id"; -const char * LLM_KV_TOKENIZER_EOS_ID = "tokenizer.ggml.eos_token_id"; -const char * LLM_KV_TOKENIZER_UNK_ID = "tokenizer.ggml.unknown_token_id"; -const char * LLM_KV_TOKENIZER_SEP_ID = "tokenizer.ggml.seperator_token_id"; -const char * LLM_KV_TOKENIZER_PAD_ID = "tokenizer.ggml.padding_token_id"; - -const char * LLM_TENSOR_TOKEN_EMBD = "token_embd"; -const char * LLM_TENSOR_OUTPUT_NORM = "output_norm"; -const char * LLM_TENSOR_OUTPUT = "output"; -const char * LLM_TENSOR_ATTN_NORM = "blk.%d.attn_norm"; -const char * LLM_TENSOR_ATTN_Q = "blk.%d.attn_q"; -const char * LLM_TENSOR_ATTN_K = "blk.%d.attn_k"; -const char * LLM_TENSOR_ATTN_V = "blk.%d.attn_v"; -const char * LLM_TENSOR_ATTN_OUT = "blk.%d.attn_output"; -const char * LLM_TENSOR_FFN_NORM = "blk.%d.ffn_norm"; -const char * LLM_TENSOR_FFN_GATE = "blk.%d.ffn_gate"; -const char * LLM_TENSOR_FFN_DOWN = "blk.%d.ffn_down"; -const char * LLM_TENSOR_FFN_UP = "blk.%d.ffn_up"; - -void print_params(struct my_llama_hparams * params) { +static const char * LLM_KV_TRAINING_TYPE_TRAIN_MODEL = "train_model"; +static const char * LLM_KV_TRAINING_TYPE = "training.type"; + +static const char * LLM_KV_GENERAL_ARCHITECTURE = "general.architecture"; +static const char * LLM_KV_GENERAL_FILE_TYPE = "general.file_type"; + +static const char * LLM_KV_CONTEXT_LENGTH = "%s.context_length"; +static const char * LLM_KV_EMBEDDING_LENGTH = "%s.embedding_length"; +static const char * LLM_KV_BLOCK_COUNT = "%s.block_count"; +static const char * LLM_KV_FEED_FORWARD_LENGTH = "%s.feed_forward_length"; +static const char * LLM_KV_ATTENTION_HEAD_COUNT = "%s.attention.head_count"; +static const char * LLM_KV_ATTENTION_LAYERNORM_RMS_EPS = "%s.attention.layer_norm_rms_epsilon"; +static const char * LLM_KV_ROPE_DIMENSION_COUNT = "%s.rope.dimension_count"; +static const char * LLM_KV_ROPE_FREQ_BASE = "%s.rope.freq_base"; // TODO load in llama.cpp +static const char * LLM_KV_ROPE_SCALE_LINEAR = "%s.rope.scale_linear"; + +static const char * LLM_KV_TOKENIZER_MODEL = "tokenizer.ggml.model"; +static const char * LLM_KV_TOKENIZER_LIST = "tokenizer.ggml.tokens"; +static const char * LLM_KV_TOKENIZER_TOKEN_TYPE = "tokenizer.ggml.token_type"; +static const char * LLM_KV_TOKENIZER_SCORES = "tokenizer.ggml.scores"; +static const char * LLM_KV_TOKENIZER_MERGES = "tokenizer.ggml.merges"; +static const char * LLM_KV_TOKENIZER_BOS_ID = "tokenizer.ggml.bos_token_id"; +static const char * LLM_KV_TOKENIZER_EOS_ID = "tokenizer.ggml.eos_token_id"; +static const char * LLM_KV_TOKENIZER_UNK_ID = "tokenizer.ggml.unknown_token_id"; +static const char * LLM_KV_TOKENIZER_SEP_ID = "tokenizer.ggml.seperator_token_id"; +static const char * LLM_KV_TOKENIZER_PAD_ID = "tokenizer.ggml.padding_token_id"; + +static const char * LLM_TENSOR_TOKEN_EMBD = "token_embd"; +static const char * LLM_TENSOR_OUTPUT_NORM = "output_norm"; +static const char * LLM_TENSOR_OUTPUT = "output"; +static const char * LLM_TENSOR_ATTN_NORM = "blk.%d.attn_norm"; +static const char * LLM_TENSOR_ATTN_Q = "blk.%d.attn_q"; +static const char * LLM_TENSOR_ATTN_K = "blk.%d.attn_k"; +static const char * LLM_TENSOR_ATTN_V = "blk.%d.attn_v"; +static const char * LLM_TENSOR_ATTN_OUT = "blk.%d.attn_output"; +static const char * LLM_TENSOR_FFN_NORM = "blk.%d.ffn_norm"; +static const char * LLM_TENSOR_FFN_GATE = "blk.%d.ffn_gate"; +static const char * LLM_TENSOR_FFN_DOWN = "blk.%d.ffn_down"; +static const char * LLM_TENSOR_FFN_UP = "blk.%d.ffn_up"; + +static void print_params(struct my_llama_hparams * params) { printf("%s: n_vocab: %d\n", __func__, params->n_vocab); printf("%s: n_ctx: %d\n", __func__, params->n_ctx); printf("%s: n_embd: %d\n", __func__, params->n_embd); @@ -296,7 +121,66 @@ void print_params(struct my_llama_hparams * params) { printf("%s: n_rot: %d\n", __func__, params->n_rot); } -void init_model(struct my_llama_model * model) { +static void set_param_model(struct my_llama_model * model) { + const auto& hparams = model->hparams; + + const uint32_t n_layer = hparams.n_layer; + + struct ggml_context* ctx = model->ctx; + + ggml_set_param(ctx, model->tok_embeddings); + ggml_set_param(ctx, model->norm); + ggml_set_param(ctx, model->output); + + for (uint32_t i = 0; i < n_layer; ++i) { + auto & layer = model->layers[i]; + + ggml_set_param(ctx, layer.attention_norm); + ggml_set_param(ctx, layer.wq); + ggml_set_param(ctx, layer.wk); + ggml_set_param(ctx, layer.wv); + ggml_set_param(ctx, layer.wo); + ggml_set_param(ctx, layer.ffn_norm); + ggml_set_param(ctx, layer.w1); + ggml_set_param(ctx, layer.w2); + ggml_set_param(ctx, layer.w3); + } +} + +static void alloc_model(struct ggml_allocr * alloc, struct my_llama_model * model) { + ggml_allocr_alloc(alloc, model->tok_embeddings); + ggml_allocr_alloc(alloc, model->norm); + ggml_allocr_alloc(alloc, model->output); + for (uint32_t i = 0; i < model->layers.size(); ++i) { + auto & layer = model->layers[i]; + ggml_allocr_alloc(alloc, layer.attention_norm); + ggml_allocr_alloc(alloc, layer.wq); + ggml_allocr_alloc(alloc, layer.wk); + ggml_allocr_alloc(alloc, layer.wv); + ggml_allocr_alloc(alloc, layer.wo); + ggml_allocr_alloc(alloc, layer.ffn_norm); + ggml_allocr_alloc(alloc, layer.w1); + ggml_allocr_alloc(alloc, layer.w2); + ggml_allocr_alloc(alloc, layer.w3); + } + ggml_allocr_alloc(alloc, model->tok_embeddings->grad); + ggml_allocr_alloc(alloc, model->norm->grad); + ggml_allocr_alloc(alloc, model->output->grad); + for (uint32_t i = 0; i < model->layers.size(); ++i) { + auto & layer = model->layers[i]; + ggml_allocr_alloc(alloc, layer.attention_norm->grad); + ggml_allocr_alloc(alloc, layer.wq->grad); + ggml_allocr_alloc(alloc, layer.wk->grad); + ggml_allocr_alloc(alloc, layer.wv->grad); + ggml_allocr_alloc(alloc, layer.wo->grad); + ggml_allocr_alloc(alloc, layer.ffn_norm->grad); + ggml_allocr_alloc(alloc, layer.w1->grad); + ggml_allocr_alloc(alloc, layer.w2->grad); + ggml_allocr_alloc(alloc, layer.w3->grad); + } +} + +static void init_model(struct my_llama_model * model) { const auto & hparams = model->hparams; const uint32_t n_embd = hparams.n_embd; @@ -304,11 +188,6 @@ void init_model(struct my_llama_model * model) { const uint32_t n_vocab = hparams.n_vocab; const uint32_t n_ff = hparams.n_ff; - struct ggml_context * ctx = model->ctx; - - model->train_its = 0; - model->train_samples = 0; - model->train_tokens = 0; std::vector tn_buf; tn_buf.resize(GGML_MAX_NAME); @@ -323,6 +202,15 @@ void init_model(struct my_llama_model * model) { return tn_buf.data(); }; + // context for model tensors without their data + struct ggml_init_params ctx_model_params; + ctx_model_params.mem_size = ggml_tensor_overhead()*2*(6 + n_layer*18); + ctx_model_params.mem_buffer = NULL; + ctx_model_params.no_alloc = true; + + struct ggml_context * ctx = ggml_init(ctx_model_params); + model->ctx = ctx; + model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); model->norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); model->output = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); @@ -361,288 +249,53 @@ void init_model(struct my_llama_model * model) { ggml_set_name(layer.w2, tni(LLM_TENSOR_FFN_DOWN, i)); ggml_set_name(layer.w3, tni(LLM_TENSOR_FFN_UP, i)); } -} -void set_param_model(struct my_llama_model * model) { - const auto& hparams = model->hparams; + set_param_model(model); - const uint32_t n_layer = hparams.n_layer; - - struct ggml_context* ctx = model->ctx; + // measure data size + struct ggml_allocr * alloc = NULL; + alloc = ggml_allocr_new_measure(tensor_alignment); + alloc_model(alloc, model); - ggml_set_param(ctx, model->tok_embeddings); - ggml_set_param(ctx, model->norm); - ggml_set_param(ctx, model->output); - - for (uint32_t i = 0; i < n_layer; ++i) { - auto & layer = model->layers[i]; - - ggml_set_param(ctx, layer.attention_norm); - ggml_set_param(ctx, layer.wq); - ggml_set_param(ctx, layer.wk); - ggml_set_param(ctx, layer.wv); - ggml_set_param(ctx, layer.wo); - ggml_set_param(ctx, layer.ffn_norm); - ggml_set_param(ctx, layer.w1); - ggml_set_param(ctx, layer.w2); - ggml_set_param(ctx, layer.w3); - } + // allocate data + model->data.resize(ggml_allocr_max_size(alloc) + tensor_alignment); + ggml_allocr_free(alloc); + alloc = ggml_allocr_new(model->data.data(), model->data.size(), tensor_alignment); + alloc_model(alloc, model); + ggml_allocr_free(alloc); } -void randomize_model(struct my_llama_model * model, int seed, float mean, float std, float min, float max) { +static void randomize_model(struct my_llama_model * model, int seed, float mean, float std, float min, float max) { const auto & hparams = model->hparams; const uint32_t n_layer = hparams.n_layer; - struct random_normal_distribution rnd; - init_random_normal_distribution(&rnd, seed, mean, std, min, max); + struct random_normal_distribution * rnd = init_random_normal_distribution(seed, mean, std, min, max); - randomize_tensor_normal(model->tok_embeddings, &rnd); - randomize_tensor_normal(model->norm, &rnd); - randomize_tensor_normal(model->output, &rnd); + randomize_tensor_normal(model->tok_embeddings, rnd); + randomize_tensor_normal(model->norm, rnd); + randomize_tensor_normal(model->output, rnd); for (uint32_t i = 0; i < n_layer; ++i) { auto & layer = model->layers[i]; - randomize_tensor_normal(layer.attention_norm, &rnd); + randomize_tensor_normal(layer.attention_norm, rnd); - randomize_tensor_normal(layer.wq, &rnd); - randomize_tensor_normal(layer.wk, &rnd); - randomize_tensor_normal(layer.wv, &rnd); - randomize_tensor_normal(layer.wo, &rnd); + randomize_tensor_normal(layer.wq, rnd); + randomize_tensor_normal(layer.wk, rnd); + randomize_tensor_normal(layer.wv, rnd); + randomize_tensor_normal(layer.wo, rnd); - randomize_tensor_normal(layer.ffn_norm, &rnd); + randomize_tensor_normal(layer.ffn_norm, rnd); - randomize_tensor_normal(layer.w1, &rnd); - randomize_tensor_normal(layer.w2, &rnd); - randomize_tensor_normal(layer.w3, &rnd); + randomize_tensor_normal(layer.w1, rnd); + randomize_tensor_normal(layer.w2, rnd); + randomize_tensor_normal(layer.w3, rnd); } -} - -void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) { - GGML_ASSERT(tensor->n_dims == 1); - GGML_ASSERT(tensor->ne[0] == ne0); -} - -void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) { - GGML_ASSERT(tensor->n_dims == 2); - GGML_ASSERT(tensor->ne[0] == ne0); - GGML_ASSERT(tensor->ne[1] == ne1); -} -void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) { - GGML_ASSERT(tensor->n_dims == 3); - GGML_ASSERT(tensor->ne[0] == ne0); - GGML_ASSERT(tensor->ne[1] == ne1); - GGML_ASSERT(tensor->ne[2] == ne2); + free_random_normal_distribution(rnd); } -void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) { - GGML_ASSERT(tensor->n_dims == 4); - GGML_ASSERT(tensor->ne[0] == ne0); - GGML_ASSERT(tensor->ne[1] == ne1); - GGML_ASSERT(tensor->ne[2] == ne2); - GGML_ASSERT(tensor->ne[3] == ne3); -} - -static size_t hash(void * p) { - return (size_t)p % GGML_GRAPH_HASHTABLE_SIZE; -} - -static size_t hash_find(void * hash_table[], void * p) { - size_t h = hash(p); - - // linear probing - size_t i = h; - while (hash_table[i] != NULL && hash_table[i] != p) { - i = (i + 1) % GGML_GRAPH_HASHTABLE_SIZE; - if (i == h) { - // visited all hash table entries -> not found - return GGML_GRAPH_HASHTABLE_SIZE; - } - } - return i; -} - -static bool hash_insert(void * hash_table[], void * p) { - //size_t h = hash(p); - size_t i = hash_find(hash_table, p); - - GGML_ASSERT(i < GGML_GRAPH_HASHTABLE_SIZE); // assert that not full - - if (hash_table[i] == p) { - return true; - } - - // insert - GGML_ASSERT(hash_table[i] == NULL); - hash_table[i] = p; - return false; -} - -static bool hash_contains(void * hash_table[], void * p) { - size_t i = hash_find(hash_table, p); - return (i < GGML_GRAPH_HASHTABLE_SIZE) && (hash_table[i] == p); -} - -struct hash_map { - void * keys[GGML_GRAPH_HASHTABLE_SIZE]; - void * vals[GGML_GRAPH_HASHTABLE_SIZE]; -}; -//static const size_t HASH_MAP_SIZE = sizeof(struct hash_map); - -struct hash_map * new_hash_map() { - struct hash_map * result = new struct hash_map; - for (int i=0; ikeys[i] = NULL; - result->vals[i] = NULL; - } - return result; -}; - -void free_hash_map(struct hash_map * map) { - delete map; -} - -static bool ggml_is_view(struct ggml_tensor * t) { - return t->op == GGML_OP_RESHAPE || t->op == GGML_OP_VIEW || t->op == GGML_OP_TRANSPOSE || - t->op == GGML_OP_PERMUTE || t->op == GGML_OP_CPY; -} - -static struct ggml_tensor * get_view_parent(struct ggml_tensor * t) { - switch (t->op) { - case GGML_OP_PERMUTE: - case GGML_OP_RESHAPE: - case GGML_OP_TRANSPOSE: - case GGML_OP_VIEW: - return t->src[0]; - case GGML_OP_CPY: - return t->src[1]; - default: - return NULL; - } -} - -static struct ggml_tensor * get_view_source(struct ggml_tensor * t) { - struct ggml_tensor * parent = t; - do { - parent = get_view_parent(parent); - } while (ggml_is_view(parent)); - return parent; -} - -struct ggml_tensor * ggml_recompute_graph_node( - struct ggml_context * ctx, - struct ggml_cgraph * graph, - struct hash_map * replacements, - struct ggml_tensor * node) { - - if (node == NULL) { - return NULL; - } - - if (node->is_param) { - return node; - } - - if (!hash_contains(graph->visited_hash_table, node)) { - return node; - } - - int count_children = 0; - for (int k = 0; k < GGML_MAX_SRC; ++k) { - if (node->src[k]) { - ++count_children; - } - } - - if (count_children == 0) { - return node; - } - - size_t i = hash_find(replacements->keys, node); - GGML_ASSERT(i < GGML_GRAPH_HASHTABLE_SIZE); // assert that not full - if (replacements->keys[i] == node) { - return (struct ggml_tensor *) replacements->vals[i]; - } - - struct ggml_tensor * clone = ggml_new_tensor(ctx, node->type, node->n_dims, node->ne); - - // insert clone into replacements - GGML_ASSERT(replacements->keys[i] == NULL); // assert that we don't overwrite - replacements->keys[i] = node; - replacements->vals[i] = clone; - - clone->op = node->op; - clone->grad = node->grad; - clone->is_param = node->is_param; - clone->extra = node->extra; - for (int k = 0; k < GGML_MAX_DIMS; ++k) { - clone->nb[k] = node->nb[k]; - } - for (int k = 0; k < GGML_MAX_SRC; ++k) { - clone->src[k] = ggml_recompute_graph_node(ctx, graph, replacements, node->src[k]); - } - if (ggml_is_view(clone)) { - struct ggml_tensor * source = get_view_source(clone); - GGML_ASSERT(source != NULL); - clone->data = source->data; - } - - GGML_ASSERT(sizeof(node->op_params) == sizeof(int32_t) * (GGML_MAX_OP_PARAMS / sizeof(int32_t))); - GGML_ASSERT(sizeof(node->name) == GGML_MAX_NAME); - memcpy(clone->op_params, node->op_params, sizeof(node->op_params)); - ggml_format_name(clone, "%s (clone)", ggml_get_name(node)); - - return clone; -}; - -void ggml_build_backward_gradient_checkpointing( - struct ggml_context * ctx, - struct ggml_cgraph * gf, - struct ggml_cgraph * gb, - struct ggml_cgraph * gb_tmp, - struct ggml_tensor * * checkpoints, - int n_checkpoints) { - *gb_tmp = *gf; - ggml_build_backward_expand(ctx, gf, gb_tmp, true); - - if (n_checkpoints <= 0) { - *gb = *gb_tmp; - return; - } - - struct hash_map * replacements = new_hash_map(); - - // insert checkpoints in replacements - for (int i = 0; i < n_checkpoints; ++i) { - size_t k = hash_find(replacements->keys, checkpoints[i]); - GGML_ASSERT(k < GGML_GRAPH_HASHTABLE_SIZE); // assert that not full - GGML_ASSERT(replacements->keys[k] == NULL); // assert that we don't overwrite - replacements->keys[k] = checkpoints[i]; - replacements->vals[k] = checkpoints[i]; - } - - *gb = *gf; - // rewrite gb_tmp->nodes[gf->n_nodes:gb_tmp->n_nodes], - // replacing references to gb_tmp->nodes[0:gf->n_nodes] ( == gf->nodes[0:gf->n_nodes]), - // by recomputing them from checkpoints - for (int i = gf->n_nodes; in_nodes; ++i) { - struct ggml_tensor * node = gb_tmp->nodes[i]; - for (int k = 0; k < GGML_MAX_SRC; ++k) { - // insert new tensors recomputing src, reusing already made replacements, - // remember replacements: remember new tensors with mapping from corresponding gf nodes - // recurse for input tensors, - // unless (i.e. terminating when) input tensors are checkpoints - node->src[k] = ggml_recompute_graph_node(ctx, gf, replacements, node->src[k]); - } - // insert rewritten backward node with replacements made into resulting backward graph gb - ggml_build_forward_expand(gb, node); - } - - free_hash_map(replacements); -} - -struct ggml_tensor * llama_build_train_graphs( +static struct ggml_tensor * llama_build_train_graphs( struct my_llama_model * model, struct ggml_allocr * alloc, struct ggml_context * ctx, @@ -679,15 +332,24 @@ struct ggml_tensor * llama_build_train_graphs( } }; + // KQ_pos - contains the positions + struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, N); + ggml_allocr_alloc(alloc, KQ_pos); + if (!ggml_allocr_is_measure(alloc)) { + int * data = (int *) KQ_pos->data; + for (int i = 0; i < N; ++i) { + data[i] = n_past + i; + } + } + // rope has so much parameters that we make a custom function for it - auto rope = [ctx, n_rot, n_ctx, rope_freq_base, rope_freq_scale] + auto rope = [ctx, KQ_pos, n_rot, n_ctx, rope_freq_base, rope_freq_scale] (struct ggml_tensor * t) -> struct ggml_tensor * { // not capturing these, to silcence warnings - const int n_past = 0; const int rope_mode = 0; return ggml_rope_custom(ctx, - t, n_past, n_rot, rope_mode, n_ctx, + t, KQ_pos, n_rot, rope_mode, n_ctx, rope_freq_base, rope_freq_scale); }; @@ -706,7 +368,7 @@ struct ggml_tensor * llama_build_train_graphs( checkpoints.push_back(t00); checkpoints.push_back(t01); - struct ggml_tensor * kv_scale; + struct ggml_tensor * kv_scale = NULL; if (!enable_flash_attn) { kv_scale = ggml_new_f32(ctx, 1.0f/sqrtf(float(n_embd)/n_head)); } @@ -787,21 +449,16 @@ struct ggml_tensor * llama_build_train_graphs( ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36, one)); // input gradient ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36->grad, one)); - GGML_ASSERT(t36->grad->data == NULL && !ggml_is_view(t36->grad)); + // KQ_pos + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, KQ_pos, one)); + GGML_ASSERT(t36->grad->data == NULL && t36->grad->view_src == NULL); + ggml_allocr_alloc(alloc, t36->grad); - // gradient tensors (will be set to zero by ggml_graph_reset) - // pinning these produces large unnecessary memory overhead, which will be resolved by PR 2632 - for (int i = 0; i < gf->n_nodes; ++i) { - if (!gf->grads[i]) continue; - if (gf->grads[i]->data == NULL && !ggml_is_view(gf->grads[i])) { - ggml_allocr_alloc(alloc, gf->grads[i]); - } - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, gf->grads[i], one)); - } + // allocating checkpoints in one block to reduce memory fragmentation // note: they will be freed in reverse order for (int i = 0; i < (int) checkpoints.size(); ++i) { - if (checkpoints[i]->data == NULL && !ggml_is_view(checkpoints[i])) { + if (checkpoints[i]->data == NULL && checkpoints[i]->view_src == NULL) { ggml_allocr_alloc(alloc, checkpoints[i]); } } @@ -826,196 +483,8 @@ struct ggml_tensor * llama_build_train_graphs( return t36; } -void set_f32_3d(struct ggml_tensor * tensor, int64_t i0, int64_t i1, int64_t i2, float value) { - float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2]); - *ptr = value; -} - -void set_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1, float value) { - float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); - *ptr = value; -} - -void set_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1, int32_t value) { - int32_t * ptr = (int32_t *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); - *ptr = value; -} - -float get_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) { - float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); - return *ptr; -} - -int32_t get_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) { - int32_t * ptr = (int32_t *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); - return *ptr; -} - -void print_row(struct ggml_tensor * probs, int i) { - for (int k = 0; k < probs->ne[0]; ++k) { - float p = get_f32_2d(probs, k, i); - printf(" %.2f", p); - } - printf("\n"); -} - -void print_matrix(struct ggml_tensor * probs) { - assert(probs->n_dims == 2); - for (int i = 0; i < probs->ne[1]; ++i) { - for (int k = 0; k < probs->ne[0]; ++k) { - float p = get_f32_2d(probs, k, i); - printf(" %.2f", p); - } - printf("\n"); - } -} - -void get_example_targets(struct llama_context * lctx, const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs) { - int n_tokens = tokens_input->ne[0]; - int n_vocab = target_logits->ne[0]; - - size_t sample = train_samples[example_id % n_train_samples]; - GGML_ASSERT(sample+n_tokens-1 < n_train_data); - - ggml_set_f32(target_logits, -1.0f/n_vocab); - ggml_set_f32(target_probs, 0.0f); - ggml_set_i32_1d(tokens_input, 0, llama_token_bos(lctx)); - for (int i=1; in_dims == 2); - GGML_ASSERT(target_logits->n_dims == 3); - GGML_ASSERT(target_probs->n_dims == 3); - int n_vocab = target_logits->ne[0]; - int n_tokens = tokens_input->ne[0]; - int n_batch = tokens_input->ne[1]; - GGML_ASSERT(n_tokens == target_logits->ne[1]); - GGML_ASSERT(n_batch == target_logits->ne[2]); - GGML_ASSERT(n_vocab == target_probs->ne[0]); - GGML_ASSERT(n_tokens == target_probs->ne[1]); - GGML_ASSERT(n_batch == target_probs->ne[2]); - - ggml_set_f32(target_logits, -1.0f/n_vocab); - ggml_set_f32(target_probs, 0.0f); - // printf("%s: example_id=%d n_batch=%d n_train_samples=%zu\n", __func__, example_id, n_batch, n_train_samples); - for (int k=0; k& out) { - FILE * fp = std::fopen(filename, "rb"); - if (fp == NULL) { - return 0; - } - -#ifdef _WIN32 - GGML_ASSERT(_fseeki64(fp, (__int64) 0, SEEK_END) == 0); -#else - GGML_ASSERT(std::fseek(fp, (long) 0, SEEK_END) == 0); -#endif - - size_t size = 0; -#ifdef _WIN32 - __int64 ret = _ftelli64(fp); - size = ret; -#else - long ret = std::ftell(fp); - size = ret; -#endif - -#ifdef _WIN32 - GGML_ASSERT(_fseeki64(fp, (__int64) 0, SEEK_SET) == 0); -#else - GGML_ASSERT(std::fseek(fp, (long) 0, SEEK_SET) == 0); -#endif - - std::vector buf; - buf.resize(size+1); - out.resize(size+1); - - if (std::fread(buf.data(), size, 1, fp) != 1) { - die("unexpectedly reached end of file"); - } - if (ferror(fp)) { - die_fmt("fread failed: %s", strerror(errno)); - } - - buf[size] = '\0'; - - int n_tokens = llama_tokenize(lctx, buf.data(), buf.size(), out.data(), out.size(), false); - if (n_tokens < 0) { - out.resize(-n_tokens); - n_tokens = llama_tokenize(lctx, buf.data(), buf.size(), out.data(), out.size(), false); - } - GGML_ASSERT(n_tokens >= 0); - out.resize(n_tokens); - - bool verify = false; - if (verify) { - const char * in = buf.data(); - const char * end = buf.data() + buf.size(); - for (int i = 0; i < (int) out.size(); ++i) { - std::string s = llama_token_to_piece(lctx, out[i]); - int len = s.length(); - if (in >= end) { - printf("%s: unexpected end of original text.\n", __func__); - break; - } - const bool matches = (strncmp(in, s.c_str(), len) == 0); - if (matches) { - in += len; - } else { - printf("%s: mismatch: expected '%s', but got '%s'\n", __func__, std::string(in, len).c_str(), s.c_str()); - } - } - } - - return n_tokens; -} - -void shuffle_ints(int * begin, int * end) { - if (end <= begin) return; - int max=begin[0]; - for (int i=1; i max) { - max = begin[i]; - } - } - std::vector vals; - vals.resize(max+1); - for (int i=0; i= 0) { \ @@ -1027,161 +496,9 @@ void shuffle_ints(int * begin, int * end) { } else if (req) { \ die_fmt("key not found in model: %s", skey.c_str()); \ } \ -} - - -bool are_same_layout(struct ggml_tensor * a, struct ggml_tensor * b) { - GGML_ASSERT(a != NULL); - GGML_ASSERT(b != NULL); - GGML_ASSERT(a->type == b->type); - GGML_ASSERT(ggml_are_same_shape(a, b)); - GGML_ASSERT(ggml_is_contiguous(a) && ggml_is_contiguous(b)); - - return true; -} +} while (0) -void read_tensor_by_name(struct ggml_tensor * dst, struct ggml_context * ctx, const char * name) { - if (dst == NULL) { - return; - } - struct ggml_tensor * t = ggml_get_tensor(ctx, name); - GGML_ASSERT(are_same_layout(dst, t)); - memcpy(dst->data, t->data, ggml_nbytes(t)); - - if (strlen(ggml_get_name(dst)) == 0) { - ggml_set_name(dst, name); - } -} - -void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct ggml_opt_context * opt) { - // NOTE: gguf_context must be initialized with f_ggml_ctx and no_alloc=false, otherwise tensor data can not be read - - uint32_t file_version; - GGUF_GET_KEY(fctx, file_version, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_FILE_VERSION); - GGML_ASSERT(file_version == 0); - - GGUF_GET_KEY(fctx, opt->params.past, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT); - GGUF_GET_KEY(fctx, opt->iter, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_ITERATION_COUNT); - GGUF_GET_KEY(fctx, opt->just_initialized, gguf_get_val_bool, GGUF_TYPE_BOOL, true, LLM_KV_OPTIMIZER_JUST_INITIALIZED); - - uint64_t nx; - GGUF_GET_KEY(fctx, nx, gguf_get_val_u64, GGUF_TYPE_UINT64, true, LLM_KV_OPTIMIZER_PARAMETER_COUNT); - opt->nx = (size_t) nx; - - // don't call ggml_opt_init until optimizer type and optimizer specific parameters are know - - std::string opt_type; - GGUF_GET_KEY(fctx, opt_type, gguf_get_val_str, GGUF_TYPE_STRING, true, LLM_KV_OPTIMIZER_TYPE); - if (opt_type == LLM_KV_OPTIMIZER_TYPE_ADAM) { - opt->params.type = GGML_OPT_ADAM; - - GGUF_GET_KEY(fctx, opt->adam.fx_best, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_ADAM_BEST_LOSS); - GGUF_GET_KEY(fctx, opt->adam.fx_prev, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS); - GGUF_GET_KEY(fctx, opt->adam.n_no_improvement, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT); - - GGML_ASSERT(opt->ctx != NULL); - ggml_opt_init(opt->ctx, opt, opt->params, opt->nx); - - read_tensor_by_name(opt->adam.m, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS); - read_tensor_by_name(opt->adam.v, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS); - read_tensor_by_name(opt->adam.pf, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES); - } else if (opt_type == LLM_KV_OPTIMIZER_TYPE_LBFGS) { - opt->params.type = GGML_OPT_LBFGS; - - GGUF_GET_KEY(fctx, opt->params.lbfgs.m, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT); - GGUF_GET_KEY(fctx, opt->lbfgs.fx_best, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS); - GGUF_GET_KEY(fctx, opt->lbfgs.step, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP); - GGUF_GET_KEY(fctx, opt->lbfgs.j, gguf_get_val_i32, GGUF_TYPE_INT32, true, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J); - GGUF_GET_KEY(fctx, opt->lbfgs.k, gguf_get_val_i32, GGUF_TYPE_INT32, true, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K); - GGUF_GET_KEY(fctx, opt->lbfgs.end, gguf_get_val_i32, GGUF_TYPE_INT32, true, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END); - GGUF_GET_KEY(fctx, opt->lbfgs.n_no_improvement, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT); - - GGML_ASSERT(opt->ctx != NULL); - ggml_opt_init(opt->ctx, opt, opt->params, opt->nx); - - read_tensor_by_name(opt->lbfgs.x, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS); - read_tensor_by_name(opt->lbfgs.xp, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS); - read_tensor_by_name(opt->lbfgs.g, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS); - read_tensor_by_name(opt->lbfgs.gp, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS); - read_tensor_by_name(opt->lbfgs.d, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION); - read_tensor_by_name(opt->lbfgs.pf, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES); - read_tensor_by_name(opt->lbfgs.lmal, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA); - read_tensor_by_name(opt->lbfgs.lmys, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS); - read_tensor_by_name(opt->lbfgs.lms, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S); - read_tensor_by_name(opt->lbfgs.lmy, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y); - } else { - die("unknown optimizer type"); - } -} - -void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context * opt) { - gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_FILE_VERSION, 0); - gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT, opt->params.past); - gguf_set_val_u64(fctx, LLM_KV_OPTIMIZER_PARAMETER_COUNT, (uint64_t) opt->nx); - gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_ITERATION_COUNT, opt->iter); - gguf_set_val_bool(fctx, LLM_KV_OPTIMIZER_JUST_INITIALIZED, opt->just_initialized); - - switch (opt->params.type) { - case GGML_OPT_ADAM: - { - gguf_set_val_str(fctx, LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_ADAM); - gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_ADAM_BEST_LOSS, opt->adam.fx_best); - gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS, opt->adam.fx_prev); - gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT, opt->adam.n_no_improvement); - - ggml_set_name(opt->adam.m, LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS); - ggml_set_name(opt->adam.v, LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS); - if (opt->adam.pf) { - ggml_set_name(opt->adam.pf, LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES); - } - - gguf_add_tensor(fctx, opt->adam.m); - gguf_add_tensor(fctx, opt->adam.v); - if (opt->adam.pf) { - gguf_add_tensor(fctx, opt->adam.pf); - } - } break; - case GGML_OPT_LBFGS: - { - gguf_set_val_str(fctx, LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_LBFGS); - gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT, opt->params.lbfgs.m); - gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS, opt->lbfgs.fx_best); - gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP, opt->lbfgs.step); - gguf_set_val_i32(fctx, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J, opt->lbfgs.j); - gguf_set_val_i32(fctx, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K, opt->lbfgs.k); - gguf_set_val_i32(fctx, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END, opt->lbfgs.end); - gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT, opt->lbfgs.n_no_improvement); - - ggml_set_name(opt->lbfgs.x, LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS); - ggml_set_name(opt->lbfgs.xp, LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS); - ggml_set_name(opt->lbfgs.g, LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS); - ggml_set_name(opt->lbfgs.gp, LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS); - ggml_set_name(opt->lbfgs.d, LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION); - if (opt->lbfgs.pf) { - ggml_set_name(opt->lbfgs.pf, LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES); - } - ggml_set_name(opt->lbfgs.lmal, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA); - ggml_set_name(opt->lbfgs.lmys, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS); - ggml_set_name(opt->lbfgs.lms, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S); - ggml_set_name(opt->lbfgs.lmy, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y); - - gguf_add_tensor(fctx, opt->lbfgs.x); - gguf_add_tensor(fctx, opt->lbfgs.xp); - gguf_add_tensor(fctx, opt->lbfgs.g); - gguf_add_tensor(fctx, opt->lbfgs.gp); - gguf_add_tensor(fctx, opt->lbfgs.d); - if (opt->lbfgs.pf) { - gguf_add_tensor(fctx, opt->lbfgs.pf); - } - gguf_add_tensor(fctx, opt->lbfgs.lmal); - gguf_add_tensor(fctx, opt->lbfgs.lmys); - gguf_add_tensor(fctx, opt->lbfgs.lms); - gguf_add_tensor(fctx, opt->lbfgs.lmy); - } break; - } -} - -void load_llama_model_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct my_llama_model * model) { +static void load_llama_model_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct my_llama_model * model) { // NOTE: gguf_context must be initialized with f_ggml_ctx and no_alloc=false, otherwise tensor data can not be read std::string arch; @@ -1233,26 +550,26 @@ void load_llama_model_gguf(struct gguf_context * fctx, struct ggml_context * f_g init_model(model); - read_tensor_by_name(model->tok_embeddings, f_ggml_ctx, tn(LLM_TENSOR_TOKEN_EMBD)); - read_tensor_by_name(model->norm, f_ggml_ctx, tn(LLM_TENSOR_OUTPUT_NORM)); - read_tensor_by_name(model->output, f_ggml_ctx, tn(LLM_TENSOR_OUTPUT)); + copy_tensor_by_name(model->tok_embeddings, f_ggml_ctx, tn(LLM_TENSOR_TOKEN_EMBD)); + copy_tensor_by_name(model->norm, f_ggml_ctx, tn(LLM_TENSOR_OUTPUT_NORM)); + copy_tensor_by_name(model->output, f_ggml_ctx, tn(LLM_TENSOR_OUTPUT)); for (uint32_t i = 0; i < model->hparams.n_layer; ++i) { auto & layer = model->layers[i]; - read_tensor_by_name(layer.attention_norm, f_ggml_ctx, tni(LLM_TENSOR_ATTN_NORM, i)); - read_tensor_by_name(layer.wq, f_ggml_ctx, tni(LLM_TENSOR_ATTN_Q, i)); - read_tensor_by_name(layer.wk, f_ggml_ctx, tni(LLM_TENSOR_ATTN_K, i)); - read_tensor_by_name(layer.wv, f_ggml_ctx, tni(LLM_TENSOR_ATTN_V, i)); - read_tensor_by_name(layer.wo, f_ggml_ctx, tni(LLM_TENSOR_ATTN_OUT, i)); - read_tensor_by_name(layer.ffn_norm, f_ggml_ctx, tni(LLM_TENSOR_FFN_NORM, i)); - read_tensor_by_name(layer.w1, f_ggml_ctx, tni(LLM_TENSOR_FFN_GATE, i)); - read_tensor_by_name(layer.w2, f_ggml_ctx, tni(LLM_TENSOR_FFN_DOWN, i)); - read_tensor_by_name(layer.w3, f_ggml_ctx, tni(LLM_TENSOR_FFN_UP, i)); + copy_tensor_by_name(layer.attention_norm, f_ggml_ctx, tni(LLM_TENSOR_ATTN_NORM, i)); + copy_tensor_by_name(layer.wq, f_ggml_ctx, tni(LLM_TENSOR_ATTN_Q, i)); + copy_tensor_by_name(layer.wk, f_ggml_ctx, tni(LLM_TENSOR_ATTN_K, i)); + copy_tensor_by_name(layer.wv, f_ggml_ctx, tni(LLM_TENSOR_ATTN_V, i)); + copy_tensor_by_name(layer.wo, f_ggml_ctx, tni(LLM_TENSOR_ATTN_OUT, i)); + copy_tensor_by_name(layer.ffn_norm, f_ggml_ctx, tni(LLM_TENSOR_FFN_NORM, i)); + copy_tensor_by_name(layer.w1, f_ggml_ctx, tni(LLM_TENSOR_FFN_GATE, i)); + copy_tensor_by_name(layer.w2, f_ggml_ctx, tni(LLM_TENSOR_FFN_DOWN, i)); + copy_tensor_by_name(layer.w3, f_ggml_ctx, tni(LLM_TENSOR_FFN_UP, i)); } } -void save_llama_model_gguf(struct gguf_context * fctx, const char * fn_vocab_model, struct my_llama_model * model) { +static void save_llama_model_gguf(struct gguf_context * fctx, const char * fn_vocab_model, struct my_llama_model * model) { const char * arch = "llama"; enum llama_ftype ftype = LLAMA_FTYPE_ALL_F32; @@ -1395,7 +712,8 @@ void save_llama_model_gguf(struct gguf_context * fctx, const char * fn_vocab_mod } } -void save_llama_model_file(const char * filename, const char * fn_vocab_model, struct my_llama_model * model) { +static void save_llama_model_file(const char * filename, const char * fn_vocab_model, struct my_llama_model * model) { + printf("%s: saving to %s\n", __func__, filename); struct gguf_context * fctx = gguf_init_empty(); save_llama_model_gguf(fctx, fn_vocab_model, model); @@ -1406,32 +724,24 @@ void save_llama_model_file(const char * filename, const char * fn_vocab_model, s gguf_free(fctx); } -void load_checkpoint_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct my_llama_model * model, struct ggml_opt_context * opt) { +static void load_checkpoint_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct my_llama_model * model, struct train_state * train) { load_llama_model_gguf(fctx, f_ggml_ctx, model); - - uint32_t file_version; - GGUF_GET_KEY(fctx, file_version, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_FILE_VERSION); - GGML_ASSERT(file_version == 0); - - GGUF_GET_KEY(fctx, model->train_its, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_ITERATION_COUNT); - GGUF_GET_KEY(fctx, model->train_samples, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_SAMPLE_COUNT); - GGUF_GET_KEY(fctx, model->train_tokens, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_TOKEN_COUNT); - - load_opt_context_gguf(fctx, f_ggml_ctx, opt); + if (load_train_state_gguf(fctx, f_ggml_ctx, train)) { + std::string train_type = LLM_KV_TRAINING_TYPE_TRAIN_MODEL; + GGUF_GET_KEY(fctx, train_type, gguf_get_val_str, GGUF_TYPE_STRING, false, LLM_KV_TRAINING_TYPE); + GGML_ASSERT(train_type == LLM_KV_TRAINING_TYPE_TRAIN_MODEL); + } else { + printf("%s: loaded llama model as checkpoint\n", __func__); + } } -void save_checkpoint_gguf(struct gguf_context * fctx, const char * fn_vocab_model, struct my_llama_model * model, struct ggml_opt_context * opt) { +static void save_checkpoint_gguf(struct gguf_context * fctx, const char * fn_vocab_model, struct my_llama_model * model, struct train_state * train) { + gguf_set_val_str(fctx, LLM_KV_TRAINING_TYPE, LLM_KV_TRAINING_TYPE_TRAIN_MODEL); save_llama_model_gguf(fctx, fn_vocab_model, model); - - gguf_set_val_u32(fctx, LLM_KV_TRAINING_FILE_VERSION, 0); - gguf_set_val_u32(fctx, LLM_KV_TRAINING_ITERATION_COUNT, model->train_its); - gguf_set_val_u32(fctx, LLM_KV_TRAINING_SAMPLE_COUNT, model->train_samples); - gguf_set_val_u32(fctx, LLM_KV_TRAINING_TOKEN_COUNT, model->train_tokens); - - save_opt_context_gguf(fctx, opt); + save_train_state_gguf(fctx, train); } -bool load_checkpoint_file(const char * filename, struct my_llama_model * model, struct ggml_opt_context * opt) { +static bool load_checkpoint_file(const char * filename, struct my_llama_model * model, struct train_state * train) { struct ggml_context * f_ggml_ctx; struct gguf_init_params params; params.no_alloc = false; @@ -1441,15 +751,16 @@ bool load_checkpoint_file(const char * filename, struct my_llama_model * model, return false; } - load_checkpoint_gguf(fctx, f_ggml_ctx, model, opt); + load_checkpoint_gguf(fctx, f_ggml_ctx, model, train); return true; } -void save_checkpoint_file(const char * filename, const char * fn_vocab_model, struct my_llama_model * model, struct ggml_opt_context * opt) { +static void save_checkpoint_file(const char * filename, const char * fn_vocab_model, struct my_llama_model * model, struct train_state * train) { + printf("%s: saving to %s\n", __func__, filename); struct gguf_context * fctx = gguf_init_empty(); - save_checkpoint_gguf(fctx, fn_vocab_model, model, opt); + save_checkpoint_gguf(fctx, fn_vocab_model, model, train); // write file const bool only_meta = false; @@ -1457,33 +768,13 @@ void save_checkpoint_file(const char * filename, const char * fn_vocab_model, st gguf_free(fctx); } -float cosine_decay(const int decay_steps, const float minimum, int step) { - if (step > decay_steps) { - step = decay_steps; - } - const float cosine_decay = 0.50f*(1.0f + cosf(3.14159265359f*step/decay_steps)); - const float decay = (1 - minimum)*cosine_decay + minimum; - return decay; -} - -float cosine_decay_restart(int decay_steps, const float minimum, int step, float restart_step_mult, bool enable_restart) { - if (enable_restart) { - while (step > decay_steps) { - step -= decay_steps; - decay_steps = (int) restart_step_mult * decay_steps; - } - } - return cosine_decay(decay_steps, minimum, step); -} - struct train_params { + struct train_params_common common; + const char * fn_vocab_model; - const char * fn_train_data; - const char * fn_checkpoint_in; - const char * fn_checkpoint_out; const char * fn_model_out; - uint32_t seed; + bool only_write_model; int n_ctx; int n_embd; @@ -1491,58 +782,18 @@ struct train_params { int n_layer; int n_ff; - int n_threads; - int n_batch; - int n_examples; - float f_norm_rms_eps; float rope_freq_base; float rope_freq_scale; - - int print_info_interval; - - bool samples_start_after_nl; - bool use_adam; - bool use_flash; - bool use_checkpointing; - bool use_alloc; - - // only adam - int warmup; - int cos_decay_steps; - float cos_decay_restart; - float cos_decay_min; - bool enable_restart; - - int opt_past; - float opt_delta; - int opt_max_no_improvement; - - int lbfgs_n_iter; - int adam_n_iter; - float adam_alpha; - float adam_min_alpha; - float adam_decay; - int adam_decay_min_ndim; - float adam_beta1; - float adam_beta2; - float adam_gclip; - float adam_eps_f; - - int mem_model_gb; - int mem_compute_gb; - int mem_compute0_gb; }; -struct train_params get_default_train_params() { +static struct train_params get_default_train_params() { struct train_params params; + params.common = get_default_train_params_common(); params.fn_vocab_model = "ggml-vic7b-uncensored-q4_0.bin"; - params.fn_train_data = "shakespeare.txt"; - params.fn_checkpoint_in = "checkpoint.bin"; - params.fn_checkpoint_out = "checkpoint.bin"; params.fn_model_out = "ggml-checkpoint-f32.bin"; - params.seed = -1; + params.only_write_model = false; params.n_ctx = 128; params.n_embd = 256; @@ -1550,62 +801,22 @@ struct train_params get_default_train_params() { params.n_layer = 16; params.n_ff = 768; - params.n_threads = 6; - params.n_batch = 8; - params.n_examples = 1; - - params.f_norm_rms_eps = 1e-5; + params.f_norm_rms_eps = 1e-5f; params.rope_freq_base = 10000.0f; params.rope_freq_scale = 1.0f; - params.print_info_interval = 1; - - params.samples_start_after_nl = false; - params.use_adam = true; - params.use_flash = true; - params.use_checkpointing = true; - params.use_alloc = true; - - params.opt_past = 0; - params.opt_delta = 1e-5f; - params.opt_max_no_improvement = 0; - - // only adam - params.warmup = 100; - params.cos_decay_steps = 1000; - params.cos_decay_restart = 1.1f; - params.cos_decay_min = 0.1f; - params.enable_restart = false; - - params.lbfgs_n_iter = 256; - params.adam_n_iter = 256; - params.adam_alpha = 1e-3f; - params.adam_min_alpha = 0; - params.adam_decay = 1e-1f; - params.adam_decay_min_ndim = 2; - params.adam_beta1 = 0.9f; - params.adam_beta2 = 0.999f; - params.adam_gclip = 1.0f; - params.adam_eps_f = 0.0f; - - params.mem_model_gb = 2; - params.mem_compute_gb = 24; - params.mem_compute0_gb = 8; return params; } -void train_print_usage(int /*argc*/, char ** argv, const struct train_params * params) { +static void train_print_usage(int argc, char ** argv, const struct train_params * params) { fprintf(stderr, "usage: %s [options]\n", argv[0]); fprintf(stderr, "\n"); fprintf(stderr, "options:\n"); fprintf(stderr, " -h, --help show this help message and exit\n"); + fprintf(stderr, " --vocab-model FNAME model path from which to load vocab (default '%s')\n", params->fn_vocab_model); - fprintf(stderr, " --train-data FNAME path from which to load training data (default '%s')\n", params->fn_train_data); - fprintf(stderr, " --checkpoint-in FNAME path from which to load training checkpoint (default '%s')\n", params->fn_checkpoint_in); - fprintf(stderr, " --checkpoint-out FNAME path to save training checkpoint (default '%s')\n", params->fn_checkpoint_out); fprintf(stderr, " --model-out FNAME path to save ggml model (default '%s')\n", params->fn_model_out); - fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1, use random seed for -1)\n"); - fprintf(stderr, " -c N, --ctx N Context size used during training (default %d)\n", params->n_ctx); + fprintf(stderr, " --only-write-model only save llama model, don't do any training. use this if you only want to convert a checkpoint to a model.\n"); fprintf(stderr, " --embd N Embedding size used for new models (default %d)\n", params->n_embd); fprintf(stderr, " --ff N Feedforward size used for new models. (default %d)\n", params->n_ff); fprintf(stderr, " --head N Number of heads for new models (default %d)\n", params->n_head); @@ -1613,45 +824,11 @@ void train_print_usage(int /*argc*/, char ** argv, const struct train_params * p fprintf(stderr, " --norm-rms-eps F RMS-Norm epsilon value (default %f)\n", params->f_norm_rms_eps); fprintf(stderr, " --rope-freq-base F Frequency base for ROPE (default %f)\n", params->rope_freq_base); fprintf(stderr, " --rope-freq-scale F Frequency scale for ROPE (default %f)\n", params->rope_freq_scale); - fprintf(stderr, " -t N, --threads N Number of threads (default %d)\n", params->n_threads); - fprintf(stderr, " -b N, --batch N Parallel batch size (default %d)\n", params->n_batch); - fprintf(stderr, " -n N, --examples N Number of examples to train (default %d)\n", params->n_examples); - fprintf(stderr, " --print-info-interval N Print infos during training each N examples (default %d)\n", params->print_info_interval); - fprintf(stderr, " --samples-after-nl Training samples start after newlines. (default %s)\n", params->samples_start_after_nl ? "on" : "off"); - fprintf(stderr, " --use-lbfgs Use LBFGS optimizer instead of default Adam\n"); - fprintf(stderr, " --use-adam Use Adam optimizer (default)\n"); - fprintf(stderr, " --no-flash Don't use flash attention \n"); - fprintf(stderr, " --use-flash Use flash attention (default)\n"); - fprintf(stderr, " --no-checkpointing Don't use gradient checkpointing\n"); - fprintf(stderr, " --use-checkpointing Use gradient checkpointing (default)\n"); - fprintf(stderr, " --no-alloc Don't use allocator\n"); - fprintf(stderr, " --use-alloc Use allocator (default)\n"); - fprintf(stderr, " --warmup N Only for Adam optimizer. Number of warmup steps (default %d)\n", params->warmup); - fprintf(stderr, " --cos-decay-steps N Only for Adam optimizer. Number of cosine decay steps (default %d)\n", params->cos_decay_steps); - fprintf(stderr, " --cos-decay-restart N Only for Adam optimizer. Increase of cosine decay steps after restart (default %f)\n", params->cos_decay_restart); - fprintf(stderr, " --cos-decay-min N Only for Adam optimizer. Cosine decay minimum (default %f)\n", params->cos_decay_min); - fprintf(stderr, " --enable-restart N Only for Adam optimizer. Enable restarts of cos-decay %s\n", params->enable_restart ? "(default)" : ""); - fprintf(stderr, " --disable-restart N Only for Adam optimizer. Disable restarts of cos-decay %s\n", !params->enable_restart ? "(default)" : ""); - fprintf(stderr, " --opt-past N Number of optimization iterations to track for delta convergence test. Disabled when zero. (default %d)\n", params->opt_past); - fprintf(stderr, " --opt-delta N Maximum delta for delta convergence test. Disabled when <= zero. (default %f)\n", params->opt_delta); - fprintf(stderr, " --opt-max-no-improvement N Maximum number of optimization iterations with no improvement. Disabled when <= zero. (default %d)\n", params->opt_max_no_improvement); - fprintf(stderr, " --adam-epsf N AdamW epsilon for convergence test. Disabled when <= zero. (default %f)\n", params->adam_eps_f); - fprintf(stderr, " --adam-iter N Maximum number of Adam optimization iterations for each batch (default %d)\n", params->adam_n_iter); - fprintf(stderr, " --adam-alpha N Adam learning rate alpha (default %f)\n", params->adam_alpha); - fprintf(stderr, " --adam-min-alpha N Adam minimum learning rate alpha - including warmup phase (default %f)\n", params->adam_min_alpha); - fprintf(stderr, " --adam-decay N AdamW weight decay. Values greater zero enable AdamW instead of regular Adam. (default %f)\n", params->adam_decay); - fprintf(stderr, " --adam-decay-min-ndim N Minimum number of tensor dimensions to apply AdamW weight decay. Weight decay is not applied to tensors with less n_dims. (default %d)\n", params->adam_decay_min_ndim); - fprintf(stderr, " --adam-beta1 N AdamW beta1 in interval [0,1). How much to smooth the first moment of gradients. (default %f)\n", params->adam_beta1); - fprintf(stderr, " --adam-beta2 N AdamW beta2 in interval [0,1). How much to smooth the second moment of gradients. (default %f)\n", params->adam_beta2); - fprintf(stderr, " --adam-gclip N AdamW gradient clipping. Disabled when zero. (default %f)\n", params->adam_gclip); - fprintf(stderr, " --lbfgs-iter N Maximum number of LBFGS optimization iterations for each batch (default %d)\n", params->lbfgs_n_iter); - fprintf(stderr, " --mem-model N Memory to allocate for model and cache in gigabytes. (default %d)\n", params->mem_model_gb); - fprintf(stderr, " --mem-compute N Memory to allocate for compute in gigabytes. (default %d)\n", params->mem_compute_gb); - fprintf(stderr, " --mem-compute0 N Memory to allocate for automatic memory allocator in gigabytes. (default %d)\n", params->mem_compute0_gb); - fprintf(stderr, "\n"); + + print_common_train_usage(argc, argv, ¶ms->common); } -bool train_params_parse(int argc, char ** argv, struct train_params * params) { +static bool train_params_parse(int argc, char ** argv, struct train_params * params) { bool invalid_param = false; std::string arg; struct train_params default_params = get_default_train_params(); @@ -1663,48 +840,27 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) { std::replace(arg.begin(), arg.end(), '_', '-'); } - if (arg == "--vocab-model") { - if (++i >= argc) { - invalid_param = true; + if (consume_common_train_arg(argc, argv, &i, ¶ms->common, &invalid_param)) { + if (invalid_param) { break; + } else if (params->common.print_usage) { + train_print_usage(argc, argv, &default_params); + exit(0); } - params->fn_vocab_model = argv[i]; - } else if (arg == "--train-data") { + } else if (arg == "--vocab-model") { if (++i >= argc) { invalid_param = true; break; } - params->fn_train_data = argv[i]; - } else if (arg == "--checkpoint-in") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->fn_checkpoint_in = argv[i]; - } else if (arg == "--checkpoint-out") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->fn_checkpoint_out = argv[i]; + params->fn_vocab_model = argv[i]; } else if (arg == "--model-out") { if (++i >= argc) { invalid_param = true; break; } params->fn_model_out = argv[i]; - } else if (arg == "-s" || arg == "--seed") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->seed = std::stoi(argv[i]); - } else if (arg == "-c" || arg == "--ctx") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->n_ctx = std::stoi(argv[i]); + } else if (arg == "--only-write-model") { + params->only_write_model = true; } else if (arg == "--embd") { if (++i >= argc) { invalid_param = true; @@ -1747,175 +903,6 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) { break; } params->rope_freq_scale = std::stof(argv[i]); - } else if (arg == "-t" || arg == "--threads") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->n_threads = std::stoi(argv[i]); - } else if (arg == "-b" || arg == "--batch") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->n_batch = std::stoi(argv[i]); - } else if (arg == "-n" || arg == "--examples") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->n_examples = std::stoi(argv[i]); - } else if (arg == "--print-info-interval") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->print_info_interval = std::stoi(argv[i]); - } else if (arg == "--samples-after-nl") { - params->samples_start_after_nl = true; - } else if (arg == "--use-lbfgs") { - params->use_adam = false; - } else if (arg == "--use-adam") { - params->use_adam = true; - } else if (arg == "--no-flash") { - params->use_flash = false; - } else if (arg == "--use-flash") { - params->use_flash = true; - } else if (arg == "--no-checkpointing") { - params->use_checkpointing = false; - } else if (arg == "--use-checkpointing") { - params->use_checkpointing = true; - } else if (arg == "--no-alloc") { - params->use_alloc = false; - } else if (arg == "--use-alloc") { - params->use_alloc = true; - } else if (arg == "--warmup") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->warmup = std::stoi(argv[i]); - } else if (arg == "--cos-decay-steps") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->cos_decay_steps = std::stof(argv[i]); - } else if (arg == "--cos-decay-restart") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->cos_decay_restart = std::stof(argv[i]); - } else if (arg == "--cos-decay-min") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->cos_decay_min = std::stof(argv[i]); - } else if (arg == "--enable-restart") { - params->enable_restart = true; - } else if (arg == "--disable-restart") { - params->enable_restart = false; - } else if (arg == "--opt-past") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->opt_past = std::stoi(argv[i]); - } else if (arg == "--opt-delta") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->opt_delta = std::stof(argv[i]); - } else if (arg == "--opt-max-no-improvement") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->opt_max_no_improvement = std::stoi(argv[i]); - } else if (arg == "--adam-epsf") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->adam_eps_f = std::stof(argv[i]); - } else if (arg == "--adam-iter") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->adam_n_iter = std::stoi(argv[i]); - } else if (arg == "--adam-alpha") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->adam_alpha = std::stof(argv[i]); - } else if (arg == "--adam-min-alpha") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->adam_min_alpha = std::stof(argv[i]); - } else if (arg == "--adam-decay") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->adam_decay = std::stof(argv[i]); - } else if (arg == "--adam-decay-min-ndim") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->adam_decay_min_ndim = std::stoi(argv[i]); - } else if (arg == "--adam-beta1") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->adam_beta1 = std::stof(argv[i]); - } else if (arg == "--adam-beta2") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->adam_beta2 = std::stof(argv[i]); - } else if (arg == "--adam-gclip") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->adam_gclip = std::stof(argv[i]); - } else if (arg == "--lbfgs-iter") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->lbfgs_n_iter = std::stoi(argv[i]); - } else if (arg == "--mem-model") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->mem_model_gb = std::stoi(argv[i]); - } else if (arg == "--mem-compute") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->mem_compute_gb = std::stoi(argv[i]); - } else if (arg == "--mem-compute0") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->mem_compute0_gb = std::stoi(argv[i]); - } else if (arg == "-h" || arg == "--help") { - train_print_usage(argc, argv, &default_params); - exit(0); } else { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); train_print_usage(argc, argv, &default_params); @@ -1927,65 +914,54 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) { train_print_usage(argc, argv, &default_params); exit(1); } + finish_processing_train_args(¶ms->common); return true; } -struct opt_callback_data { - struct train_params * params; - struct ggml_opt_context * opt; - struct llama_context * lctx; - llama_token * tokens_data; - size_t tokens_size; - int * samples_data; - size_t samples_size; - int shuffle_countdown; - struct ggml_tensor * tokens_input; - struct ggml_tensor * target_logits; - struct ggml_tensor * target_probs; +struct save_train_files_data { + const char * fn_checkpoint_out; + const char * fn_model_out; + const char * fn_vocab_model; + const char * pattern_fn_it; + const char * fn_latest; + struct my_llama_model * model; }; -void opt_callback(void * vdata, float * sched) { - struct opt_callback_data * data = (struct opt_callback_data *) vdata; - struct train_params * params = data->params; - struct ggml_opt_context * opt = data->opt; - int n_batch = params->n_batch; - - *sched = (opt->iter < params->warmup) - ? (float) opt->iter / (float) params->warmup - : cosine_decay_restart( - params->cos_decay_steps, - params->cos_decay_min, - opt->iter - params->warmup, - params->cos_decay_restart, - params->enable_restart); - float min_sched = params->adam_min_alpha / params->adam_alpha; - *sched = min_sched + *sched * (1.0f - min_sched); - - int impr_plot = std::isnan(opt->loss_after) ? 0 : -std::lround(1 + (opt->loss_before - opt->loss_after) * 10.0f); - printf("%s: iter=%*d, sched=%f loss0=%f loss=%f | improvement: %*d>\n", __func__, 6, opt->iter, *sched, opt->loss_before, opt->loss_after, impr_plot, (int)0); - - if (data->shuffle_countdown < n_batch) { - printf("%s: reshuffle samples\n", __func__); - shuffle_ints(data->samples_data, data->samples_data + data->samples_size); - for (int i = 0; i < (int) data->samples_size; ++i) { - GGML_ASSERT(data->samples_data[i]+params->n_ctx-1 < (int) data->tokens_size); - } - data->shuffle_countdown = data->samples_size; +static void save_train_files(void * vdata, struct train_state * train) { + struct save_train_files_data * data = (struct save_train_files_data *) vdata; + int64_t iter = train->opt->iter; + + if (strlen(data->fn_checkpoint_out) > 0) { + save_checkpoint_file(get_train_filename(data->fn_checkpoint_out, data->pattern_fn_it, data->fn_latest, iter).c_str(), data->fn_vocab_model, data->model, train); + save_checkpoint_file(get_train_filename(data->fn_checkpoint_out, data->pattern_fn_it, data->fn_latest, -1 ).c_str(), data->fn_vocab_model, data->model, train); + } + if (strlen(data->fn_model_out) > 0) { + save_llama_model_file(get_train_filename(data->fn_model_out, data->pattern_fn_it, data->fn_latest, iter).c_str(), data->fn_vocab_model, data->model); + save_llama_model_file(get_train_filename(data->fn_model_out, data->pattern_fn_it, data->fn_latest, -1 ).c_str(), data->fn_vocab_model, data->model); + } +} - get_example_targets_batch( - data->lctx, - data->samples_data, - data->samples_size, - data->tokens_data, - data->tokens_size, - opt->iter, - data->tokens_input, - data->target_logits, - data->target_probs); - - data->shuffle_countdown -= n_batch; +static int64_t get_parameter_count(struct my_llama_model* model) { + int64_t nx = 0; + nx += ggml_nelements(model->tok_embeddings); + nx += ggml_nelements(model->norm); + nx += ggml_nelements(model->output); + + for (uint32_t i = 0; i < model->layers.size(); ++i) { + auto & layer = model->layers[i]; + nx += ggml_nelements(layer.attention_norm); + nx += ggml_nelements(layer.wq); + nx += ggml_nelements(layer.wk); + nx += ggml_nelements(layer.wv); + nx += ggml_nelements(layer.wo); + nx += ggml_nelements(layer.ffn_norm); + nx += ggml_nelements(layer.w1); + nx += ggml_nelements(layer.w2); + nx += ggml_nelements(layer.w3); + } + return nx; } int main(int argc, char ** argv) { @@ -1995,28 +971,23 @@ int main(int argc, char ** argv) { return 1; } - if (params.seed == LLAMA_DEFAULT_SEED) { - params.seed = time(NULL); + if (params.common.seed == LLAMA_DEFAULT_SEED) { + params.common.seed = time(NULL); } - printf("%s: seed: %u\n", __func__, params.seed); - srand(params.seed); + printf("%s: seed: %u\n", __func__, params.common.seed); + srand(params.common.seed); - struct llama_context_params llama_params = llama_context_default_params(); - llama_params.vocab_only = true; + struct llama_model_params mparams = llama_model_default_params(); + mparams.vocab_only = true; - struct llama_model * lmodel = llama_load_model_from_file(params.fn_vocab_model, llama_params); - struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_params); + struct llama_context_params cparams = llama_context_default_params(); - printf("%s: tokenize training data\n", __func__); - std::vector train_tokens; - if (tokenize_file(lctx, params.fn_train_data, train_tokens) < 0) { - fprintf(stderr, "%s: failed to tokenize file '%s'\n", __func__, params.fn_train_data); - } - printf("%s: number of training tokens: %d\n", __func__, (int) train_tokens.size()); + struct llama_model * lmodel = llama_load_model_from_file(params.fn_vocab_model, mparams); + struct llama_context * lctx = llama_new_context_with_model(lmodel, cparams); struct my_llama_model model; - model.hparams.n_vocab = llama_n_vocab(lctx); - model.hparams.n_ctx = params.n_ctx; + model.hparams.n_vocab = llama_n_vocab(lmodel); + model.hparams.n_ctx = params.common.n_ctx; model.hparams.n_embd = params.n_embd; model.hparams.n_head = params.n_head; model.hparams.n_layer = params.n_layer; @@ -2027,243 +998,311 @@ int main(int argc, char ** argv) { model.hparams.rope_freq_base = params.rope_freq_base; model.hparams.rope_freq_scale = params.rope_freq_scale; - print_params(&model.hparams); - - std::vector token_noccurs; - std::vector token_notavail; - token_noccurs.resize(model.hparams.n_vocab, 0); - token_notavail.resize(model.hparams.n_vocab, true); - for (int i = 0; i < (int) train_tokens.size(); ++i) { - ++token_noccurs[train_tokens[i]]; - token_notavail[train_tokens[i]] = false; - } - - std::vector token_freq; - token_freq.resize(model.hparams.n_vocab, 0); - int n_unique_tokens = 0; - for (int i = 0; i < (int) token_noccurs.size(); ++i) { - token_freq[i] = (float) token_noccurs[i] / (float) train_tokens.size(); - n_unique_tokens += (token_noccurs[i] > 0) ? 1 : 0; - } - printf("%s: number of unique tokens: %d\n", __func__, n_unique_tokens); + struct train_state * train = init_train_state(); + struct ggml_opt_context * opt = train->opt; + + // set opt params from command line + opt->params = ggml_opt_default_params(GGML_OPT_ADAM); + opt->params.print_forward_graph = false; + opt->params.print_backward_graph = false; + opt->params.n_threads = params.common.n_threads; + opt->params.past = params.common.opt_past; + opt->params.delta = params.common.opt_delta; + opt->params.max_no_improvement = params.common.opt_max_no_improvement; + opt->params.n_gradient_accumulation = params.common.n_gradient_accumulation; + opt->params.adam.n_iter = params.common.adam_n_iter; + opt->params.adam.sched = 1.0f; + opt->params.adam.alpha = params.common.adam_alpha; + opt->params.adam.decay = params.common.adam_decay; + opt->params.adam.decay_min_ndim = params.common.adam_decay_min_ndim; + opt->params.adam.beta1 = params.common.adam_beta1; + opt->params.adam.beta2 = params.common.adam_beta2; + opt->params.adam.gclip = params.common.adam_gclip; + opt->params.adam.eps_f = params.common.adam_eps_f; - struct ggml_init_params lcparams; - lcparams.mem_size = 1024ll*1024ll*1024ll*((size_t) params.mem_model_gb); - lcparams.mem_buffer = NULL; - lcparams.no_alloc = false; + printf("%s: init model\n", __func__); + bool existed = load_checkpoint_file(params.common.fn_checkpoint_in, &model, train); + if (existed) { + // overwrite last n_ctx with user provided n_ctx + if (params.common.custom_n_ctx) { + model.hparams.n_ctx = params.common.n_ctx; + } - model.ctx = ggml_init(lcparams); + const bool opt_past_changed = opt->params.past != params.common.opt_past; - int n_tokens = model.hparams.n_ctx; - int n_vocab = model.hparams.n_vocab; - int n_batch = params.n_batch; - - struct ggml_opt_context * opt = (struct ggml_opt_context *) alloca(sizeof(struct ggml_opt_context)); - memset(opt, 0, sizeof(struct ggml_opt_context)); - - struct ggml_opt_params opt_params_adam = ggml_opt_default_params(GGML_OPT_ADAM); - struct ggml_opt_params opt_params_lbfgs = ggml_opt_default_params(GGML_OPT_LBFGS); - opt_params_adam.print_forward_graph = false; - opt_params_adam.print_backward_graph = false; - opt_params_adam.n_threads = params.n_threads; - opt_params_adam.past = params.opt_past; - opt_params_adam.delta = params.opt_delta; - opt_params_adam.max_no_improvement = params.opt_max_no_improvement; - opt_params_adam.adam.n_iter = params.adam_n_iter; - opt_params_adam.adam.sched = 1.0f; - opt_params_adam.adam.alpha = params.adam_alpha; - opt_params_adam.adam.decay = params.adam_decay; - opt_params_adam.adam.decay_min_ndim = params.adam_decay_min_ndim; - opt_params_adam.adam.beta1 = params.adam_beta1; - opt_params_adam.adam.beta2 = params.adam_beta2; - opt_params_adam.adam.gclip = params.adam_gclip; - opt_params_adam.adam.eps_f = params.adam_eps_f; - - opt_params_lbfgs.print_forward_graph = false; - opt_params_lbfgs.print_backward_graph = false; - opt_params_lbfgs.n_threads = params.n_threads; - opt_params_adam.past = params.opt_past; - opt_params_adam.delta = params.opt_delta; - opt_params_adam.max_no_improvement = params.opt_max_no_improvement; - opt_params_lbfgs.lbfgs.n_iter = params.lbfgs_n_iter; - - opt->ctx = model.ctx; - opt->params = params.use_adam ? opt_params_adam : opt_params_lbfgs; - - printf("%s: init model\n", __func__); - bool existed = load_checkpoint_file(params.fn_checkpoint_in, &model, opt); - if (!existed) { + if (opt_past_changed) { + die("Optimizer parameter '--opt-past N' differs from checkpoint file. To use different value train from scratch with empty input checkpoint, e.g --checkpoint-in ''. Aborting"); + // need to discard previous optimizer past function value statistics and opt_init with new shapes + // TODO + } + } else { init_model(&model); + randomize_model(&model, params.common.seed, 0.0f, 1.0f, -1.0f, +1.0f); + if (!params.only_write_model) { + ggml_opt_init(opt->ctx, opt, opt->params, get_parameter_count(&model)); + } } - set_param_model(&model); - - opt->params = params.use_adam ? opt_params_adam : opt_params_lbfgs; + opt->iter = train->train_its; - opt->iter = model.train_its; - printf("%s: opt iter %d\n", __func__, opt->iter); - - bool from_scratch = !existed; - if (from_scratch) { - randomize_model(&model, params.seed, 0.0f, 1.0f, -1.0f, +1.0f); + print_params(&model.hparams); + printf("%s: total train_iterations %llu\n", __func__, (long long unsigned) train->train_its); + printf("%s: seen train_samples %llu\n", __func__, (long long unsigned) train->train_samples); + printf("%s: seen train_tokens %llu\n", __func__, (long long unsigned) train->train_tokens); + printf("%s: completed train_epochs %llu\n", __func__, (long long unsigned) train->train_epochs); + printf("%s: model_size = %zu bytes (%.1f MB)\n", __func__, (ggml_used_mem(model.ctx) + model.data.size()), (float) (ggml_used_mem(model.ctx) + model.data.size()) / (1024.0f*1024.0f)); + + if (params.only_write_model) { + save_train_files_data save_data; + save_data.fn_checkpoint_out = ""; + save_data.fn_model_out = params.fn_model_out; + save_data.fn_vocab_model = params.fn_vocab_model; + save_data.pattern_fn_it = params.common.pattern_fn_it; + save_data.fn_latest = params.common.fn_latest; + save_data.model = &model; + + save_train_files(&save_data, train); + + free_train_state(train); + ggml_free(model.ctx); + llama_free(lctx); + llama_free_model(lmodel); + return 0; } - printf("used_mem model: %zu bytes\n", ggml_used_mem(model.ctx)); - // ggml_print_tensor_objects(model.ctx); + printf("%s: opt_size = %zu bytes (%.1f MB)\n", __func__, ggml_get_mem_size(opt->ctx), (float) ggml_get_mem_size(opt->ctx) / (1024.0f*1024.0f)); + printf("%s: opt iter %d\n", __func__, opt->iter); - // TODO: use std::vector intead of "new" - size_t compute_size = 1024ll*1024ll*1024ll*((size_t) params.mem_compute_gb); - uint8_t * compute_addr = new uint8_t[compute_size]; + int n_tokens = model.hparams.n_ctx; + int n_vocab = model.hparams.n_vocab; + int n_batch = params.common.n_batch; - size_t size_buf_0 = 1024ll*1024ll*1024ll*((size_t) params.mem_compute0_gb); - uint8_t * compute_buf_0 = new uint8_t[size_buf_0]; + std::vector mem_input_data; + std::vector mem_compute_data; ggml_allocr * alloc = NULL; - if (params.use_alloc) { - static const size_t tensor_alignment = 32; - alloc = ggml_allocr_new(compute_buf_0, size_buf_0, tensor_alignment); - } - - GGML_ASSERT(n_tokens < (int) train_tokens.size()); - std::vector train_samples; - train_samples.push_back(0); - for (int i = 1; i < (int) train_tokens.size() - n_tokens; ++i) { - if (!params.samples_start_after_nl || (train_tokens[i-1] == llama_token_nl(lctx))) { - train_samples.push_back(i); - } - } - shuffle_ints(train_samples.data(), train_samples.data() + train_samples.size()); - for (int i = 0; i < (int) train_samples.size(); ++i) { - GGML_ASSERT(train_samples[i]+n_tokens-1 < (int) train_tokens.size()); - } - - printf("%s: begin training\n", __func__); - - struct opt_callback_data opt_cb_data; - opt_cb_data.params = ¶ms; - opt_cb_data.opt = opt; - opt_cb_data.lctx = lctx; - opt_cb_data.tokens_data = train_tokens.data(); - opt_cb_data.tokens_size = train_tokens.size(); - opt_cb_data.samples_data = train_samples.data(); - opt_cb_data.samples_size = train_samples.size(); - opt_cb_data.shuffle_countdown = train_samples.size(); - opt_cb_data.tokens_input = NULL; - opt_cb_data.target_logits = NULL; - opt_cb_data.target_probs = NULL; - - int64_t t0 = ggml_time_ms(); - - for (int ex = 0; ex < params.n_examples; ++ex) { - if (ex*n_batch >= (int) train_samples.size()) { - shuffle_ints(train_samples.data(), train_samples.data() + train_samples.size()); - for (int i = 0; i < (int) train_samples.size(); ++i) { - GGML_ASSERT(train_samples[i]+n_tokens-1 < (int) train_tokens.size()); - } - } - - struct ggml_init_params cparams = { - compute_size, // mem_size - compute_addr, // mem_buffer - false, // no_alloc - }; - struct ggml_context * ctx0 = ggml_init(cparams); - - ggml_set_no_alloc(ctx0, false); - - // don't use alloc for input tensors, so we can safely fill them with data - //struct ggml_tensor * after_opt_best_samples = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_batch); - //struct ggml_tensor * after_opt_probs = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_vocab, n_tokens, n_batch); - struct ggml_tensor * tokens_input = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_batch); - struct ggml_tensor * target_logits = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_vocab, n_tokens, n_batch); - struct ggml_tensor * target_probs = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_vocab, n_tokens, n_batch); - - ggml_set_no_alloc(ctx0, (alloc != NULL)); - if (alloc) { - ggml_allocr_reset(alloc); - } - - opt_cb_data.tokens_input = tokens_input; - opt_cb_data.target_logits = target_logits; - opt_cb_data.target_probs = target_probs; - - int n_past = 0; - - struct ggml_cgraph * gf = ggml_new_graph(ctx0); - struct ggml_cgraph * gb = ggml_new_graph(ctx0); - struct ggml_cgraph * gb_tmp = params.use_checkpointing - ? ggml_new_graph(ctx0) + // context for input tensors without their data + struct ggml_init_params ctx_input_params = { + ggml_tensor_overhead() * 2, // mem_size + NULL, // mem_buffer + true, // no_alloc + }; + struct ggml_context * ctx_input = ggml_init(ctx_input_params); + + // the input tensors + struct ggml_tensor * tokens_input = ggml_new_tensor_2d(ctx_input, GGML_TYPE_I32, n_tokens, n_batch); + struct ggml_tensor * target_probs = ggml_new_tensor_3d(ctx_input, GGML_TYPE_F32, n_vocab, n_tokens, n_batch); + + // measure required memory for input tensors + alloc = ggml_allocr_new_measure(tensor_alignment); + ggml_allocr_alloc(alloc, tokens_input); + ggml_allocr_alloc(alloc, target_probs); + size_t max_input_size = ggml_allocr_max_size(alloc) + tensor_alignment; + ggml_allocr_free(alloc); + printf("%s: input_size = %zu bytes (%.1f MB)\n", __func__, max_input_size, (float) max_input_size / (1024.0f*1024.0f)); + + // allocate input tensors + mem_input_data.resize(max_input_size); + alloc = ggml_allocr_new(mem_input_data.data(), mem_input_data.size(), tensor_alignment); + ggml_allocr_alloc(alloc, tokens_input); + ggml_allocr_alloc(alloc, target_probs); + ggml_allocr_free(alloc); + + // context for compute tensors without their data + size_t estimated_compute_size_wo_data = ( + ggml_tensor_overhead()*GGML_MAX_NODES*2 + + (GGML_OBJECT_SIZE+GGML_GRAPH_SIZE)*( + params.common.use_checkpointing ? 3 : 2 + ) + ); + struct ggml_init_params ctx_compute_params = { + estimated_compute_size_wo_data, // mem_size + NULL, // mem_buffer + true, // no_alloc + }; + struct ggml_context * ctx_compute = NULL; + + struct ggml_tensor * loss = NULL; + struct ggml_tensor * logits = NULL; + + struct ggml_cgraph * gf = NULL; + struct ggml_cgraph * gb = NULL; + struct ggml_cgraph * gb_tmp = NULL; + + // measure required memory for compute tensors + size_t best_compute_size = SIZE_MAX; + enum ggml_cgraph_eval_order best_order = GGML_CGRAPH_EVAL_ORDER_COUNT; + // find best evaluation order + for (unsigned order = 0; order < (unsigned) GGML_CGRAPH_EVAL_ORDER_COUNT; ++order) { + ctx_compute = ggml_init(ctx_compute_params); + alloc = ggml_allocr_new_measure(tensor_alignment); + gf = ggml_new_graph(ctx_compute); + gf->order = (enum ggml_cgraph_eval_order) order; + gb = ggml_new_graph(ctx_compute); + gb_tmp = params.common.use_checkpointing + ? ggml_new_graph(ctx_compute) : NULL; - - GGML_ASSERT(n_past == 0); - - struct ggml_tensor * loss = NULL; - struct ggml_tensor * logits = NULL; - loss = llama_build_train_graphs( - &model, alloc, ctx0, + &model, alloc, ctx_compute, gf, gb, gb_tmp, &logits, tokens_input, target_probs, n_tokens, n_batch, - params.use_flash, - params.use_checkpointing + params.common.use_flash, + params.common.use_checkpointing ); + size_t max_compute_size = ggml_allocr_max_size(alloc) + tensor_alignment; + if (max_compute_size < best_compute_size) { + best_compute_size = max_compute_size; + best_order = gf->order; + } + ggml_allocr_free(alloc); + ggml_free(ctx_compute); + } + size_t max_compute_size = best_compute_size; + printf("%s: compute_size = %zu bytes (%.1f MB)\n", __func__, max_compute_size, (float) max_compute_size / (1024.0f*1024.0f)); + printf("%s: evaluation order = %s\n", __func__, + (best_order == GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT) ? "LEFT_TO_RIGHT" : + (best_order == GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT) ? "RIGHT_TO_LEFT" : + "invalid"); + + // allocate compute tensors + mem_compute_data.resize(max_compute_size); + ctx_compute = ggml_init(ctx_compute_params); + alloc = ggml_allocr_new(mem_compute_data.data(), mem_compute_data.size(), tensor_alignment); + gf = ggml_new_graph(ctx_compute); + gf->order = best_order; + gb = ggml_new_graph(ctx_compute); + gb_tmp = params.common.use_checkpointing + ? ggml_new_graph(ctx_compute) + : NULL; + loss = llama_build_train_graphs( + &model, alloc, ctx_compute, + gf, gb, gb_tmp, + &logits, tokens_input, target_probs, + n_tokens, n_batch, + params.common.use_flash, + params.common.use_checkpointing + ); + ggml_allocr_free(alloc); - size_t used_mem_before_opt = ggml_used_mem(ctx0); - - opt->params.adam.sched = (opt->iter < params.warmup) - ? (float) opt->iter / (float) params.warmup - : cosine_decay_restart( - params.cos_decay_steps, - params.cos_decay_min, - opt->iter - params.warmup, - params.cos_decay_restart, - params.enable_restart); - - float min_sched = params.adam_min_alpha / params.adam_alpha; - opt->params.adam.sched = min_sched + opt->params.adam.sched * (1.0f - min_sched); - - printf("%s: opt->params.adam.sched %.5f\n", __func__, opt->params.adam.sched); - - ggml_opt_resume_g(ctx0, opt, loss, gf, gb, &opt_callback, (void *) &opt_cb_data); + std::vector train_tokens; + std::vector train_samples_begin; + std::vector train_samples_size; + printf("%s: tokenize training data\n", __func__); + tokenize_file(lctx, + params.common.fn_train_data, + params.common.sample_start, + params.common.include_sample_start, + params.common.overlapping_samples, + n_tokens, + train_tokens, + train_samples_begin, + train_samples_size); + GGML_ASSERT(train_samples_begin.size() == train_samples_size.size()); + + printf("%s: number of training tokens: %zu\n", __func__, train_tokens.size()); + + size_t shuffle_samples_hash = compute_samples_hash(params.common.fn_train_data, train_samples_begin.data(), train_samples_size.data(), train_samples_size.size()); + const bool changed_train_data = (shuffle_samples_hash != train->shuffle_samples_hash) || (train->shuffle_sample_count != train_samples_size.size()); + if (changed_train_data) { + printf("%s: train data seems to have changed. restarting shuffled epoch.\n", __func__); + } + if (params.common.force_reshuffle) { + printf("%s: forced reshuffling of data. restarting with newly shuffled epoch.\n", __func__); + } + if ((train->shuffle_rng_state_current == "") || changed_train_data || params.common.force_reshuffle) { + train->shuffle_rng_state_current = mt19937_seed_to_state(params.common.seed); + train->shuffle_sample_count = train_samples_size.size(); + train->shuffle_next_sample = 0; + train->shuffle_samples_hash = shuffle_samples_hash; + } + std::vector train_shuffled_samples_offs; + std::vector train_shuffled_samples_begin; + std::vector train_shuffled_samples_size; + train_shuffled_samples_offs.resize(train_samples_begin.size()); + train_shuffled_samples_begin.resize(train_samples_begin.size()); + train_shuffled_samples_size.resize(train_samples_size.size()); + train->shuffle_rng_state_next = shuffle_samples( + train->shuffle_rng_state_current, + train_shuffled_samples_offs.data(), + train_shuffled_samples_begin.data(), + train_shuffled_samples_size.data(), + train_samples_begin.data(), + train_samples_size.data(), + train_samples_size.size()); + printf("%s: begin training\n", __func__); - size_t used_mem_after_opt = ggml_used_mem(ctx0); + save_train_files_data save_data; + save_data.fn_checkpoint_out = params.common.fn_checkpoint_out; + save_data.fn_model_out = params.fn_model_out; + save_data.fn_vocab_model = params.fn_vocab_model; + save_data.pattern_fn_it = params.common.pattern_fn_it; + save_data.fn_latest = params.common.fn_latest; + save_data.model = &model; + + struct train_opt_callback_data opt_cb_data; + opt_cb_data.params = ¶ms.common; + opt_cb_data.train = train; + opt_cb_data.save_cb = &save_train_files; + opt_cb_data.save_data = &save_data; + opt_cb_data.lctx = lctx; + opt_cb_data.last_save_iter = opt->iter; + opt_cb_data.tokens_data = train_tokens.data(); + opt_cb_data.tokens_size = train_tokens.size(); + opt_cb_data.samples_begin = train_samples_begin.data(); + opt_cb_data.samples_size = train_samples_size.data(); + opt_cb_data.shuffled_samples_offs = train_shuffled_samples_offs.data(); + opt_cb_data.shuffled_samples_begin = train_shuffled_samples_begin.data(); + opt_cb_data.shuffled_samples_size = train_shuffled_samples_size.data(); + opt_cb_data.samples_count = train_samples_size.size(); + opt_cb_data.tokens_input = tokens_input; + opt_cb_data.target_probs = target_probs; + opt_cb_data.first_iter = opt->iter; + opt_cb_data.first_epoch = train->train_epochs; + opt_cb_data.iter_at_last_epoch = -1; + opt_cb_data.last_time = ggml_time_ms(); + opt_cb_data.millis_per_iter = 0.0; + + // measure required memory for work buffer + size_t max_work_size = ggml_graph_plan(gb, params.common.n_threads).work_size + GGML_OBJECT_SIZE; + printf("%s: work_size = %zu bytes (%.1f MB)\n", __func__, max_work_size, (float) max_work_size / (1024.0f*1024.0f)); + + // context for work buffer + struct ggml_init_params ctx_work_params = { + max_work_size, // mem_size + NULL, // mem_buffer + false, // no_alloc + }; + struct ggml_context * ctx_work = ggml_init(ctx_work_params); - int n_iter = params.use_adam ? params.adam_n_iter : params.lbfgs_n_iter; - model.train_its = opt->iter; - model.train_samples += n_batch * n_iter; - model.train_tokens += n_batch * n_tokens * n_iter; + int64_t t0 = ggml_time_ms(); - if (params.print_info_interval > 0 && ex % params.print_info_interval == 0) { - printf("Example %d, opt iter %d\n", ex, opt->iter); - printf("error_before_opt: %.6f\n", opt->loss_before); - printf("error_after_opt: %.6f\n", opt->loss_after); - printf("used_mem_before_opt: %zu bytes\n", used_mem_before_opt); - printf("used_mem_after_opt: %zu bytes\n", used_mem_after_opt); - } + ggml_opt_resume_g(ctx_work, opt, loss, gf, gb, &train_opt_callback, (void *) &opt_cb_data); - ggml_free(ctx0); - } + ggml_free(ctx_work); + ggml_free(ctx_compute); + ggml_free(ctx_input); int64_t t1 = ggml_time_ms(); - int64_t d = t1-t0; - double dd = (double) d * 1e-3; - printf("%s: total training time=%f seconds\n", __func__, dd); + printf("%s: total training time: ", __func__); + print_duration((double) (t1 - t0)); + printf("\n"); - if (params.n_examples > 0) { - save_checkpoint_file(params.fn_checkpoint_out, params.fn_vocab_model, &model, opt); - } + int new_iters = opt->iter - opt_cb_data.last_save_iter; + if (new_iters > 0) { + train->train_its += new_iters; + train->train_tokens += new_iters * opt->params.n_gradient_accumulation * n_batch * n_tokens; - if (strlen(params.fn_model_out) > 0) { - save_llama_model_file(params.fn_model_out, params.fn_vocab_model, &model); + save_train_files(&save_data, train); + opt_cb_data.last_save_iter = opt->iter; } if (alloc) { ggml_allocr_free(alloc); } - delete[] compute_addr; - delete[] compute_buf_0; + ggml_free(opt->ctx); + free_train_state(train); ggml_free(model.ctx); llama_free(lctx); llama_free_model(lmodel); diff --git a/expose.h b/expose.h index 535e1137416951748ed12b2f76024be8d03850bb..8791a1915945f0099154ead25138e223eccaafbd 100644 --- a/expose.h +++ b/expose.h @@ -1,7 +1,7 @@ #pragma once -const int stop_token_max = 10; -const int ban_token_max = 10; +const int stop_token_max = 16; +const int ban_token_max = 16; const int tensor_split_max = 16; // match kobold's sampler list and order enum samplers @@ -73,6 +73,7 @@ struct generation_inputs const char * stop_sequence[stop_token_max]; const bool stream_sse; const char * grammar; + const bool grammar_retain_state; }; struct generation_outputs { diff --git a/ggml-alloc.c b/ggml-alloc.c index ed9f19cfdc5de8c7fa1df193df2ff26dc1cea690..805759db74fef6f8175e1ab47dfe711b640da100 100644 --- a/ggml-alloc.c +++ b/ggml-alloc.c @@ -187,6 +187,7 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) } tensor->data = addr; + AT_PRINTF("%s: allocated data at %p\n", __func__, tensor->data); #ifdef GGML_ALLOCATOR_DEBUG add_allocated_tensor(alloc, tensor); @@ -218,7 +219,8 @@ static void ggml_allocr_free_tensor(struct ggml_allocr * alloc, struct ggml_tens size_t size = ggml_allocr_get_alloc_size(alloc, tensor); size = aligned_offset(NULL, size, alloc->alignment); - AT_PRINTF("%s: freeing %s (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, size, alloc->n_free_blocks); + AT_PRINTF("%s: freeing %s at %p (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, ptr, size, alloc->n_free_blocks); + AT_PRINTF("%s: alloc->data = %p alloc->data+alloc->size = %p alloc->data+alloc->max_size = %p\n", __func__, alloc->data, (char*)alloc->data + alloc->size, (char*)alloc->data + alloc->max_size); #ifdef GGML_ALLOCATOR_DEBUG remove_allocated_tensor(alloc, tensor); @@ -631,3 +633,7 @@ static size_t ggml_allocr_alloc_graph_tensors_n( size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) { return ggml_allocr_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL); } + +size_t ggml_allocr_max_size(struct ggml_allocr * alloc) { + return alloc->max_size; +} diff --git a/ggml-alloc.h b/ggml-alloc.h index 9559da75871a608fb29f08edebf860674295e043..0c224f174f396eb187ddc5a48f955f7c0ad4418e 100644 --- a/ggml-alloc.h +++ b/ggml-alloc.h @@ -19,6 +19,7 @@ GGML_API bool ggml_allocr_is_measure(struct ggml_allocr * alloc); GGML_API void ggml_allocr_reset(struct ggml_allocr * alloc); GGML_API void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor); GGML_API size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph); +GGML_API size_t ggml_allocr_max_size(struct ggml_allocr * alloc); #ifdef __cplusplus diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 78ba94f9a121949985810ba405399d4c6e93c158..321af5e1a02e3e7e7dd77b1a8d72199a7c7858fa 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -1,3 +1,4 @@ +#include #include #include #include @@ -14,9 +15,11 @@ // for rocblas_initialize() #include "rocblas/rocblas.h" #endif // __HIP_PLATFORM_AMD__ +#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F #define CUBLAS_COMPUTE_32F HIPBLAS_R_32F #define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F #define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT +#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT #define CUBLAS_OP_N HIPBLAS_OP_N #define CUBLAS_OP_T HIPBLAS_OP_T #define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS @@ -77,9 +80,9 @@ #include "ggml.h" #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products -#define CC_TURING 700 +#define CC_VOLTA 700 #define CC_OFFSET_AMD 1000000 -#define CC_RDNA2 CC_OFFSET_AMD + 1030 +#define CC_RDNA2 (CC_OFFSET_AMD + 1030) #if defined(GGML_USE_HIPBLAS) #define __CUDA_ARCH__ 1300 @@ -235,8 +238,12 @@ static __device__ __forceinline__ int get_int_from_uint8_aligned(const uint8_t * return *((int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment } +template +using to_t_cuda_t = void (*)(const void * __restrict__ x, T * __restrict__ y, int k, cudaStream_t stream); +typedef to_t_cuda_t to_fp32_cuda_t; +typedef to_t_cuda_t to_fp16_cuda_t; + typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v); -typedef void (*to_fp32_cuda_t)(const void * __restrict__ x, float * __restrict__ y, int k, cudaStream_t stream); typedef void (*dot_kernel_k_t)(const void * __restrict__ vx, const int ib, const int iqs, const float * __restrict__ y, float & v); typedef void (*cpy_kernel_t)(const char * cx, char * cdst); typedef void (*ggml_cuda_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); @@ -461,7 +468,7 @@ static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0}; static bool g_mul_mat_q = true; static void * g_scratch_buffer = nullptr; -static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default +static size_t g_scratch_size = 0; // disabled by default static size_t g_scratch_offset = 0; static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr}; @@ -708,7 +715,8 @@ static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const in //================================== k-quants -static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float * __restrict__ yy) { +template +static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restrict__ yy) { const int i = blockIdx.x; const block_q2_K * x = (const block_q2_K *) vx; @@ -720,7 +728,7 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float const int is = 8*n + l/16; const uint8_t q = x[i].qs[32*n + l]; - float * y = yy + i*QK_K + 128*n; + dst_t * y = yy + i*QK_K + 128*n; float dall = __low2half(x[i].dm); float dmin = __high2half(x[i].dm); @@ -732,7 +740,7 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float const int is = tid/16; // 0 or 1 const int il = tid%16; // 0...15 const uint8_t q = x[i].qs[il] >> (2*is); - float * y = yy + i*QK_K + 16*is + il; + dst_t * y = yy + i*QK_K + 16*is + il; float dall = __low2half(x[i].dm); float dmin = __high2half(x[i].dm); y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4); @@ -741,7 +749,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float } -static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, float * __restrict__ yy) { +template +static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restrict__ yy) { const int i = blockIdx.x; const block_q3_K * x = (const block_q3_K *) vx; @@ -765,7 +774,7 @@ static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, float float d_all = x[i].d; float dl = d_all * (us - 32); - float * y = yy + i*QK_K + 128*n + 32*j; + dst_t * y = yy + i*QK_K + 128*n + 32*j; const uint8_t * q = x[i].qs + 32*n; const uint8_t * hm = x[i].hmask; @@ -777,7 +786,7 @@ static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, float const int im = il/8; // 0...1 const int in = il%8; // 0...7 - float * y = yy + i*QK_K + 16*is + il; + dst_t * y = yy + i*QK_K + 16*is + il; const uint8_t q = x[i].qs[il] >> (2*is); const uint8_t h = x[i].hmask[in] >> (2*is + im); @@ -805,7 +814,8 @@ static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t } #endif -static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float * __restrict__ yy) { +template +static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy) { const block_q4_K * x = (const block_q4_K *) vx; const int i = blockIdx.x; @@ -818,7 +828,7 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float const int is = 2*il; const int n = 4; - float * y = yy + i*QK_K + 64*il + n*ir; + dst_t * y = yy + i*QK_K + 64*il + n*ir; const float dall = __low2half(x[i].dm); const float dmin = __high2half(x[i].dm); @@ -837,7 +847,7 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float #else const int tid = threadIdx.x; const uint8_t * q = x[i].qs; - float * y = yy + i*QK_K; + dst_t * y = yy + i*QK_K; const float d = (float)x[i].dm[0]; const float m = (float)x[i].dm[1]; y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4); @@ -845,7 +855,8 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float #endif } -static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float * __restrict__ yy) { +template +static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restrict__ yy) { const block_q5_K * x = (const block_q5_K *) vx; const int i = blockIdx.x; @@ -857,7 +868,7 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float const int ir = tid%16; // ir is in 0...15 const int is = 2*il; // is is in 0...6 - float * y = yy + i*QK_K + 64*il + 2*ir; + dst_t * y = yy + i*QK_K + 64*il + 2*ir; const float dall = __low2half(x[i].dm); const float dmin = __high2half(x[i].dm); @@ -885,13 +896,14 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float const int is = tid/16; // 0 or 1 const uint8_t h = x[i].qh[in] >> im; const float d = x[i].d; - float * y = yy + i*QK_K + tid; + dst_t * y = yy + i*QK_K + tid; y[ 0] = d * x[i].scales[is+0] * ((q & 0xF) - ((h >> 0) & 1 ? 0 : 16)); y[32] = d * x[i].scales[is+2] * ((q >> 4) - ((h >> 4) & 1 ? 0 : 16)); #endif } -static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, float * __restrict__ yy) { +template +static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restrict__ yy) { const block_q6_K * x = (const block_q6_K *) vx; const int i = blockIdx.x; @@ -903,7 +915,7 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, float const int il = tid - 32*ip; // 0...32 const int is = 8*ip + il/16; - float * y = yy + i*QK_K + 128*ip + il; + dst_t * y = yy + i*QK_K + 128*ip + il; const float d = x[i].d; @@ -922,7 +934,7 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, float const int ip = tid/16; // 0 or 1 const int il = tid - 16*ip; // 0...15 - float * y = yy + i*QK_K + 16*ip + il; + dst_t * y = yy + i*QK_K + 16*ip + il; const float d = x[i].d; @@ -1515,6 +1527,14 @@ static __device__ void convert_f16(const void * vx, const int ib, const int iqs, v.y = x[ib + iqs + 1]; } +static __device__ void convert_f32(const void * vx, const int ib, const int iqs, dfloat2 & v){ + const float * x = (const float *) vx; + + // automatic half -> float type cast if dfloat == float + v.x = x[ib + iqs + 0]; + v.y = x[ib + iqs + 1]; +} + static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded) { const int ix = blockDim.x*blockIdx.x + threadIdx.x; @@ -1554,8 +1574,8 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest reinterpret_cast(y[ib].ds.y) = sum; } -template -static __global__ void dequantize_block(const void * __restrict__ vx, float * __restrict__ y, const int k) { +template +static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) { const int i = blockDim.x*blockIdx.x + 2*threadIdx.x; if (i >= k) { @@ -3533,7 +3553,7 @@ template static __global__ void load_tiles_q4_0, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat> (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); -#elif __CUDA_ARCH__ >= CC_TURING +#elif __CUDA_ARCH__ >= CC_VOLTA const int mmq_x = MMQ_X_Q4_0_AMPERE; const int mmq_y = MMQ_Y_Q4_0_AMPERE; const int nwarps = NWARPS_Q4_0_AMPERE; @@ -3553,7 +3573,7 @@ template static __global__ void #else (void) vec_dot_q4_0_q8_1_mul_mat; assert(false); -#endif // __CUDA_ARCH__ >= CC_TURING +#endif // __CUDA_ARCH__ >= CC_VOLTA } #define MMQ_X_Q4_1_RDNA2 64 @@ -3574,9 +3594,9 @@ template static __global__ void #if defined(RDNA3) || defined(RDNA2) __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_RDNA2, 2) #endif // defined(RDNA3) || defined(RDNA2) -#elif __CUDA_ARCH__ < CC_TURING +#elif __CUDA_ARCH__ < CC_VOLTA __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_PASCAL, 2) -#endif // __CUDA_ARCH__ < CC_TURING +#endif // __CUDA_ARCH__ < CC_VOLTA mul_mat_q4_1( const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { @@ -3596,7 +3616,7 @@ template static __global__ void load_tiles_q4_1, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat> (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); -#elif __CUDA_ARCH__ >= CC_TURING +#elif __CUDA_ARCH__ >= CC_VOLTA const int mmq_x = MMQ_X_Q4_1_AMPERE; const int mmq_y = MMQ_Y_Q4_1_AMPERE; const int nwarps = NWARPS_Q4_1_AMPERE; @@ -3616,7 +3636,7 @@ template static __global__ void #else (void) vec_dot_q4_1_q8_1_mul_mat; assert(false); -#endif // __CUDA_ARCH__ >= CC_TURING +#endif // __CUDA_ARCH__ >= CC_VOLTA } #define MMQ_X_Q5_0_RDNA2 64 @@ -3657,7 +3677,7 @@ template static __global__ void load_tiles_q5_0, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat> (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); -#elif __CUDA_ARCH__ >= CC_TURING +#elif __CUDA_ARCH__ >= CC_VOLTA const int mmq_x = MMQ_X_Q5_0_AMPERE; const int mmq_y = MMQ_Y_Q5_0_AMPERE; const int nwarps = NWARPS_Q5_0_AMPERE; @@ -3677,7 +3697,7 @@ template static __global__ void #else (void) vec_dot_q5_0_q8_1_mul_mat; assert(false); -#endif // __CUDA_ARCH__ >= CC_TURING +#endif // __CUDA_ARCH__ >= CC_VOLTA } #define MMQ_X_Q5_1_RDNA2 64 @@ -3718,7 +3738,7 @@ mul_mat_q5_1( load_tiles_q5_1, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat> (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); -#elif __CUDA_ARCH__ >= CC_TURING +#elif __CUDA_ARCH__ >= CC_VOLTA const int mmq_x = MMQ_X_Q5_1_AMPERE; const int mmq_y = MMQ_Y_Q5_1_AMPERE; const int nwarps = NWARPS_Q5_1_AMPERE; @@ -3738,7 +3758,7 @@ mul_mat_q5_1( #else (void) vec_dot_q5_1_q8_1_mul_mat; assert(false); -#endif // __CUDA_ARCH__ >= CC_TURING +#endif // __CUDA_ARCH__ >= CC_VOLTA } #define MMQ_X_Q8_0_RDNA2 64 @@ -3779,7 +3799,7 @@ template static __global__ void load_tiles_q8_0, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat> (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); -#elif __CUDA_ARCH__ >= CC_TURING +#elif __CUDA_ARCH__ >= CC_VOLTA const int mmq_x = MMQ_X_Q8_0_AMPERE; const int mmq_y = MMQ_Y_Q8_0_AMPERE; const int nwarps = NWARPS_Q8_0_AMPERE; @@ -3799,7 +3819,7 @@ template static __global__ void #else (void) vec_dot_q8_0_q8_1_mul_mat; assert(false); -#endif // __CUDA_ARCH__ >= CC_TURING +#endif // __CUDA_ARCH__ >= CC_VOLTA } #define MMQ_X_Q2_K_RDNA2 64 @@ -3840,7 +3860,7 @@ mul_mat_q2_K( load_tiles_q2_K, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat> (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); -#elif __CUDA_ARCH__ >= CC_TURING +#elif __CUDA_ARCH__ >= CC_VOLTA const int mmq_x = MMQ_X_Q2_K_AMPERE; const int mmq_y = MMQ_Y_Q2_K_AMPERE; const int nwarps = NWARPS_Q2_K_AMPERE; @@ -3860,7 +3880,7 @@ mul_mat_q2_K( #else (void) vec_dot_q2_K_q8_1_mul_mat; assert(false); -#endif // __CUDA_ARCH__ >= CC_TURING +#endif // __CUDA_ARCH__ >= CC_VOLTA } #define MMQ_X_Q3_K_RDNA2 128 @@ -3881,9 +3901,9 @@ template static __global__ void #if defined(RDNA3) || defined(RDNA2) __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_RDNA2, 2) #endif // defined(RDNA3) || defined(RDNA2) -#elif __CUDA_ARCH__ < CC_TURING +#elif __CUDA_ARCH__ < CC_VOLTA __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_PASCAL, 2) -#endif // __CUDA_ARCH__ < CC_TURING +#endif // __CUDA_ARCH__ < CC_VOLTA mul_mat_q3_K( const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { @@ -3903,7 +3923,7 @@ template static __global__ void load_tiles_q3_K, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat> (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); -#elif __CUDA_ARCH__ >= CC_TURING +#elif __CUDA_ARCH__ >= CC_VOLTA const int mmq_x = MMQ_X_Q3_K_AMPERE; const int mmq_y = MMQ_Y_Q3_K_AMPERE; const int nwarps = NWARPS_Q3_K_AMPERE; @@ -3923,7 +3943,7 @@ template static __global__ void #else (void) vec_dot_q3_K_q8_1_mul_mat; assert(false); -#endif // __CUDA_ARCH__ >= CC_TURING +#endif // __CUDA_ARCH__ >= CC_VOLTA } #define MMQ_X_Q4_K_RDNA2 64 @@ -3944,9 +3964,9 @@ template static __global__ void #if defined(RDNA3) || defined(RDNA2) __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_RDNA2, 2) #endif // defined(RDNA3) || defined(RDNA2) -#elif __CUDA_ARCH__ < CC_TURING +#elif __CUDA_ARCH__ < CC_VOLTA __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_PASCAL, 2) -#endif // __CUDA_ARCH__ < CC_TURING +#endif // __CUDA_ARCH__ < CC_VOLTA mul_mat_q4_K( const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { @@ -3966,7 +3986,7 @@ template static __global__ void load_tiles_q4_K, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat> (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); -#elif __CUDA_ARCH__ >= CC_TURING +#elif __CUDA_ARCH__ >= CC_VOLTA const int mmq_x = MMQ_X_Q4_K_AMPERE; const int mmq_y = MMQ_Y_Q4_K_AMPERE; const int nwarps = NWARPS_Q4_K_AMPERE; @@ -3986,7 +4006,7 @@ template static __global__ void #else (void) vec_dot_q4_K_q8_1_mul_mat; assert(false); -#endif // __CUDA_ARCH__ >= CC_TURING +#endif // __CUDA_ARCH__ >= CC_VOLTA } #define MMQ_X_Q5_K_RDNA2 64 @@ -4027,7 +4047,7 @@ mul_mat_q5_K( load_tiles_q5_K, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat> (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); -#elif __CUDA_ARCH__ >= CC_TURING +#elif __CUDA_ARCH__ >= CC_VOLTA const int mmq_x = MMQ_X_Q5_K_AMPERE; const int mmq_y = MMQ_Y_Q5_K_AMPERE; const int nwarps = NWARPS_Q5_K_AMPERE; @@ -4047,7 +4067,7 @@ mul_mat_q5_K( #else (void) vec_dot_q5_K_q8_1_mul_mat; assert(false); -#endif // __CUDA_ARCH__ >= CC_TURING +#endif // __CUDA_ARCH__ >= CC_VOLTA } #define MMQ_X_Q6_K_RDNA2 64 @@ -4068,9 +4088,9 @@ template static __global__ void #if defined(RDNA3) || defined(RDNA2) __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_RDNA2, 2) #endif // defined(RDNA3) || defined(RDNA2) -#elif __CUDA_ARCH__ < CC_TURING +#elif __CUDA_ARCH__ < CC_VOLTA __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_PASCAL, 2) -#endif // __CUDA_ARCH__ < CC_TURING +#endif // __CUDA_ARCH__ < CC_VOLTA mul_mat_q6_K( const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { @@ -4090,7 +4110,7 @@ template static __global__ void load_tiles_q6_K, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat> (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); -#elif __CUDA_ARCH__ >= CC_TURING +#elif __CUDA_ARCH__ >= CC_VOLTA const int mmq_x = MMQ_X_Q6_K_AMPERE; const int mmq_y = MMQ_Y_Q6_K_AMPERE; const int nwarps = NWARPS_Q6_K_AMPERE; @@ -4110,7 +4130,7 @@ template static __global__ void #else (void) vec_dot_q6_K_q8_1_mul_mat; assert(false); -#endif // __CUDA_ARCH__ >= CC_TURING +#endif // __CUDA_ARCH__ >= CC_VOLTA } template @@ -4355,8 +4375,10 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne, } // rope == RoPE == rotary positional embedding -static __global__ void rope_f32(const float * x, float * dst, const int ncols, const float p0, - const float p_delta, const int p_delta_rows, const float theta_scale) { + +template +static __global__ void rope(const T * x, T * dst, const int ncols, const int32_t * pos, const float freq_scale, + const int p_delta_rows, const float theta_scale) { const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y); if (col >= ncols) { @@ -4365,8 +4387,11 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c const int row = blockDim.x*blockIdx.x + threadIdx.x; const int i = row*ncols + col; + const int i2 = row/p_delta_rows; - const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2); + const int p = has_pos ? pos[i2] : 0; + const float p0 = p*freq_scale; + const float theta = p0*powf(theta_scale, col/2); const float sin_theta = sinf(theta); const float cos_theta = cosf(theta); @@ -4377,8 +4402,9 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c dst[i + 1] = x0*sin_theta + x1*cos_theta; } -static __global__ void rope_neox_f32(const float * x, float * dst, const int ncols, const float p0, - const float p_delta, const int p_delta_rows, const float theta_scale) { +template +static __global__ void rope_neox(const T * x, T * dst, const int ncols, const int32_t * pos, const float freq_scale, + const int p_delta_rows, const float theta_scale) { const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y); if (col >= ncols) { @@ -4387,8 +4413,11 @@ static __global__ void rope_neox_f32(const float * x, float * dst, const int nco const int row = blockDim.x*blockIdx.x + threadIdx.x; const int i = row*ncols + col/2; + const int i2 = row/p_delta_rows; - const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2); + const int p = has_pos ? pos[i2] : 0; + const float p0 = p*freq_scale; + const float theta = p0*powf(theta_scale, col/2); const float sin_theta = sinf(theta); const float cos_theta = cosf(theta); @@ -4399,8 +4428,8 @@ static __global__ void rope_neox_f32(const float * x, float * dst, const int nco dst[i + ncols/2] = x0*sin_theta + x1*cos_theta; } -static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p0, - const float p_delta, const int p_delta_rows, const float theta_scale, const int n_ctx) { +static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const int32_t * pos, const float freq_scale, + const int p_delta_rows, const float theta_scale, const int n_ctx) { const int col = blockDim.x*blockIdx.x + threadIdx.x; const int half_n_dims = ncols/4; @@ -4410,11 +4439,13 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol const int row = blockDim.y*blockIdx.y + threadIdx.y; const int i = row*ncols + col; + const int i2 = row/p_delta_rows; const float col_theta_scale = powf(theta_scale, col); - const float p = p0 + p_delta*(row/p_delta_rows); + // FIXME: this is likely wrong + const int p = pos != nullptr ? pos[i2] : 0; - const float theta = min(p, p_delta*(n_ctx - 2))*col_theta_scale; + const float theta = min(p, n_ctx - 2)*freq_scale*col_theta_scale; const float sin_theta = sinf(theta); const float cos_theta = cosf(theta); @@ -4424,7 +4455,7 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol dst[i + 0] = x0*cos_theta - x1*sin_theta; dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta; - const float block_theta = max(p - p_delta*(n_ctx - 2), 0.f)*col_theta_scale; + const float block_theta = ((float)max(p - n_ctx - 2, 0))*col_theta_scale; const float sin_block_theta = sinf(block_theta); const float cos_block_theta = cosf(block_theta); @@ -4578,32 +4609,38 @@ static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, con quantize_q8_1<<>>(x, vy, kx, kx_padded); } -static void dequantize_row_q4_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { +template +static void dequantize_row_q4_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE; dequantize_block<<>>(vx, y, k); } -static void dequantize_row_q4_1_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { +template +static void dequantize_row_q4_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE; dequantize_block<<>>(vx, y, k); } -static void dequantize_row_q5_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { +template +static void dequantize_row_q5_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE; dequantize_block<<>>(vx, y, k); } -static void dequantize_row_q5_1_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { +template +static void dequantize_row_q5_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE; dequantize_block<<>>(vx, y, k); } -static void dequantize_row_q8_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { +template +static void dequantize_row_q8_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE; dequantize_block<<>>(vx, y, k); } -static void dequantize_row_q2_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { +template +static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { const int nb = k / QK_K; #if QK_K == 256 dequantize_block_q2_K<<>>(vx, y); @@ -4612,7 +4649,8 @@ static void dequantize_row_q2_K_cuda(const void * vx, float * y, const int k, cu #endif } -static void dequantize_row_q3_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { +template +static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { const int nb = k / QK_K; #if QK_K == 256 dequantize_block_q3_K<<>>(vx, y); @@ -4621,12 +4659,14 @@ static void dequantize_row_q3_K_cuda(const void * vx, float * y, const int k, cu #endif } -static void dequantize_row_q4_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { +template +static void dequantize_row_q4_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { const int nb = k / QK_K; dequantize_block_q4_K<<>>(vx, y); } -static void dequantize_row_q5_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { +template +static void dequantize_row_q5_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { const int nb = k / QK_K; #if QK_K == 256 dequantize_block_q5_K<<>>(vx, y); @@ -4635,7 +4675,8 @@ static void dequantize_row_q5_K_cuda(const void * vx, float * y, const int k, cu #endif } -static void dequantize_row_q6_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { +template +static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { const int nb = k / QK_K; #if QK_K == 256 dequantize_block_q6_K<<>>(vx, y); @@ -4826,6 +4867,11 @@ static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, c dequantize_block<1, 1, convert_f16><<>>(vx, y, k); } +static void convert_fp32_to_fp16_cuda(const void * vx, half * y, const int k, cudaStream_t stream) { + const int num_blocks = (k + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE; + dequantize_block<1, 1, convert_f32><<>>(vx, y, k); +} + static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; @@ -4835,6 +4881,35 @@ static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, floa <<>>(vx, y, dst, ncols, nrows); } +static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) { + switch (type) { + case GGML_TYPE_Q4_0: + return dequantize_row_q4_0_cuda; + case GGML_TYPE_Q4_1: + return dequantize_row_q4_1_cuda; + case GGML_TYPE_Q5_0: + return dequantize_row_q5_0_cuda; + case GGML_TYPE_Q5_1: + return dequantize_row_q5_1_cuda; + case GGML_TYPE_Q8_0: + return dequantize_row_q8_0_cuda; + case GGML_TYPE_Q2_K: + return dequantize_row_q2_K_cuda; + case GGML_TYPE_Q3_K: + return dequantize_row_q3_K_cuda; + case GGML_TYPE_Q4_K: + return dequantize_row_q4_K_cuda; + case GGML_TYPE_Q5_K: + return dequantize_row_q5_K_cuda; + case GGML_TYPE_Q6_K: + return dequantize_row_q6_K_cuda; + case GGML_TYPE_F32: + return convert_fp32_to_fp16_cuda; + default: + return nullptr; + } +} + static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) { switch (type) { case GGML_TYPE_Q4_0: @@ -4881,7 +4956,7 @@ static void ggml_mul_mat_q4_0_q8_1_cuda( mmq_x = MMQ_X_Q4_0_RDNA1; mmq_y = MMQ_Y_Q4_0_RDNA1; nwarps = NWARPS_Q4_0_RDNA1; - } else if (compute_capability >= CC_TURING) { + } else if (compute_capability >= CC_VOLTA) { mmq_x = MMQ_X_Q4_0_AMPERE; mmq_y = MMQ_Y_Q4_0_AMPERE; nwarps = NWARPS_Q4_0_AMPERE; @@ -4926,7 +5001,7 @@ static void ggml_mul_mat_q4_1_q8_1_cuda( mmq_x = MMQ_X_Q4_1_RDNA1; mmq_y = MMQ_Y_Q4_1_RDNA1; nwarps = NWARPS_Q4_1_RDNA1; - } else if (compute_capability >= CC_TURING) { + } else if (compute_capability >= CC_VOLTA) { mmq_x = MMQ_X_Q4_1_AMPERE; mmq_y = MMQ_Y_Q4_1_AMPERE; nwarps = NWARPS_Q4_1_AMPERE; @@ -4971,7 +5046,7 @@ static void ggml_mul_mat_q5_0_q8_1_cuda( mmq_x = MMQ_X_Q5_0_RDNA1; mmq_y = MMQ_Y_Q5_0_RDNA1; nwarps = NWARPS_Q5_0_RDNA1; - } else if (compute_capability >= CC_TURING) { + } else if (compute_capability >= CC_VOLTA) { mmq_x = MMQ_X_Q5_0_AMPERE; mmq_y = MMQ_Y_Q5_0_AMPERE; nwarps = NWARPS_Q5_0_AMPERE; @@ -5016,7 +5091,7 @@ static void ggml_mul_mat_q5_1_q8_1_cuda( mmq_x = MMQ_X_Q5_1_RDNA1; mmq_y = MMQ_Y_Q5_1_RDNA1; nwarps = NWARPS_Q5_1_RDNA1; - } else if (compute_capability >= CC_TURING) { + } else if (compute_capability >= CC_VOLTA) { mmq_x = MMQ_X_Q5_1_AMPERE; mmq_y = MMQ_Y_Q5_1_AMPERE; nwarps = NWARPS_Q5_1_AMPERE; @@ -5061,7 +5136,7 @@ static void ggml_mul_mat_q8_0_q8_1_cuda( mmq_x = MMQ_X_Q8_0_RDNA1; mmq_y = MMQ_Y_Q8_0_RDNA1; nwarps = NWARPS_Q8_0_RDNA1; - } else if (compute_capability >= CC_TURING) { + } else if (compute_capability >= CC_VOLTA) { mmq_x = MMQ_X_Q8_0_AMPERE; mmq_y = MMQ_Y_Q8_0_AMPERE; nwarps = NWARPS_Q8_0_AMPERE; @@ -5106,7 +5181,7 @@ static void ggml_mul_mat_q2_K_q8_1_cuda( mmq_x = MMQ_X_Q2_K_RDNA1; mmq_y = MMQ_Y_Q2_K_RDNA1; nwarps = NWARPS_Q2_K_RDNA1; - } else if (compute_capability >= CC_TURING) { + } else if (compute_capability >= CC_VOLTA) { mmq_x = MMQ_X_Q2_K_AMPERE; mmq_y = MMQ_Y_Q2_K_AMPERE; nwarps = NWARPS_Q2_K_AMPERE; @@ -5153,7 +5228,7 @@ static void ggml_mul_mat_q3_K_q8_1_cuda( mmq_x = MMQ_X_Q3_K_RDNA1; mmq_y = MMQ_Y_Q3_K_RDNA1; nwarps = NWARPS_Q3_K_RDNA1; - } else if (compute_capability >= CC_TURING) { + } else if (compute_capability >= CC_VOLTA) { mmq_x = MMQ_X_Q3_K_AMPERE; mmq_y = MMQ_Y_Q3_K_AMPERE; nwarps = NWARPS_Q3_K_AMPERE; @@ -5199,7 +5274,7 @@ static void ggml_mul_mat_q4_K_q8_1_cuda( mmq_x = MMQ_X_Q4_K_RDNA1; mmq_y = MMQ_Y_Q4_K_RDNA1; nwarps = NWARPS_Q4_K_RDNA1; - } else if (compute_capability >= CC_TURING) { + } else if (compute_capability >= CC_VOLTA) { mmq_x = MMQ_X_Q4_K_AMPERE; mmq_y = MMQ_Y_Q4_K_AMPERE; nwarps = NWARPS_Q4_K_AMPERE; @@ -5244,7 +5319,7 @@ static void ggml_mul_mat_q5_K_q8_1_cuda( mmq_x = MMQ_X_Q5_K_RDNA1; mmq_y = MMQ_Y_Q5_K_RDNA1; nwarps = NWARPS_Q5_K_RDNA1; - } else if (compute_capability >= CC_TURING) { + } else if (compute_capability >= CC_VOLTA) { mmq_x = MMQ_X_Q5_K_AMPERE; mmq_y = MMQ_Y_Q5_K_AMPERE; nwarps = NWARPS_Q5_K_AMPERE; @@ -5289,7 +5364,7 @@ static void ggml_mul_mat_q6_K_q8_1_cuda( mmq_x = MMQ_X_Q6_K_RDNA1; mmq_y = MMQ_Y_Q6_K_RDNA1; nwarps = NWARPS_Q6_K_RDNA1; - } else if (compute_capability >= CC_TURING) { + } else if (compute_capability >= CC_VOLTA) { mmq_x = MMQ_X_Q6_K_AMPERE; mmq_y = MMQ_Y_Q6_K_AMPERE; nwarps = NWARPS_Q6_K_AMPERE; @@ -5361,31 +5436,41 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons scale_f32<<>>(x, dst, scale, k); } -static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0, - const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) { +template +static void rope_cuda(const T * x, T * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale, + const int p_delta_rows, const float theta_scale, cudaStream_t stream) { GGML_ASSERT(ncols % 2 == 0); const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1); const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE); const dim3 block_nums(nrows, num_blocks_x, 1); - rope_f32<<>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale); + if (pos == nullptr) { + rope<<>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale); + } else { + rope<<>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale); + } } -static void rope_neox_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0, - const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) { +template +static void rope_neox_cuda(const T * x, T * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale, + const int p_delta_rows, const float theta_scale, cudaStream_t stream) { GGML_ASSERT(ncols % 2 == 0); const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1); const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE); const dim3 block_nums(nrows, num_blocks_x, 1); - rope_neox_f32<<>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale); + if (pos == nullptr) { + rope_neox<<>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale); + } else { + rope_neox<<>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale); + } } -static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0, - const float p_delta, const int p_delta_rows, const float theta_scale, const int n_ctx, cudaStream_t stream) { +static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale, + const int p_delta_rows, const float theta_scale, const int n_ctx, cudaStream_t stream) { GGML_ASSERT(ncols % 4 == 0); const dim3 block_dims(CUDA_ROPE_BLOCK_SIZE/4, 1, 1); const int num_blocks_x = (ncols + CUDA_ROPE_BLOCK_SIZE - 1) / CUDA_ROPE_BLOCK_SIZE; const dim3 block_nums(num_blocks_x, nrows, 1); - rope_glm_f32<<>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale, n_ctx); + rope_glm_f32<<>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale, n_ctx); } static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, @@ -5853,7 +5938,7 @@ static int64_t get_row_rounding(ggml_type type) { switch(type) { case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: - return max_compute_capability >= CC_TURING ? 128 : 64; + return max_compute_capability >= CC_VOLTA ? 128 : 64; case GGML_TYPE_Q5_0: case GGML_TYPE_Q5_1: case GGML_TYPE_Q8_0: @@ -5864,7 +5949,7 @@ static int64_t get_row_rounding(ggml_type type) { case GGML_TYPE_Q3_K: case GGML_TYPE_Q4_K: case GGML_TYPE_Q5_K: - return max_compute_capability >= CC_TURING ? 128 : 64; + return max_compute_capability >= CC_VOLTA ? 128 : 64; case GGML_TYPE_Q6_K: return 64; default: @@ -6012,8 +6097,6 @@ inline void ggml_cuda_op_mul_mat_cublas( GGML_ASSERT(src1_ddf_i != nullptr); GGML_ASSERT(dst_dd_i != nullptr); - const float alpha = 1.0f; - const float beta = 0.0f; const int64_t ne00 = src0->ne[0]; @@ -6022,16 +6105,6 @@ inline void ggml_cuda_op_mul_mat_cublas( const int64_t ne0 = dst->ne[0]; const int64_t row_diff = row_high - row_low; - float * src0_ddq_as_f32; - size_t src0_as = 0; - - if (src0->type != GGML_TYPE_F32) { - const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type); - src0_ddq_as_f32 = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_as); // NOLINT - to_fp32_cuda(src0_dd_i, src0_ddq_as_f32, row_diff*ne00, stream); - } - const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32; - int id; CUDA_CHECK(cudaGetDevice(&id)); @@ -6039,16 +6112,87 @@ inline void ggml_cuda_op_mul_mat_cublas( // ldc == nrows of the matrix that cuBLAS writes into int ldc = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff; - CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], stream)); - CUBLAS_CHECK( - cublasSgemm(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N, - row_diff, src1_ncols, ne10, - &alpha, src0_ddf_i, ne00, - src1_ddf_i, ne10, - &beta, dst_dd_i, ldc)); + const int compute_capability = g_compute_capabilities[id]; + + if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) { + // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32 + half * src0_as_f16 = nullptr; + size_t src0_as = 0; + if (src0->type != GGML_TYPE_F16) { + const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src0->type); + GGML_ASSERT(to_fp16_cuda != nullptr); + size_t ne = row_diff*ne00; + src0_as_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &src0_as); + to_fp16_cuda(src0_dd_i, src0_as_f16, ne, stream); + } + const half * src0_ptr = src0->type == GGML_TYPE_F16 ? (const half *) src0_dd_i : src0_as_f16; + + half * src1_as_f16 = nullptr; + size_t src1_as = 0; + if (src1->type != GGML_TYPE_F16) { + const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type); + GGML_ASSERT(to_fp16_cuda != nullptr); + size_t ne = src1_ncols*ne10; + src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &src1_as); + to_fp16_cuda(src1_ddf_i, src1_as_f16, ne, stream); + } + const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddq_i : src1_as_f16; + + size_t dst_as = 0; + half * dst_f16 = (half *) ggml_cuda_pool_malloc(row_diff*src1_ncols * sizeof(half), &dst_as); + + const half alpha_f16 = 1.0f; + const half beta_f16 = 0.0f; + + CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], stream)); + CUBLAS_CHECK( + cublasGemmEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N, + row_diff, src1_ncols, ne10, + &alpha_f16, src0_ptr, CUDA_R_16F, ne00, + src1_ptr, CUDA_R_16F, ne10, + &beta_f16, dst_f16, CUDA_R_16F, ldc, + CUBLAS_COMPUTE_16F, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + + const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16); + to_fp32_cuda(dst_f16, dst_dd_i, row_diff*src1_ncols, stream); + + ggml_cuda_pool_free(dst_f16, dst_as); + + if (src0_as != 0) { + ggml_cuda_pool_free(src0_as_f16, src0_as); + } - if (src0_as > 0) { - ggml_cuda_pool_free(src0_ddq_as_f32, src0_as); + if (src1_as != 0) { + ggml_cuda_pool_free(src1_as_f16, src1_as); + } + } + else { + float * src0_ddq_as_f32 = nullptr; + size_t src0_as = 0; + + if (src0->type != GGML_TYPE_F32) { + const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type); + GGML_ASSERT(to_fp32_cuda != nullptr); + src0_ddq_as_f32 = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_as); // NOLINT + to_fp32_cuda(src0_dd_i, src0_ddq_as_f32, row_diff*ne00, stream); + } + const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32; + + const float alpha = 1.0f; + const float beta = 0.0f; + + CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], stream)); + CUBLAS_CHECK( + cublasSgemm(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N, + row_diff, src1_ncols, ne10, + &alpha, src0_ddf_i, ne00, + src1_ddf_i, ne10, + &beta, dst_dd_i, ldc)); + + if (src0_as != 0) { + ggml_cuda_pool_free(src0_ddq_as_f32, src0_as); + } } (void) dst; @@ -6060,14 +6204,16 @@ inline void ggml_cuda_op_rope( const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) { - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); + GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); + GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); + GGML_ASSERT(src0->type == dst->type); const int64_t ne00 = src0->ne[0]; const int64_t ne01 = src0->ne[1]; + const int64_t ne2 = dst->ne[2]; const int64_t nrows = ggml_nrows(src0); - const int n_past = ((int32_t *) dst->op_params)[0]; + //const int n_past = ((int32_t *) dst->op_params)[0]; const int n_dims = ((int32_t *) dst->op_params)[1]; const int mode = ((int32_t *) dst->op_params)[2]; const int n_ctx = ((int32_t *) dst->op_params)[3]; @@ -6078,19 +6224,38 @@ inline void ggml_cuda_op_rope( memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); const float theta_scale = powf(freq_base, -2.0f/n_dims); - const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale; + + const int32_t * pos = nullptr; + if ((mode & 1) == 0) { + GGML_ASSERT(src1->type == GGML_TYPE_I32); + GGML_ASSERT(src1->ne[0] == ne2); + pos = (const int32_t *) src1_dd; + } const bool is_neox = mode & 2; const bool is_glm = mode & 4; // compute if (is_glm) { - rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, p0, freq_scale, ne01, theta_scale, n_ctx, main_stream); + GGML_ASSERT(false); + rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, n_ctx, main_stream); } else if (is_neox) { GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet"); - rope_neox_f32_cuda(src0_dd, dst_dd, ne00, nrows, p0, freq_scale, ne01, theta_scale, main_stream); + if (src0->type == GGML_TYPE_F32) { + rope_neox_cuda((const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream); + } else if (src0->type == GGML_TYPE_F16) { + rope_neox_cuda((const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream); + } else { + GGML_ASSERT(false); + } } else { - rope_f32_cuda(src0_dd, dst_dd, ne00, nrows, p0, freq_scale, ne01, theta_scale, main_stream); + if (src0->type == GGML_TYPE_F32) { + rope_cuda((const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream); + } else if (src0->type == GGML_TYPE_F16) { + rope_cuda((const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream); + } else { + GGML_ASSERT(false); + } } (void) src1; @@ -6261,7 +6426,7 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s } } -void ggml_cuda_set_peer_access(const int n_tokens) { +static void ggml_cuda_set_peer_access(const int n_tokens) { static bool peer_access_enabled = false; const bool enable_peer_access = n_tokens <= GGML_CUDA_PEER_MAX_BATCH_SIZE; @@ -6589,27 +6754,27 @@ static void ggml_cuda_op_mul_mat( } } -void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_add); } -void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_mul); } -void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_gelu); } -void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_silu); } -void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_norm); } -void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm); } @@ -6620,17 +6785,13 @@ bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_te const int64_t ne1 = dst->ne[1]; // TODO: find the optimal values for these - if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && - src1->type == GGML_TYPE_F32 && - dst->type == GGML_TYPE_F32 && - (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) { - return true; - } - - return false; + return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && + src1->type == GGML_TYPE_F32 && + dst->type == GGML_TYPE_F32 && + (ne0 >= 32 && ne1 >= 32 && ne10 >= 32); } -void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){ +static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){ GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1)); GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT); GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation @@ -6659,7 +6820,7 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream); } -void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){ +static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){ GGML_ASSERT(!ggml_is_contiguous(src0) && ggml_is_contiguous(src1)); GGML_ASSERT(!ggml_is_permuted(src0)); GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT); @@ -6693,7 +6854,7 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1 ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream); } -void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { bool all_on_device = (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) && src1->backend == GGML_BACKEND_GPU && dst->backend == GGML_BACKEND_GPU; @@ -6737,11 +6898,11 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_ } } -void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_scale); } -void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { const int64_t ne = ggml_nelements(src0); GGML_ASSERT(ne == ggml_nelements(src1)); @@ -6783,35 +6944,37 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens ggml_cpy_f32_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream); } else { + fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__, + ggml_type_name(src0->type), ggml_type_name(src1->type)); GGML_ASSERT(false); } (void) dst; } -void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { ggml_cuda_cpy(src0, dst, nullptr); (void) src1; } -void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_diag_mask_inf); } -void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_soft_max); } -void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(ggml_is_contiguous(src0)); // TODO: this restriction is temporary until non-cont support is implemented ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rope); } -void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_alibi); } -void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { (void) src0; (void) src1; (void) dst; @@ -6934,11 +7097,13 @@ static struct ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() { return extra; } -void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace, bool no_alloc) { +static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace, bool no_alloc) { if (scratch && g_scratch_size == 0) { return; } + tensor->backend = GGML_BACKEND_GPU; + // recursively assign CUDA buffers until a compute tensor is found if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) { const ggml_op src0_op = tensor->src[0]->op; @@ -6950,8 +7115,6 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace, no_alloc); } - tensor->backend = GGML_BACKEND_GPU; - if (scratch && no_alloc) { return; } @@ -7036,6 +7199,15 @@ void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset) tensor->extra = extra; } +void ggml_cuda_copy_to_device(struct ggml_tensor * tensor) { + GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); + GGML_ASSERT(ggml_is_contiguous(tensor)); + + struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; + CUDA_CHECK(ggml_cuda_set_device(g_main_device)); + CUDA_CHECK(cudaMemcpy(extra->data_device[g_main_device], tensor->data, ggml_nbytes(tensor), cudaMemcpyHostToDevice)); +} + void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) { ggml_cuda_assign_buffers_impl(tensor, true, false, false); } @@ -7071,7 +7243,12 @@ void ggml_cuda_set_mul_mat_q(const bool mul_mat_q) { } void ggml_cuda_set_scratch_size(const size_t scratch_size) { - g_scratch_size = scratch_size; + // this is a hack to not completely break llama.cpp when using multiple models or contexts simultaneously + // it still won't always work as expected, but it's better than nothing + if (scratch_size > g_scratch_size) { + ggml_cuda_free_scratch(); + } + g_scratch_size = std::max(g_scratch_size, scratch_size); } void ggml_cuda_free_scratch() { diff --git a/ggml-cuda.h b/ggml-cuda.h index a72e82069b9f1430fdbcc007f93ae26ea6c2f924..fda704b6656234ae63e1399fc680c5e5cf6c4a0d 100644 --- a/ggml-cuda.h +++ b/ggml-cuda.h @@ -31,6 +31,7 @@ GGML_API void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tens GGML_API void ggml_cuda_assign_buffers_no_alloc(struct ggml_tensor * tensor); GGML_API void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset); +GGML_API void ggml_cuda_copy_to_device(struct ggml_tensor * tensor); GGML_API void ggml_cuda_set_main_device(int main_device); GGML_API void ggml_cuda_set_mul_mat_q(bool mul_mat_q); diff --git a/ggml-metal.h b/ggml-metal.h index fca28d37ef97069ca7800fc890ea4d013c46437f..790cf0bf7b963d9c1f52a45dbac5be71e0662256 100644 --- a/ggml-metal.h +++ b/ggml-metal.h @@ -19,6 +19,8 @@ #pragma once +#include "ggml.h" + #include #include @@ -33,6 +35,8 @@ struct ggml_cgraph; extern "C" { #endif +void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data); + struct ggml_metal_context; // number of command buffers to use diff --git a/ggml-metal.m b/ggml-metal.m index 917089fcbeb23a03cbb8a19dac4a7f611da2864a..b4a48317d03ab24cb04072ea5d38af92f89132a3 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -11,11 +11,14 @@ #define MIN(a, b) ((a) < (b) ? (a) : (b)) #define MAX(a, b) ((a) > (b) ? (a) : (b)) -// TODO: temporary - reuse llama.cpp logging #ifdef GGML_METAL_NDEBUG -#define metal_printf(...) +#define GGML_METAL_LOG_INFO(...) +#define GGML_METAL_LOG_WARN(...) +#define GGML_METAL_LOG_ERROR(...) #else -#define metal_printf(...) fprintf(stderr, __VA_ARGS__) +#define GGML_METAL_LOG_INFO(...) ggml_metal_log(GGML_LOG_LEVEL_INFO, __VA_ARGS__) +#define GGML_METAL_LOG_WARN(...) ggml_metal_log(GGML_LOG_LEVEL_WARN, __VA_ARGS__) +#define GGML_METAL_LOG_ERROR(...) ggml_metal_log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__) #endif #define UNUSED(x) (void)(x) @@ -100,7 +103,8 @@ struct ggml_metal_context { GGML_METAL_DECL_KERNEL(mul_mm_q4_K_f32); GGML_METAL_DECL_KERNEL(mul_mm_q5_K_f32); GGML_METAL_DECL_KERNEL(mul_mm_q6_K_f32); - GGML_METAL_DECL_KERNEL(rope); + GGML_METAL_DECL_KERNEL(rope_f32); + GGML_METAL_DECL_KERNEL(rope_f16); GGML_METAL_DECL_KERNEL(alibi_f32); GGML_METAL_DECL_KERNEL(cpy_f32_f16); GGML_METAL_DECL_KERNEL(cpy_f32_f32); @@ -120,8 +124,37 @@ static NSString * const msl_library_source = @"see metal.metal"; @implementation GGMLMetalClass @end +ggml_log_callback ggml_metal_log_callback = NULL; +void * ggml_metal_log_user_data = NULL; + +void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data) { + ggml_metal_log_callback = log_callback; + ggml_metal_log_user_data = user_data; +} + +static void ggml_metal_log(enum ggml_log_level level, const char* format, ...){ + if (ggml_metal_log_callback != NULL) { + va_list args; + va_start(args, format); + char buffer[128]; + int len = vsnprintf(buffer, 128, format, args); + if (len < 128) { + ggml_metal_log_callback(level, buffer, ggml_metal_log_user_data); + } else { + char* buffer2 = malloc(len+1); + vsnprintf(buffer2, len+1, format, args); + buffer2[len] = 0; + ggml_metal_log_callback(level, buffer2, ggml_metal_log_user_data); + free(buffer2); + } + va_end(args); + } +} + + + struct ggml_metal_context * ggml_metal_init(int n_cb) { - metal_printf("%s: allocating\n", __func__); + GGML_METAL_LOG_INFO("%s: allocating\n", __func__); id device; NSString * s; @@ -131,14 +164,14 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) { NSArray * devices = MTLCopyAllDevices(); for (device in devices) { s = [device name]; - metal_printf("%s: found device: %s\n", __func__, [s UTF8String]); + GGML_METAL_LOG_INFO("%s: found device: %s\n", __func__, [s UTF8String]); } #endif // Pick and show default Metal device device = MTLCreateSystemDefaultDevice(); s = [device name]; - metal_printf("%s: picking default device: %s\n", __func__, [s UTF8String]); + GGML_METAL_LOG_INFO("%s: picking default device: %s\n", __func__, [s UTF8String]); // Configure context struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context)); @@ -165,7 +198,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) { ctx->library = [ctx->device newLibraryWithURL:libURL error:&error]; if (error) { - metal_printf("%s: error: %s\n", __func__, [[error description] UTF8String]); + GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]); return NULL; } } @@ -179,11 +212,11 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) { //NSString * path = [[NSBundle mainBundle] pathForResource:@"../../examples/metal/metal" ofType:@"metal"]; NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]]; NSString * path = [bundle pathForResource:@"ggml-metal" ofType:@"metal"]; - metal_printf("%s: loading '%s'\n", __func__, [path UTF8String]); + GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [path UTF8String]); NSString * src = [NSString stringWithContentsOfFile:path encoding:NSUTF8StringEncoding error:&error]; if (error) { - metal_printf("%s: error: %s\n", __func__, [[error description] UTF8String]); + GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]); return NULL; } @@ -195,7 +228,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) { ctx->library = [ctx->device newLibraryWithSource:src options:nil error:&error]; #endif if (error) { - metal_printf("%s: error: %s\n", __func__, [[error description] UTF8String]); + GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]); return NULL; } } @@ -207,11 +240,11 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) { #define GGML_METAL_ADD_KERNEL(name) \ ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \ ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \ - metal_printf("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name, \ + GGML_METAL_LOG_INFO("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name, \ (int) ctx->pipeline_##name.maxTotalThreadsPerThreadgroup, \ (int) ctx->pipeline_##name.threadExecutionWidth); \ if (error) { \ - metal_printf("%s: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \ + GGML_METAL_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \ return NULL; \ } @@ -261,7 +294,8 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) { GGML_METAL_ADD_KERNEL(mul_mm_q4_K_f32); GGML_METAL_ADD_KERNEL(mul_mm_q5_K_f32); GGML_METAL_ADD_KERNEL(mul_mm_q6_K_f32); - GGML_METAL_ADD_KERNEL(rope); + GGML_METAL_ADD_KERNEL(rope_f32); + GGML_METAL_ADD_KERNEL(rope_f16); GGML_METAL_ADD_KERNEL(alibi_f32); GGML_METAL_ADD_KERNEL(cpy_f32_f16); GGML_METAL_ADD_KERNEL(cpy_f32_f32); @@ -270,13 +304,13 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) { #undef GGML_METAL_ADD_KERNEL } - metal_printf("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false"); + GGML_METAL_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false"); #if TARGET_OS_OSX - metal_printf("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0); + GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0); if (ctx->device.maxTransferRate != 0) { - metal_printf("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0); + GGML_METAL_LOG_INFO("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0); } else { - metal_printf("%s: maxTransferRate = built-in GPU\n", __func__); + GGML_METAL_LOG_INFO("%s: maxTransferRate = built-in GPU\n", __func__); } #endif @@ -284,7 +318,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) { } void ggml_metal_free(struct ggml_metal_context * ctx) { - metal_printf("%s: deallocating\n", __func__); + GGML_METAL_LOG_INFO("%s: deallocating\n", __func__); #define GGML_METAL_DEL_KERNEL(name) \ [ctx->function_##name release]; \ [ctx->pipeline_##name release]; @@ -335,7 +369,8 @@ void ggml_metal_free(struct ggml_metal_context * ctx) { GGML_METAL_DEL_KERNEL(mul_mm_q4_K_f32); GGML_METAL_DEL_KERNEL(mul_mm_q5_K_f32); GGML_METAL_DEL_KERNEL(mul_mm_q6_K_f32); - GGML_METAL_DEL_KERNEL(rope); + GGML_METAL_DEL_KERNEL(rope_f32); + GGML_METAL_DEL_KERNEL(rope_f16); GGML_METAL_DEL_KERNEL(alibi_f32); GGML_METAL_DEL_KERNEL(cpy_f32_f16); GGML_METAL_DEL_KERNEL(cpy_f32_f32); @@ -360,7 +395,7 @@ void * ggml_metal_host_malloc(size_t n) { void * data = NULL; const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n); if (result != 0) { - metal_printf("%s: error: posix_memalign failed\n", __func__); + GGML_METAL_LOG_ERROR("%s: error: posix_memalign failed\n", __func__); return NULL; } @@ -388,7 +423,7 @@ int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx) { // Metal buffer based on the host memory pointer // static id ggml_metal_get_buffer(struct ggml_metal_context * ctx, struct ggml_tensor * t, size_t * offs) { - //metal_printf("%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach); + //GGML_METAL_LOG_INFO("%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach); const int64_t tsize = ggml_nbytes(t); @@ -400,13 +435,13 @@ static id ggml_metal_get_buffer(struct ggml_metal_context * ctx, stru if (ioffs >= 0 && ioffs + tsize <= (int64_t) ctx->buffers[i].size) { *offs = (size_t) ioffs; - //metal_printf("%s: '%s' tensor '%16s', offs = %8ld\n", __func__, ctx->buffers[i].name, t->name, *offs); + //GGML_METAL_LOG_INFO("%s: '%s' tensor '%16s', offs = %8ld\n", __func__, ctx->buffers[i].name, t->name, *offs); return ctx->buffers[i].metal; } } - metal_printf("%s: error: buffer is nil\n", __func__); + GGML_METAL_LOG_ERROR("%s: error: buffer is nil\n", __func__); return nil; } @@ -418,7 +453,7 @@ bool ggml_metal_add_buffer( size_t size, size_t max_size) { if (ctx->n_buffers >= GGML_METAL_MAX_BUFFERS) { - metal_printf("%s: too many buffers\n", __func__); + GGML_METAL_LOG_ERROR("%s: error: too many buffers\n", __func__); return false; } @@ -428,7 +463,7 @@ bool ggml_metal_add_buffer( const int64_t ioffs = (int64_t) data - (int64_t) ctx->buffers[i].data; if (ioffs >= 0 && ioffs < (int64_t) ctx->buffers[i].size) { - metal_printf("%s: error: buffer '%s' overlaps with '%s'\n", __func__, name, ctx->buffers[i].name); + GGML_METAL_LOG_ERROR("%s: error: buffer '%s' overlaps with '%s'\n", __func__, name, ctx->buffers[i].name); return false; } } @@ -449,11 +484,11 @@ bool ggml_metal_add_buffer( ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil]; if (ctx->buffers[ctx->n_buffers].metal == nil) { - metal_printf("%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1024.0 / 1024.0); + GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1024.0 / 1024.0); return false; } - metal_printf("%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1024.0 / 1024.0); + GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1024.0 / 1024.0); ++ctx->n_buffers; } else { @@ -473,13 +508,13 @@ bool ggml_metal_add_buffer( ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil]; if (ctx->buffers[ctx->n_buffers].metal == nil) { - metal_printf("%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0); + GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0); return false; } - metal_printf("%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i); + GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i); if (i + size_step < size) { - metal_printf("\n"); + GGML_METAL_LOG_INFO("\n"); } ++ctx->n_buffers; @@ -487,17 +522,17 @@ bool ggml_metal_add_buffer( } #if TARGET_OS_OSX - metal_printf(", (%8.2f / %8.2f)", + GGML_METAL_LOG_INFO(", (%8.2f / %8.2f)", ctx->device.currentAllocatedSize / 1024.0 / 1024.0, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0); if (ctx->device.currentAllocatedSize > ctx->device.recommendedMaxWorkingSetSize) { - metal_printf(", warning: current allocated size is greater than the recommended max working set size\n"); + GGML_METAL_LOG_WARN(", warning: current allocated size is greater than the recommended max working set size\n", __func__); } else { - metal_printf("\n"); + GGML_METAL_LOG_INFO("\n"); } #else - metal_printf(", (%8.2f)\n", ctx->device.currentAllocatedSize / 1024.0 / 1024.0); + GGML_METAL_LOG_INFO(", (%8.2f)\n", ctx->device.currentAllocatedSize / 1024.0 / 1024.0); #endif } @@ -610,7 +645,7 @@ void ggml_metal_graph_find_concurrency( } if (ctx->concur_list_len > GGML_MAX_CONCUR) { - metal_printf("%s: too many elements for metal ctx->concur_list!\n", __func__); + GGML_METAL_LOG_WARN("%s: too many elements for metal ctx->concur_list!\n", __func__); } } @@ -664,7 +699,7 @@ void ggml_metal_graph_compute( continue; } - //metal_printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op)); + //GGML_METAL_LOG_INFO("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op)); struct ggml_tensor * src0 = gf->nodes[i]->src[0]; struct ggml_tensor * src1 = gf->nodes[i]->src[1]; @@ -708,17 +743,17 @@ void ggml_metal_graph_compute( id id_src1 = src1 ? ggml_metal_get_buffer(ctx, src1, &offs_src1) : nil; id id_dst = dst ? ggml_metal_get_buffer(ctx, dst, &offs_dst) : nil; - //metal_printf("%s: op - %s\n", __func__, ggml_op_name(dst->op)); + //GGML_METAL_LOG_INFO("%s: op - %s\n", __func__, ggml_op_name(dst->op)); //if (src0) { - // metal_printf("%s: src0 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src0t), ne00, ne01, ne02, + // GGML_METAL_LOG_INFO("%s: src0 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src0t), ne00, ne01, ne02, // ggml_is_contiguous(src0), src0->name); //} //if (src1) { - // metal_printf("%s: src1 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src1t), ne10, ne11, ne12, + // GGML_METAL_LOG_INFO("%s: src1 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src1t), ne10, ne11, ne12, // ggml_is_contiguous(src1), src1->name); //} //if (dst) { - // metal_printf("%s: dst - %4s [%5lld, %5lld, %5lld], 1, %s\n", __func__, ggml_type_name(dstt), ne0, ne1, ne2, + // GGML_METAL_LOG_INFO("%s: dst - %4s [%5lld, %5lld, %5lld], 1, %s\n", __func__, ggml_type_name(dstt), ne0, ne1, ne2, // dst->name); //} @@ -736,25 +771,59 @@ void ggml_metal_graph_compute( GGML_ASSERT(ggml_is_contiguous(src0)); GGML_ASSERT(ggml_is_contiguous(src1)); - // utilize float4 - GGML_ASSERT(ne00 % 4 == 0); - const int64_t nb = ne00/4; + bool bcast_row = false; - if (ggml_nelements(src1) == ne10) { + int64_t nb = ne00; + + if (ggml_nelements(src1) == ne10 && ne00 % 4 == 0) { // src1 is a row GGML_ASSERT(ne11 == 1); + + nb = ne00 / 4; [encoder setComputePipelineState:ctx->pipeline_add_row]; + + bcast_row = true; } else { [encoder setComputePipelineState:ctx->pipeline_add]; } [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; - [encoder setBytes:&nb length:sizeof(nb) atIndex:3]; - - const int64_t n = ggml_nelements(dst)/4; + [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3]; + [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4]; + [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5]; + [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:6]; + [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:7]; + [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:8]; + [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:9]; + [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:10]; + [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:11]; + [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:12]; + [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:13]; + [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:14]; + [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:15]; + [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:16]; + [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:17]; + [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:18]; + [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:19]; + [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:20]; + [encoder setBytes:&ne2 length:sizeof(ne2) atIndex:21]; + [encoder setBytes:&ne3 length:sizeof(ne3) atIndex:22]; + [encoder setBytes:&nb0 length:sizeof(nb0) atIndex:23]; + [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:24]; + [encoder setBytes:&nb2 length:sizeof(nb2) atIndex:25]; + [encoder setBytes:&nb3 length:sizeof(nb3) atIndex:26]; + [encoder setBytes:&nb length:sizeof(nb) atIndex:27]; + + if (bcast_row) { + const int64_t n = ggml_nelements(dst)/4; + + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } else { + const int nth = MIN(1024, ne0); - [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } } break; case GGML_OP_MUL: { @@ -830,13 +899,13 @@ void ggml_metal_graph_compute( } break; default: { - metal_printf("%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); + GGML_METAL_LOG_WARN("%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); GGML_ASSERT(false); } } break; case GGML_OP_SOFT_MAX: { - const int nth = 32; + const int nth = MIN(32, ne00); if (ne00%4 == 0) { [encoder setComputePipelineState:ctx->pipeline_soft_max_4]; @@ -889,7 +958,7 @@ void ggml_metal_graph_compute( src1t == GGML_TYPE_F32 && [ctx->device supportsFamily:MTLGPUFamilyApple7] && ne00%32 == 0 && - ne11 > 1) { + ne11 > 2) { switch (src0->type) { case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_mul_mm_f32_f32]; break; case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_mul_mm_f16_f32]; break; @@ -1019,7 +1088,7 @@ void ggml_metal_graph_compute( } break; default: { - metal_printf("Asserting on type %d\n",(int)src0t); + GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src0t); GGML_ASSERT(false && "not implemented"); } }; @@ -1100,7 +1169,7 @@ void ggml_metal_graph_compute( float eps; memcpy(&eps, dst->op_params, sizeof(float)); - const int nth = 512; + const int nth = MIN(512, ne00); [encoder setComputePipelineState:ctx->pipeline_rms_norm]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; @@ -1119,7 +1188,7 @@ void ggml_metal_graph_compute( float eps; memcpy(&eps, dst->op_params, sizeof(float)); - const int nth = 256; + const int nth = MIN(256, ne00); [encoder setComputePipelineState:ctx->pipeline_norm]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; @@ -1137,6 +1206,8 @@ void ggml_metal_graph_compute( { GGML_ASSERT((src0t == GGML_TYPE_F32)); + const int nth = MIN(1024, ne00); + const int n_past = ((int32_t *) dst->op_params)[0]; UNUSED(n_past); const int n_head = ((int32_t *) dst->op_params)[1]; float max_bias; @@ -1170,12 +1241,14 @@ void ggml_metal_graph_compute( [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17]; [encoder setBytes:&m0 length:sizeof( float) atIndex:18]; - const int nth = 32; - [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; } break; case GGML_OP_ROPE: { + GGML_ASSERT(ne10 == ne02); + + const int nth = MIN(1024, ne00); + const int n_past = ((int32_t *) dst->op_params)[0]; const int n_dims = ((int32_t *) dst->op_params)[1]; const int mode = ((int32_t *) dst->op_params)[2]; @@ -1185,38 +1258,44 @@ void ggml_metal_graph_compute( memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); - [encoder setComputePipelineState:ctx->pipeline_rope]; + switch (src0->type) { + case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_rope_f32]; break; + case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_rope_f16]; break; + default: GGML_ASSERT(false); + }; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; - [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3]; - [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4]; - [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5]; - [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6]; - [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7]; - [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8]; - [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9]; - [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:10]; - [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:11]; - [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:12]; - [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:13]; - [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:14]; - [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15]; - [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16]; - [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17]; - [encoder setBytes:&n_past length:sizeof( int) atIndex:18]; - [encoder setBytes:&n_dims length:sizeof( int) atIndex:19]; - [encoder setBytes:&mode length:sizeof( int) atIndex:20]; - [encoder setBytes:&freq_base length:sizeof(float) atIndex:21]; - [encoder setBytes:&freq_scale length:sizeof(float) atIndex:22]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:3]; + [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:4]; + [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:5]; + [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:6]; + [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:7]; + [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:8]; + [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:9]; + [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:10]; + [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:11]; + [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:12]; + [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:13]; + [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:14]; + [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:15]; + [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:16]; + [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:17]; + [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:18]; + [encoder setBytes:&n_past length:sizeof( int) atIndex:19]; + [encoder setBytes:&n_dims length:sizeof( int) atIndex:20]; + [encoder setBytes:&mode length:sizeof( int) atIndex:21]; + [encoder setBytes:&freq_base length:sizeof(float) atIndex:22]; + [encoder setBytes:&freq_scale length:sizeof(float) atIndex:23]; - [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(32, 1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; } break; case GGML_OP_DUP: case GGML_OP_CPY: case GGML_OP_CONT: { - const int nth = 32; + const int nth = MIN(1024, ne00); switch (src0t) { case GGML_TYPE_F32: @@ -1261,7 +1340,7 @@ void ggml_metal_graph_compute( } break; default: { - metal_printf("%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); + GGML_METAL_LOG_ERROR("%s: error: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); GGML_ASSERT(false); } } @@ -1291,16 +1370,16 @@ void ggml_metal_graph_compute( NSError *error = (MTLCommandBufferError) [ctx->command_buffers[i] error]; int mtl_error_code = [error code]; if (([error domain] == MTLCommandBufferErrorDomain) && ([error code] == MTLCommandBufferErrorOutOfMemory)) { - metal_printf("%s: command buffer %d failed with status MTLCommandBufferStatus.error (5) and error code \ + GGML_METAL_LOG_INFO("%s: command buffer %d failed with status MTLCommandBufferStatus.error (5) and error code \ MTLCommandBufferError.outOfMemory (8)\n"); printf("Metal ran out of memory. Maybe try a smaller context size, or a smaller (more coarsely quantized) model, \ preferably one under the recommended max working set size, or else fall back to running on CPU only.\n"); } else { - metal_printf("%s: command buffer %d failed with status MTLCommandBufferStatus.error (5) and error code %d\n", + GGML_METAL_LOG_INFO("%s: command buffer %d failed with status MTLCommandBufferStatus.error (5) and error code %d\n", __func__, i, mtl_error_code); } } else { - metal_printf("%s: command buffer %d failed with status %lu\n", __func__, i, status); + GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status); } GGML_ASSERT(false); } diff --git a/ggml-metal.metal b/ggml-metal.metal index 7f1c3d9ea74bd031ae831bfe916ab8fa30d211a9..5e1af6a092aed940f3c14fbaca618d348181e8e7 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -24,12 +24,59 @@ typedef struct { int8_t qs[QK8_0]; // quants } block_q8_0; +// general-purpose kernel for addition of two tensors +// pros: works for non-contiguous tensors, supports broadcast across dims 1, 2 and 3 +// cons: not very efficient kernel void kernel_add( - device const float4 * src0, - device const float4 * src1, - device float4 * dst, - uint tpig[[thread_position_in_grid]]) { - dst[tpig] = src0[tpig] + src1[tpig]; + device const char * src0, + device const char * src1, + device char * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne03, + constant int64_t & nb00, + constant int64_t & nb01, + constant int64_t & nb02, + constant int64_t & nb03, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant int64_t & ne13, + constant int64_t & nb10, + constant int64_t & nb11, + constant int64_t & nb12, + constant int64_t & nb13, + constant int64_t & ne0, + constant int64_t & ne1, + constant int64_t & ne2, + constant int64_t & ne3, + constant int64_t & nb0, + constant int64_t & nb1, + constant int64_t & nb2, + constant int64_t & nb3, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { + const int64_t i03 = tgpig.z; + const int64_t i02 = tgpig.y; + const int64_t i01 = tgpig.x; + + const int64_t i13 = i03 % ne13; + const int64_t i12 = i02 % ne12; + const int64_t i11 = i01 % ne11; + + device const char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01 + tpitg.x*nb00; + device const char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11 + tpitg.x*nb10; + device char * dst_ptr = dst + i03*nb3 + i02*nb2 + i01*nb1 + tpitg.x*nb0; + + for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) { + ((device float *)dst_ptr)[0] = ((device float *)src0_ptr)[0] + ((device float *)src1_ptr)[0]; + + src0_ptr += ntg.x*nb00; + src1_ptr += ntg.x*nb10; + dst_ptr += ntg.x*nb0; + } } // assumption: src1 is a row @@ -38,7 +85,7 @@ kernel void kernel_add_row( device const float4 * src0, device const float4 * src1, device float4 * dst, - constant int64_t & nb, + constant int64_t & nb [[buffer(27)]], uint tpig[[thread_position_in_grid]]) { dst[tpig] = src0[tpig] + src1[tpig % nb]; } @@ -806,30 +853,61 @@ kernel void kernel_alibi_f32( } } +typedef void (rope_t)( + device const void * src0, + device const int32_t * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne03, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant uint64_t & nb03, + constant int64_t & ne0, + constant int64_t & ne1, + constant int64_t & ne2, + constant int64_t & ne3, + constant uint64_t & nb0, + constant uint64_t & nb1, + constant uint64_t & nb2, + constant uint64_t & nb3, + constant int & n_past, + constant int & n_dims, + constant int & mode, + constant float & freq_base, + constant float & freq_scale, + uint tiitg[[thread_index_in_threadgroup]], + uint3 tptg[[threads_per_threadgroup]], + uint3 tgpig[[threadgroup_position_in_grid]]); + +template kernel void kernel_rope( - device const void * src0, - device float * dst, - constant int64_t & ne00, - constant int64_t & ne01, - constant int64_t & ne02, - constant int64_t & ne03, - constant uint64_t & nb00, - constant uint64_t & nb01, - constant uint64_t & nb02, - constant uint64_t & nb03, - constant int64_t & ne0, - constant int64_t & ne1, - constant int64_t & ne2, - constant int64_t & ne3, - constant uint64_t & nb0, - constant uint64_t & nb1, - constant uint64_t & nb2, - constant uint64_t & nb3, - constant int & n_past, - constant int & n_dims, - constant int & mode, - constant float & freq_base, - constant float & freq_scale, + device const void * src0, + device const int32_t * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne03, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant uint64_t & nb03, + constant int64_t & ne0, + constant int64_t & ne1, + constant int64_t & ne2, + constant int64_t & ne3, + constant uint64_t & nb0, + constant uint64_t & nb1, + constant uint64_t & nb2, + constant uint64_t & nb3, + constant int & n_past, + constant int & n_dims, + constant int & mode, + constant float & freq_base, + constant float & freq_scale, uint tiitg[[thread_index_in_threadgroup]], uint3 tptg[[threads_per_threadgroup]], uint3 tgpig[[threadgroup_position_in_grid]]) { @@ -839,7 +917,9 @@ kernel void kernel_rope( const bool is_neox = mode & 2; - const int64_t p = ((mode & 1) == 0 ? n_past + i2 : i2); + device const int32_t * pos = src1; + + const int64_t p = pos[i2]; const float theta_0 = freq_scale * (float)p; const float inv_ndims = -1.f/n_dims; @@ -851,11 +931,11 @@ kernel void kernel_rope( const float cos_theta = cos(theta); const float sin_theta = sin(theta); - device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); - device float * dst_data = (device float *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + device const T * const src = (device T *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + device T * dst_data = (device T *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); - const float x0 = src[0]; - const float x1 = src[1]; + const T x0 = src[0]; + const T x1 = src[1]; dst_data[0] = x0*cos_theta - x1*sin_theta; dst_data[1] = x0*sin_theta + x1*cos_theta; @@ -870,8 +950,8 @@ kernel void kernel_rope( const int64_t i0 = ib*n_dims + ic/2; - device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); - device float * dst_data = (device float *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + device const T * const src = (device T *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + device T * dst_data = (device T *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); const float x0 = src[0]; const float x1 = src[n_dims/2]; @@ -883,6 +963,9 @@ kernel void kernel_rope( } } +template [[host_name("kernel_rope_f32")]] kernel rope_t kernel_rope; +template [[host_name("kernel_rope_f16")]] kernel rope_t kernel_rope; + kernel void kernel_cpy_f16_f16( device const half * src0, device half * dst, @@ -1273,8 +1356,8 @@ kernel void kernel_mul_mat_q3_K_f32( float yl[32]; - const uint16_t kmask1 = 0x3030; - const uint16_t kmask2 = 0x0f0f; + //const uint16_t kmask1 = 0x3030; + //const uint16_t kmask2 = 0x0f0f; const int tid = tiisg/4; const int ix = tiisg%4; diff --git a/ggml.c b/ggml.c index 81c40041e4deef5f7608b67ff11ed87f4e168add..e877c4e5e4fbda8249d237c8d290b7d264b8e116 100644 --- a/ggml.c +++ b/ggml.c @@ -89,7 +89,9 @@ static int pthread_create(pthread_t * out, void * unused, thread_ret_t(*func)(vo static int pthread_join(pthread_t thread, void * unused) { (void) unused; - return (int) WaitForSingleObject(thread, INFINITE); + int ret = (int) WaitForSingleObject(thread, INFINITE); + CloseHandle(thread); + return ret; } static int sched_yield (void) { @@ -134,6 +136,7 @@ typedef void * thread_ret_t; #define GGML_SOFT_MAX_UNROLL 4 #define GGML_VEC_DOT_UNROLL 2 +#define GGML_VEC_MAD_UNROLL 32 // // logging @@ -242,18 +245,18 @@ inline static void * ggml_aligned_malloc(size_t size) { // #define GGML_TENSOR_UNARY_OP_LOCALS \ - GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne); \ - GGML_TENSOR_LOCALS(size_t, nb0, src0, nb); \ - GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); \ - GGML_TENSOR_LOCALS(size_t, nb, dst, nb); + GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \ + GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \ + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \ + GGML_TENSOR_LOCALS(size_t, nb, dst, nb) #define GGML_TENSOR_BINARY_OP_LOCALS \ - GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne); \ - GGML_TENSOR_LOCALS(size_t, nb0, src0, nb); \ - GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne); \ - GGML_TENSOR_LOCALS(size_t, nb1, src1, nb); \ - GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); \ - GGML_TENSOR_LOCALS(size_t, nb, dst, nb); + GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \ + GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \ + GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \ + GGML_TENSOR_LOCALS(size_t, nb1, src1, nb) \ + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \ + GGML_TENSOR_LOCALS(size_t, nb, dst, nb) #if defined(GGML_USE_ACCELERATE) #include @@ -1864,7 +1867,7 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) { #define GGML_F16x8_ADD vaddq_f16 #define GGML_F16x8_MUL vmulq_f16 #define GGML_F16x8_REDUCE(res, x) \ - { \ + do { \ int offset = GGML_F16_ARR >> 1; \ for (int i = 0; i < offset; ++i) { \ x[i] = vaddq_f16(x[i], x[offset+i]); \ @@ -1880,7 +1883,7 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) { const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 (x[0])); \ const float32x4_t t1 = vcvt_f32_f16(vget_high_f16(x[0])); \ res = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); \ - } + } while (0) #define GGML_F16_VEC GGML_F16x8 #define GGML_F16_VEC_ZERO GGML_F16x8_ZERO @@ -1941,7 +1944,7 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) { #define GGML_F32x8_ADD _mm256_add_ps #define GGML_F32x8_MUL _mm256_mul_ps #define GGML_F32x8_REDUCE(res, x) \ -{ \ +do { \ int offset = GGML_F32_ARR >> 1; \ for (int i = 0; i < offset; ++i) { \ x[i] = _mm256_add_ps(x[i], x[offset+i]); \ @@ -1958,7 +1961,7 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) { _mm256_extractf128_ps(x[0], 1)); \ const __m128 t1 = _mm_hadd_ps(t0, t0); \ res = _mm_cvtss_f32(_mm_hadd_ps(t1, t1)); \ -} +} while (0) // TODO: is this optimal ? #define GGML_F32_VEC GGML_F32x8 @@ -3708,6 +3711,58 @@ inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float #endif } +// xs and vs are byte strides of x and v +inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int vs, float * restrict y, const float * restrict xv, const float * restrict vv) { + + const float * restrict x[GGML_VEC_MAD_UNROLL]; + const float * restrict v[GGML_VEC_MAD_UNROLL]; + + for (int i = 0; i < GGML_VEC_MAD_UNROLL; ++i) { + x[i] = (const float *) ((const char *) xv + i*xs); + v[i] = (const float *) ((const char *) vv + i*vs); + } + +#if defined(GGML_SIMD) + const int np = (n & ~(GGML_F32_STEP - 1)); + + GGML_F32_VEC vx[GGML_VEC_MAD_UNROLL]; + + for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) { + vx[k] = GGML_F32_VEC_SET1(v[k][0]); + } + + GGML_F32_VEC ax[GGML_VEC_MAD_UNROLL][GGML_F32_ARR]; + GGML_F32_VEC ay[GGML_F32_ARR]; + + for (int i = 0; i < np; i += GGML_F32_STEP) { + for (int j = 0; j < GGML_F32_ARR; j++) { + ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR); + + for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) { + ax[k][j] = GGML_F32_VEC_LOAD(x[k] + i + j*GGML_F32_EPR); + ay[j] = GGML_F32_VEC_FMA(ay[j], ax[k][j], vx[k]); + } + + GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]); + } + } + + // leftovers + for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) { + for (int i = np; i < n; ++i) { + y[i] += x[k][i]*v[k][0]; + } + } +#else + // scalar + for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) { + for (int i = 0; i < n; ++i) { + y[i] += x[k][i]*v[k][0]; + } + } +#endif +} + //inline static void ggml_vec_scale_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] *= v; } inline static void ggml_vec_scale_f32(const int n, float * y, const float v) { #if defined(GGML_USE_ACCELERATE) @@ -4393,10 +4448,9 @@ static inline bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct static inline bool ggml_can_out_prod(const struct ggml_tensor * t0, const struct ggml_tensor * t1) { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); - return - (t0->ne[1] == t1->ne[1]) && - (t0->ne[2] == t1->ne[2]) && - (t0->ne[3] == t1->ne[3]); + return (t0->ne[1] == t1->ne[1]) && + (t1->ne[2]%t0->ne[2] == 0) && // verify t0 is broadcastable + (t1->ne[3]%t0->ne[3] == 0); } enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) { @@ -5066,43 +5120,78 @@ struct ggml_tensor * ggml_set_f32(struct ggml_tensor * tensor, float value) { return tensor; } +void ggml_unravel_index(const struct ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3) { + const int64_t ne2 = tensor->ne[2]; + const int64_t ne1 = tensor->ne[1]; + const int64_t ne0 = tensor->ne[0]; + + const int64_t i3_ = (i/(ne2*ne1*ne0)); + const int64_t i2_ = (i - i3_*ne2*ne1*ne0)/(ne1*ne0); + const int64_t i1_ = (i - i3_*ne2*ne1*ne0 - i2_*ne1*ne0)/ne0; + const int64_t i0_ = (i - i3_*ne2*ne1*ne0 - i2_*ne1*ne0 - i1_*ne0); + + if (i0) { + * i0 = i0_; + } + if (i1) { + * i1 = i1_; + } + if (i2) { + * i2 = i2_; + } + if (i3) { + * i3 = i3_; + } +} + int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i) { + if (!ggml_is_contiguous(tensor)) { + int64_t id[4] = { 0, 0, 0, 0 }; + ggml_unravel_index(tensor, i, &id[0], &id[1], &id[2], &id[3]); + return ggml_get_i32_nd(tensor, id[0], id[1], id[2], id[3]); + } switch (tensor->type) { case GGML_TYPE_I8: { GGML_ASSERT(tensor->nb[0] == sizeof(int8_t)); return ((int8_t *)(tensor->data))[i]; - } break; + } case GGML_TYPE_I16: { GGML_ASSERT(tensor->nb[0] == sizeof(int16_t)); return ((int16_t *)(tensor->data))[i]; - } break; + } case GGML_TYPE_I32: { GGML_ASSERT(tensor->nb[0] == sizeof(int32_t)); return ((int32_t *)(tensor->data))[i]; - } break; + } case GGML_TYPE_F16: { GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t)); return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]); - } break; + } case GGML_TYPE_F32: { GGML_ASSERT(tensor->nb[0] == sizeof(float)); return ((float *)(tensor->data))[i]; - } break; + } default: { GGML_ASSERT(false); - } break; + } } return 0.0f; } void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value) { + if (!ggml_is_contiguous(tensor)) { + int64_t id[4] = { 0, 0, 0, 0 }; + ggml_unravel_index(tensor, i, &id[0], &id[1], &id[2], &id[3]); + ggml_set_i32_nd(tensor, id[0], id[1], id[2], id[3], value); + return; + } switch (tensor->type) { case GGML_TYPE_I8: { @@ -5136,43 +5225,104 @@ void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value) { } } +int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3) { + void * data = (char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]; + switch (tensor->type) { + case GGML_TYPE_I8: + return ((int8_t *) data)[0]; + case GGML_TYPE_I16: + return ((int16_t *) data)[0]; + case GGML_TYPE_I32: + return ((int32_t *) data)[0]; + case GGML_TYPE_F16: + return GGML_FP16_TO_FP32(((ggml_fp16_t *) data)[0]); + case GGML_TYPE_F32: + return ((float *) data)[0]; + default: + GGML_ASSERT(false); + } + + return 0.0f; +} + +void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value) { + void * data = (char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]; + switch (tensor->type) { + case GGML_TYPE_I8: + { + ((int8_t *)(data))[0] = value; + } break; + case GGML_TYPE_I16: + { + ((int16_t *)(data))[0] = value; + } break; + case GGML_TYPE_I32: + { + ((int32_t *)(data))[0] = value; + } break; + case GGML_TYPE_F16: + { + ((ggml_fp16_t *)(data))[0] = GGML_FP32_TO_FP16(value); + } break; + case GGML_TYPE_F32: + { + ((float *)(data))[0] = value; + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i) { + if (!ggml_is_contiguous(tensor)) { + int64_t id[4] = { 0, 0, 0, 0 }; + ggml_unravel_index(tensor, i, &id[0], &id[1], &id[2], &id[3]); + return ggml_get_f32_nd(tensor, id[0], id[1], id[2], id[3]); + } switch (tensor->type) { case GGML_TYPE_I8: { GGML_ASSERT(tensor->nb[0] == sizeof(int8_t)); return ((int8_t *)(tensor->data))[i]; - } break; + } case GGML_TYPE_I16: { GGML_ASSERT(tensor->nb[0] == sizeof(int16_t)); return ((int16_t *)(tensor->data))[i]; - } break; + } case GGML_TYPE_I32: { GGML_ASSERT(tensor->nb[0] == sizeof(int32_t)); return ((int32_t *)(tensor->data))[i]; - } break; + } case GGML_TYPE_F16: { GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t)); return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]); - } break; + } case GGML_TYPE_F32: { GGML_ASSERT(tensor->nb[0] == sizeof(float)); return ((float *)(tensor->data))[i]; - } break; + } default: { GGML_ASSERT(false); - } break; + } } return 0.0f; } void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value) { + if (!ggml_is_contiguous(tensor)) { + int64_t id[4] = { 0, 0, 0, 0 }; + ggml_unravel_index(tensor, i, &id[0], &id[1], &id[2], &id[3]); + ggml_set_f32_nd(tensor, id[0], id[1], id[2], id[3], value); + return; + } switch (tensor->type) { case GGML_TYPE_I8: { @@ -5206,6 +5356,56 @@ void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value) { } } +float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3) { + void * data = (char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]; + switch (tensor->type) { + case GGML_TYPE_I8: + return ((int8_t *) data)[0]; + case GGML_TYPE_I16: + return ((int16_t *) data)[0]; + case GGML_TYPE_I32: + return ((int32_t *) data)[0]; + case GGML_TYPE_F16: + return GGML_FP16_TO_FP32(((ggml_fp16_t *) data)[0]); + case GGML_TYPE_F32: + return ((float *) data)[0]; + default: + GGML_ASSERT(false); + } + + return 0.0f; +} + +void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value) { + void * data = (char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]; + switch (tensor->type) { + case GGML_TYPE_I8: + { + ((int8_t *)(data))[0] = value; + } break; + case GGML_TYPE_I16: + { + ((int16_t *)(data))[0] = value; + } break; + case GGML_TYPE_I32: + { + ((int32_t *)(data))[0] = value; + } break; + case GGML_TYPE_F16: + { + ((ggml_fp16_t *)(data))[0] = GGML_FP32_TO_FP16(value); + } break; + case GGML_TYPE_F32: + { + ((float *)(data))[0] = value; + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + void * ggml_get_data(const struct ggml_tensor * tensor) { return tensor->data; } @@ -5348,6 +5548,44 @@ struct ggml_tensor * ggml_add_inplace( return ggml_add_impl(ctx, a, b, true); } +// ggml_add_cast + +static struct ggml_tensor * ggml_add_cast_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + enum ggml_type type) { + // TODO: support less-strict constraint + // GGML_ASSERT(ggml_can_repeat(b, a)); + GGML_ASSERT(ggml_can_repeat_rows(b, a)); + GGML_ASSERT(ggml_is_quantized(a->type)); // currently only supported for quantized input + + bool is_node = false; + + if (a->grad || b->grad) { + // TODO: support backward pass for broadcasting + GGML_ASSERT(ggml_are_same_shape(a, b)); + is_node = true; + } + + struct ggml_tensor * result = ggml_new_tensor(ctx, type, a->n_dims, a->ne); + + result->op = GGML_OP_ADD; + result->grad = is_node ? ggml_new_tensor(ctx, GGML_TYPE_F32, a->n_dims, a->ne) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +struct ggml_tensor * ggml_add_cast( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + enum ggml_type type) { + return ggml_add_cast_impl(ctx, a, b, type); +} + // ggml_add1 static struct ggml_tensor * ggml_add1_impl( @@ -5784,7 +6022,6 @@ struct ggml_tensor * ggml_repeat( result->op = GGML_OP_REPEAT; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = b; return result; } @@ -5812,7 +6049,6 @@ struct ggml_tensor * ggml_repeat_back( result->op = GGML_OP_REPEAT_BACK; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = b; return result; } @@ -6187,8 +6423,9 @@ struct ggml_tensor * ggml_out_prod( is_node = true; } - const int64_t ne[4] = { a->ne[0], b->ne[0], a->ne[2], b->ne[3] }; - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MIN(a->n_dims, b->n_dims), ne); + // a is broadcastable to b for ne[2] and ne[3] -> use b->ne[2] and b->ne[3] + const int64_t ne[4] = { a->ne[0], b->ne[0], b->ne[2], b->ne[3] }; + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MAX(a->n_dims, b->n_dims), ne); result->op = GGML_OP_OUT_PROD; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -6407,6 +6644,54 @@ struct ggml_tensor * ggml_cont_inplace( return ggml_cont_impl(ctx, a, true); } + +// make contiguous, with new shape +GGML_API struct ggml_tensor * ggml_cont_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0) { + return ggml_cont_4d(ctx, a, ne0, 1, 1, 1); +} + +GGML_API struct ggml_tensor * ggml_cont_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1) { + return ggml_cont_4d(ctx, a, ne0, ne1, 1, 1); +} + +GGML_API struct ggml_tensor * ggml_cont_3d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2) { + return ggml_cont_4d(ctx, a, ne0, ne1, ne2, 1); +} + +struct ggml_tensor * ggml_cont_4d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2, + int64_t ne3) { + GGML_ASSERT(ggml_nelements(a) == (ne0*ne1*ne2*ne3)); + + bool is_node = false; + + struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, ne0, ne1, ne2, ne3); + ggml_format_name(result, "%s (cont)", a->name); + + result->op = GGML_OP_CONT; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + + // ggml_reshape struct ggml_tensor * ggml_reshape( @@ -6414,7 +6699,7 @@ struct ggml_tensor * ggml_reshape( struct ggml_tensor * a, struct ggml_tensor * b) { GGML_ASSERT(ggml_is_contiguous(a)); - GGML_ASSERT(ggml_is_contiguous(b)); + // as only the shape of b is relevant, and not its memory layout, b is allowed to be non contiguous. GGML_ASSERT(ggml_nelements(a) == ggml_nelements(b)); bool is_node = false; @@ -6787,7 +7072,6 @@ struct ggml_tensor * ggml_get_rows_back( result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; result->src[1] = b; - result->src[2] = c; return result; } @@ -6969,7 +7253,7 @@ struct ggml_tensor * ggml_soft_max_back_inplace( static struct ggml_tensor * ggml_rope_impl( struct ggml_context * ctx, struct ggml_tensor * a, - int n_past, + struct ggml_tensor * b, int n_dims, int mode, int n_ctx, @@ -6978,7 +7262,10 @@ static struct ggml_tensor * ggml_rope_impl( float xpos_base, bool xpos_down, bool inplace) { - GGML_ASSERT(n_past >= 0); + GGML_ASSERT(ggml_is_vector(b)); + GGML_ASSERT(b->type == GGML_TYPE_I32); + GGML_ASSERT(a->ne[2] == b->ne[0]); + bool is_node = false; if (a->grad) { @@ -6987,7 +7274,7 @@ static struct ggml_tensor * ggml_rope_impl( struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - int32_t params[8] = { n_past, n_dims, mode, n_ctx }; + int32_t params[8] = { /*n_past*/ 0, n_dims, mode, n_ctx }; memcpy(params + 4, &freq_base, sizeof(float)); memcpy(params + 5, &freq_scale, sizeof(float)); memcpy(params + 6, &xpos_base, sizeof(float)); @@ -6997,6 +7284,7 @@ static struct ggml_tensor * ggml_rope_impl( result->op = GGML_OP_ROPE; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; + result->src[1] = b; return result; } @@ -7004,55 +7292,55 @@ static struct ggml_tensor * ggml_rope_impl( struct ggml_tensor * ggml_rope( struct ggml_context * ctx, struct ggml_tensor * a, - int n_past, + struct ggml_tensor * b, int n_dims, int mode, int n_ctx) { - return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, 0.0f, false, false); + return ggml_rope_impl(ctx, a, b, n_dims, mode, n_ctx, 10000.0f, 1.0f, 0.0f, false, false); } struct ggml_tensor * ggml_rope_inplace( struct ggml_context * ctx, struct ggml_tensor * a, - int n_past, + struct ggml_tensor * b, int n_dims, int mode, int n_ctx) { - return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, 0.0f, false, true); + return ggml_rope_impl(ctx, a, b, n_dims, mode, n_ctx, 10000.0f, 1.0f, 0.0f, false, true); } struct ggml_tensor * ggml_rope_custom( struct ggml_context * ctx, struct ggml_tensor * a, - int n_past, + struct ggml_tensor * b, int n_dims, int mode, int n_ctx, float freq_base, float freq_scale) { - return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, 0.0f, false, false); + return ggml_rope_impl(ctx, a, b, n_dims, mode, n_ctx, freq_base, freq_scale, 0.0f, false, false); } struct ggml_tensor * ggml_rope_custom_inplace( struct ggml_context * ctx, struct ggml_tensor * a, - int n_past, + struct ggml_tensor * b, int n_dims, int mode, int n_ctx, float freq_base, float freq_scale) { - return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, 0.0f, false, true); + return ggml_rope_impl(ctx, a, b, n_dims, mode, n_ctx, freq_base, freq_scale, 0.0f, false, true); } struct ggml_tensor * ggml_rope_xpos_inplace( struct ggml_context * ctx, struct ggml_tensor * a, - int n_past, + struct ggml_tensor * b, int n_dims, float base, bool down) { - return ggml_rope_impl(ctx, a, n_past, n_dims, 0, 0, 10000.0f, 1.0f, base, down, true); + return ggml_rope_impl(ctx, a, b, n_dims, 0, 0, 10000.0f, 1.0f, base, down, true); } // ggml_rope_back @@ -7060,7 +7348,7 @@ struct ggml_tensor * ggml_rope_xpos_inplace( struct ggml_tensor * ggml_rope_back( struct ggml_context * ctx, struct ggml_tensor * a, - int n_past, + struct ggml_tensor * b, int n_dims, int mode, int n_ctx, @@ -7068,7 +7356,10 @@ struct ggml_tensor * ggml_rope_back( float freq_scale, float xpos_base, bool xpos_down) { - GGML_ASSERT(n_past >= 0); + GGML_ASSERT(ggml_is_vector(b)); + GGML_ASSERT(b->type == GGML_TYPE_I32); + GGML_ASSERT(a->ne[2] == b->ne[0]); + GGML_ASSERT((mode & 4) == 0 && "ggml_rope_back() for ChatGLM not implemented yet"); bool is_node = false; @@ -7079,7 +7370,7 @@ struct ggml_tensor * ggml_rope_back( struct ggml_tensor * result = ggml_dup_tensor(ctx, a); - int32_t params[8] = { n_past, n_dims, mode, n_ctx }; + int32_t params[8] = { /*n_past*/ 0, n_dims, mode, n_ctx }; memcpy(params + 4, &freq_base, sizeof(float)); memcpy(params + 5, &freq_scale, sizeof(float)); memcpy(params + 6, &xpos_base, sizeof(float)); @@ -7089,6 +7380,7 @@ struct ggml_tensor * ggml_rope_back( result->op = GGML_OP_ROPE_BACK; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; + result->src[1] = b; return result; } @@ -7485,27 +7777,30 @@ struct ggml_tensor * ggml_flash_attn_back( // d shape [D,N,ne2,ne3] // q shape [D,N,ne2,ne3] - // k shape [D,M,ne2,ne3] - // v shape [M,D,ne2,ne3] + // k shape [D,M,kvne2,ne3] + // v shape [M,D,kvne2,ne3] - const int64_t D = q->ne[0]; - const int64_t N = q->ne[1]; - const int64_t M = k->ne[1]; - const int64_t ne2 = q->ne[2]; - const int64_t ne3 = q->ne[3]; + const int64_t D = q->ne[0]; + const int64_t N = q->ne[1]; + const int64_t M = k->ne[1]; + const int64_t ne2 = q->ne[2]; + const int64_t ne3 = q->ne[3]; + const int64_t kvne2 = k->ne[2]; GGML_ASSERT(k->ne[0] == D); GGML_ASSERT(v->ne[0] == M); GGML_ASSERT(v->ne[1] == D); GGML_ASSERT(d->ne[0] == D); GGML_ASSERT(d->ne[1] == N); - GGML_ASSERT(k->ne[2] == ne2); + GGML_ASSERT(k->ne[2] == kvne2); GGML_ASSERT(k->ne[3] == ne3); - GGML_ASSERT(v->ne[2] == ne2); + GGML_ASSERT(v->ne[2] == kvne2); GGML_ASSERT(v->ne[3] == ne3); GGML_ASSERT(d->ne[2] == ne2); GGML_ASSERT(d->ne[3] == ne3); + GGML_ASSERT(ne2 % kvne2 == 0); + bool is_node = false; if (q->grad || k->grad || v->grad) { @@ -7515,14 +7810,23 @@ struct ggml_tensor * ggml_flash_attn_back( } // store gradients of q, k and v as continuous tensors concatenated in result. - // q shape[D,N,ne2,ne3] ; k shape [D,M,ne2,ne3] ; v shape [M,D,ne2,ne3] - // gradq->data = result->data - // gradk->data = result->data + nb0*D*N*ne2*ne3 - // gradv->data = result->data + nb0*D*N*ne2*ne3 + nb0*D*M*ne2*ne3 // note: v and gradv are actually transposed, i.e. v->ne[0] != D. - int64_t ne[4] = {D,M+N+M,ne2,ne3}; + const int64_t elem_q = ggml_nelements(q); + const int64_t elem_k = ggml_nelements(k); + const int64_t elem_v = ggml_nelements(v); - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); + enum ggml_type result_type = GGML_TYPE_F32; + GGML_ASSERT(ggml_blck_size(result_type) == 1); + const size_t tsize = ggml_type_size(result_type); + + const size_t offs_q = 0; + const size_t offs_k = offs_q + GGML_PAD(elem_q * tsize, GGML_MEM_ALIGN); + const size_t offs_v = offs_k + GGML_PAD(elem_k * tsize, GGML_MEM_ALIGN); + const size_t end = offs_v + GGML_PAD(elem_v * tsize, GGML_MEM_ALIGN); + + const size_t nelements = (end + tsize - 1)/tsize; + + struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nelements); int32_t masked_i = masked ? 1 : 0; ggml_set_op_params(result, &masked_i, sizeof(masked_i)); @@ -8215,7 +8519,7 @@ static void ggml_compute_forward_dup_f16( return; } - GGML_TENSOR_UNARY_OP_LOCALS; + GGML_TENSOR_UNARY_OP_LOCALS const int ith = params->ith; // thread index const int nth = params->nth; // number of threads @@ -8486,7 +8790,7 @@ static void ggml_compute_forward_dup_f32( return; } - GGML_TENSOR_UNARY_OP_LOCALS; + GGML_TENSOR_UNARY_OP_LOCALS const int ith = params->ith; // thread index const int nth = params->nth; // number of threads @@ -8767,7 +9071,7 @@ static void ggml_compute_forward_add_f32( const int nr = ggml_nrows(src0); - GGML_TENSOR_BINARY_OP_LOCALS; + GGML_TENSOR_BINARY_OP_LOCALS GGML_ASSERT( nb0 == sizeof(float)); GGML_ASSERT(nb00 == sizeof(float)); @@ -8799,8 +9103,6 @@ static void ggml_compute_forward_add_f32( #else ggml_vec_add_f32(ne00, dst_ptr, src0_ptr, src1_ptr); #endif - // } - // } } } else { // src1 is not contiguous @@ -8842,7 +9144,7 @@ static void ggml_compute_forward_add_f16_f32( const int nr = ggml_nrows(src0); - GGML_TENSOR_BINARY_OP_LOCALS; + GGML_TENSOR_BINARY_OP_LOCALS GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F32); @@ -8896,7 +9198,7 @@ static void ggml_compute_forward_add_f16_f16( const int nr = ggml_nrows(src0); - GGML_TENSOR_BINARY_OP_LOCALS; + GGML_TENSOR_BINARY_OP_LOCALS GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F16); @@ -8947,14 +9249,15 @@ static void ggml_compute_forward_add_q_f32( const int nr = ggml_nrows(src0); - GGML_TENSOR_BINARY_OP_LOCALS; + GGML_TENSOR_BINARY_OP_LOCALS const int ith = params->ith; const int nth = params->nth; const enum ggml_type type = src0->type; + const enum ggml_type dtype = dst->type; ggml_to_float_t const dequantize_row_q = type_traits[type].to_float; - ggml_from_float_t const quantize_row_q = type_traits[type].from_float; + ggml_from_float_t const quantize_row_q = type_traits[dtype].from_float; // we don't support permuted src0 or src1 GGML_ASSERT(nb00 == ggml_type_size(type)); @@ -8966,7 +9269,6 @@ static void ggml_compute_forward_add_q_f32( GGML_ASSERT(nb2 <= nb3); GGML_ASSERT(ggml_is_quantized(src0->type)); - GGML_ASSERT(dst->type == src0->type); GGML_ASSERT(src1->type == GGML_TYPE_F32); // rows per thread @@ -9004,7 +9306,11 @@ static void ggml_compute_forward_add_q_f32( // add src1 ggml_vec_acc_f32(ne00, wdata, src1_row); // quantize row to dst - quantize_row_q(wdata, dst_row, ne00); + if (quantize_row_q != NULL) { + quantize_row_q(wdata, dst_row, ne00); + } else { + memcpy(dst_row, wdata, ne0*nb0); + } } } @@ -9069,7 +9375,7 @@ static void ggml_compute_forward_add1_f32( const int nr = ggml_nrows(src0); - GGML_TENSOR_UNARY_OP_LOCALS; + GGML_TENSOR_UNARY_OP_LOCALS GGML_ASSERT( nb0 == sizeof(float)); GGML_ASSERT(nb00 == sizeof(float)); @@ -9124,7 +9430,7 @@ static void ggml_compute_forward_add1_f16_f32( const int nr = ggml_nrows(src0); - GGML_TENSOR_UNARY_OP_LOCALS; + GGML_TENSOR_UNARY_OP_LOCALS GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F32); @@ -9174,7 +9480,7 @@ static void ggml_compute_forward_add1_f16_f16( const int nr = ggml_nrows(src0); - GGML_TENSOR_UNARY_OP_LOCALS; + GGML_TENSOR_UNARY_OP_LOCALS GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F16); @@ -9224,7 +9530,7 @@ static void ggml_compute_forward_add1_q_f32( const int nr = ggml_nrows(src0); - GGML_TENSOR_UNARY_OP_LOCALS; + GGML_TENSOR_UNARY_OP_LOCALS const enum ggml_type type = src0->type; ggml_to_float_t const dequantize_row_q = type_traits[type].to_float; @@ -9352,8 +9658,8 @@ static void ggml_compute_forward_acc_f32( const int nr = ggml_nrows(src1); const int nc = src1->ne[0]; - GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne); - GGML_TENSOR_LOCALS(size_t, nb1, src1, nb); + GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) + GGML_TENSOR_LOCALS(size_t, nb1, src1, nb) // src0 and dst as viewed during acc const size_t nb0 = ggml_element_size(src0); @@ -9442,7 +9748,7 @@ static void ggml_compute_forward_sub_f32( const int nr = ggml_nrows(src0); - GGML_TENSOR_BINARY_OP_LOCALS; + GGML_TENSOR_BINARY_OP_LOCALS GGML_ASSERT( nb0 == sizeof(float)); GGML_ASSERT(nb00 == sizeof(float)); @@ -9532,7 +9838,7 @@ static void ggml_compute_forward_mul_f32( const int64_t nr = ggml_nrows(src0); - GGML_TENSOR_BINARY_OP_LOCALS; + GGML_TENSOR_BINARY_OP_LOCALS GGML_ASSERT( nb0 == sizeof(float)); GGML_ASSERT(nb00 == sizeof(float)); @@ -9623,7 +9929,7 @@ static void ggml_compute_forward_div_f32( const int nr = ggml_nrows(src0); - GGML_TENSOR_BINARY_OP_LOCALS; + GGML_TENSOR_BINARY_OP_LOCALS GGML_ASSERT( nb0 == sizeof(float)); GGML_ASSERT(nb00 == sizeof(float)); @@ -9832,8 +10138,8 @@ static void ggml_compute_forward_sum_f32( assert(ggml_is_scalar(dst)); assert(src0->nb[0] == sizeof(float)); - GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne); - GGML_TENSOR_LOCALS(size_t, nb0, src0, nb); + GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) + GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) ggml_float sum = 0; ggml_float row_sum = 0; @@ -9864,8 +10170,8 @@ static void ggml_compute_forward_sum_f16( assert(src0->nb[0] == sizeof(ggml_fp16_t)); - GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne); - GGML_TENSOR_LOCALS(size_t, nb0, src0, nb); + GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) + GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) float sum = 0; float row_sum = 0; @@ -9918,7 +10224,7 @@ static void ggml_compute_forward_sum_rows_f32( GGML_ASSERT(src0->nb[0] == sizeof(float)); GGML_ASSERT(dst->nb[0] == sizeof(float)); - GGML_TENSOR_UNARY_OP_LOCALS; + GGML_TENSOR_UNARY_OP_LOCALS GGML_ASSERT(ne0 == 1); GGML_ASSERT(ne1 == ne01); @@ -9968,7 +10274,7 @@ static void ggml_compute_forward_mean_f32( assert(src0->nb[0] == sizeof(float)); - GGML_TENSOR_UNARY_OP_LOCALS; + GGML_TENSOR_UNARY_OP_LOCALS assert(ne0 == 1); assert(ne1 == ne01); @@ -10068,7 +10374,7 @@ static void ggml_compute_forward_repeat_f32( return; } - GGML_TENSOR_UNARY_OP_LOCALS; + GGML_TENSOR_UNARY_OP_LOCALS // guaranteed to be an integer due to the check in ggml_can_repeat const int nr0 = (int)(ne0/ne00); @@ -10100,11 +10406,61 @@ static void ggml_compute_forward_repeat_f32( } } +static void ggml_compute_forward_repeat_f16( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + GGML_ASSERT(params->ith == 0); + GGML_ASSERT(ggml_can_repeat(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + GGML_TENSOR_UNARY_OP_LOCALS; + + // guaranteed to be an integer due to the check in ggml_can_repeat + const int nr0 = (int)(ne0/ne00); + const int nr1 = (int)(ne1/ne01); + const int nr2 = (int)(ne2/ne02); + const int nr3 = (int)(ne3/ne03); + + // TODO: support for transposed / permuted tensors + GGML_ASSERT(nb0 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + + // TODO: maybe this is not optimal? + for (int i3 = 0; i3 < nr3; i3++) { + for (int k3 = 0; k3 < ne03; k3++) { + for (int i2 = 0; i2 < nr2; i2++) { + for (int k2 = 0; k2 < ne02; k2++) { + for (int i1 = 0; i1 < nr1; i1++) { + for (int k1 = 0; k1 < ne01; k1++) { + for (int i0 = 0; i0 < nr0; i0++) { + ggml_fp16_t * y = (ggml_fp16_t *) ((char *) dst->data + (i3*ne03 + k3)*nb3 + (i2*ne02 + k2)*nb2 + (i1*ne01 + k1)*nb1 + (i0*ne00)*nb0); + ggml_fp16_t * x = (ggml_fp16_t *) ((char *) src0->data + ( k3)*nb03 + ( k2)*nb02 + ( k1)*nb01); + // ggml_vec_cpy_f16(ne00, y, x) + for (int i = 0; i < ne00; ++i) { + y[i] = x[i]; + } + } + } + } + } + } + } + } +} + static void ggml_compute_forward_repeat( const struct ggml_compute_params * params, const struct ggml_tensor * src0, struct ggml_tensor * dst) { switch (src0->type) { + case GGML_TYPE_F16: + { + ggml_compute_forward_repeat_f16(params, src0, dst); + } break; case GGML_TYPE_F32: { ggml_compute_forward_repeat_f32(params, src0, dst); @@ -10129,7 +10485,7 @@ static void ggml_compute_forward_repeat_back_f32( return; } - GGML_TENSOR_UNARY_OP_LOCALS; + GGML_TENSOR_UNARY_OP_LOCALS // guaranteed to be an integer due to the check in ggml_can_repeat const int nr0 = (int)(ne00/ne0); @@ -10207,7 +10563,7 @@ static void ggml_compute_forward_concat_f32( const int ith = params->ith; - GGML_TENSOR_BINARY_OP_LOCALS; + GGML_TENSOR_BINARY_OP_LOCALS // TODO: support for transposed / permuted tensors GGML_ASSERT(nb0 == sizeof(float)); @@ -10809,7 +11165,7 @@ static void ggml_compute_forward_norm_f32( const int ith = params->ith; const int nth = params->nth; - GGML_TENSOR_UNARY_OP_LOCALS; + GGML_TENSOR_UNARY_OP_LOCALS float eps; memcpy(&eps, dst->op_params, sizeof(float)); @@ -10878,7 +11234,7 @@ static void ggml_compute_forward_rms_norm_f32( const int ith = params->ith; const int nth = params->nth; - GGML_TENSOR_UNARY_OP_LOCALS; + GGML_TENSOR_UNARY_OP_LOCALS float eps; memcpy(&eps, dst->op_params, sizeof(float)); @@ -10943,7 +11299,7 @@ static void ggml_compute_forward_rms_norm_back_f32( const int ith = params->ith; const int nth = params->nth; - GGML_TENSOR_BINARY_OP_LOCALS; + GGML_TENSOR_BINARY_OP_LOCALS float eps; memcpy(&eps, dst->op_params, sizeof(float)); @@ -11118,7 +11474,7 @@ static void ggml_compute_forward_group_norm_f32( const int ith = params->ith; const int nth = params->nth; - GGML_TENSOR_UNARY_OP_LOCALS; + GGML_TENSOR_UNARY_OP_LOCALS const float eps = 1e-6f; // TODO: make this a parameter @@ -11229,7 +11585,7 @@ static void ggml_compute_forward_mul_mat( int64_t t0 = ggml_perf_time_us(); UNUSED(t0); - GGML_TENSOR_BINARY_OP_LOCALS; + GGML_TENSOR_BINARY_OP_LOCALS const int ith = params->ith; const int nth = params->nth; @@ -11439,10 +11795,10 @@ static void ggml_compute_forward_out_prod_f32( const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { - int64_t t0 = ggml_perf_time_us(); - UNUSED(t0); + // int64_t t0 = ggml_perf_time_us(); + // UNUSED(t0); - GGML_TENSOR_BINARY_OP_LOCALS; + GGML_TENSOR_BINARY_OP_LOCALS const int ith = params->ith; const int nth = params->nth; @@ -11481,6 +11837,146 @@ static void ggml_compute_forward_out_prod_f32( return; } + // dst[:,:,:,:] = 0 + // for i2,i3: + // for i1: + // for i01: + // for i0: + // dst[i0,i1,i2,i3] += src0[i0,i01,i2,i3] * src1[i1,i01,i2,i3] + + // parallelize by last three dimensions + + // total rows in dst + const int64_t nr = ne1*ne2*ne3; + + // rows per thread + const int64_t dr = (nr + nth - 1)/nth; + + // row range for this thread + const int64_t ir0 = dr*ith; + const int64_t ir1 = MIN(ir0 + dr, nr); + + // block-tiling attempt + const int64_t blck_0 = MAX(GGML_VEC_MAD_UNROLL, 32); + const int64_t blck_1 = 16; + + for (int64_t bir = ir0; bir < ir1; bir += blck_1) { + const int64_t bir1 = MIN(bir + blck_1, ir1); + for (int64_t bi01 = 0; bi01 < ne01; bi01 += blck_0) { + const int64_t bne01 = MIN(bi01 + blck_0, ne01); + for (int64_t ir = bir; ir < bir1; ++ir) { + // dst indices + const int64_t i3 = ir/(ne2*ne1); + const int64_t i2 = (ir - i3*ne2*ne1)/ne1; + const int64_t i1 = (ir - i3*ne2*ne1 - i2*ne1); + + const int64_t i02 = i2; + const int64_t i03 = i3; + + //const int64_t i10 = i1; + const int64_t i12 = i2; + const int64_t i13 = i3; + +#if GGML_VEC_MAD_UNROLL > 2 + const int64_t bne01_unroll = bne01 - (bne01 % GGML_VEC_MAD_UNROLL); + for (int64_t i01 = bi01; i01 < bne01_unroll; i01 += GGML_VEC_MAD_UNROLL) { + const int64_t i11 = i01; + + float * s0 = (float *) ((char *) src0->data + ( i01*nb01 + i02*nb02 + i03*nb03)); + float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13)); + float * d = (float *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3)); + + ggml_vec_mad_f32_unroll(ne0, nb01, nb11, d, s0, s1); + } + for (int64_t i01 = bne01_unroll; i01 < bne01; ++i01) { + const int64_t i11 = i01; + + float * s0 = (float *) ((char *) src0->data + ( i01*nb01 + i02*nb02 + i03*nb03)); + float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13)); + float * d = (float *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3)); + + ggml_vec_mad_f32(ne0, d, s0, *s1); + } +#else + for (int64_t i01 = bi01; i01 < bne01; ++i01) { + const int64_t i11 = i01; + + float * s0 = (float *) ((char *) src0->data + ( i01*nb01 + i02*nb02 + i03*nb03)); + float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13)); + float * d = (float *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3)); + + ggml_vec_mad_f32(ne0, d, s0, *s1); + } +#endif + } + } + } + + + //int64_t t1 = ggml_perf_time_us(); + //static int64_t acc = 0; + //acc += t1 - t0; + //if (t1 - t0 > 10) { + // printf("\n"); + // printf("ne00 = %5d, ne01 = %5d, ne02 = %5d, ne03 = %5d\n", ne00, ne01, ne02, ne03); + // printf("nb00 = %5d, nb01 = %5d, nb02 = %5d, nb03 = %5d\n", nb00, nb01, nb02, nb03); + // printf("ne10 = %5d, ne11 = %5d, ne12 = %5d, ne13 = %5d\n", ne10, ne11, ne12, ne13); + // printf("nb10 = %5d, nb11 = %5d, nb12 = %5d, nb13 = %5d\n", nb10, nb11, nb12, nb13); + + // printf("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX task %d/%d: %d us, acc = %d\n", ith, nth, (int) (t1 - t0), (int) acc); + //} +} + +static void ggml_compute_forward_out_prod_q_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + // int64_t t0 = ggml_perf_time_us(); + // UNUSED(t0); + + GGML_TENSOR_BINARY_OP_LOCALS; + + const int ith = params->ith; + const int nth = params->nth; + + const enum ggml_type type = src0->type; + ggml_to_float_t const dequantize_row_q = type_traits[type].to_float; + + GGML_ASSERT(ne02 == ne12); + GGML_ASSERT(ne03 == ne13); + GGML_ASSERT(ne2 == ne12); + GGML_ASSERT(ne3 == ne13); + + // we don't support permuted src0 dim0 + GGML_ASSERT(nb00 == ggml_type_size(type)); + + // dst dim0 cannot be transposed or permuted + GGML_ASSERT(nb0 == sizeof(float)); + // GGML_ASSERT(nb0 <= nb1); + // GGML_ASSERT(nb1 <= nb2); + // GGML_ASSERT(nb2 <= nb3); + + GGML_ASSERT(ne0 == ne00); + GGML_ASSERT(ne1 == ne10); + GGML_ASSERT(ne2 == ne02); + GGML_ASSERT(ne3 == ne03); + + // nb01 >= nb00 - src0 is not transposed + // compute by src0 rows + + // TODO: #if defined(GGML_USE_CUBLAS) ggml_cuda_out_prod + // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) + + if (params->type == GGML_TASK_INIT) { + ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0); + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + // parallelize by last three dimensions // total rows in dst @@ -11500,6 +11996,8 @@ static void ggml_compute_forward_out_prod_f32( // for i0: // dst[i0,i1,i2,i3] += src0[i0,i01,i2,i3] * src1[i1,i01,i2,i3] + float * wdata = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32) * ith; + for (int64_t ir = ir0; ir < ir1; ++ir) { // dst indices const int64_t i3 = ir/(ne2*ne1); @@ -11520,10 +12018,8 @@ static void ggml_compute_forward_out_prod_f32( float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13)); float * d = (float *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3)); - ggml_vec_mad_f32(ne0, d, s0, *s1); - // for (int64_t i0 = 0; i0 < ne0; ++i0) { - // d[i0] += s0[i0] * s1[i1]; - // } + dequantize_row_q(s0, wdata, ne0); + ggml_vec_mad_f32(ne0, d, wdata, *s1); } } @@ -11552,10 +12048,13 @@ static void ggml_compute_forward_out_prod( case GGML_TYPE_Q5_0: case GGML_TYPE_Q5_1: case GGML_TYPE_Q8_0: - case GGML_TYPE_Q8_1: + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + case GGML_TYPE_Q6_K: { - GGML_ASSERT(false); // todo - // ggml_compute_forward_out_prod_q_f32(params, src0, src1, dst); + ggml_compute_forward_out_prod_q_f32(params, src0, src1, dst); } break; case GGML_TYPE_F16: { @@ -11673,8 +12172,8 @@ static void ggml_compute_forward_set_f32( const int nr = ggml_nrows(src1); const int nc = src1->ne[0]; - GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne); - GGML_TENSOR_LOCALS(size_t, nb1, src1, nb); + GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) + GGML_TENSOR_LOCALS(size_t, nb1, src1, nb) // src0 and dst as viewed during set const size_t nb0 = ggml_element_size(src0); @@ -11943,14 +12442,15 @@ static void ggml_compute_forward_get_rows_back_f32_f16( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, - const struct ggml_tensor * opt0, struct ggml_tensor * dst) { GGML_ASSERT(params->ith == 0); - GGML_ASSERT(ggml_are_same_shape(opt0, dst)); - GGML_ASSERT(ggml_is_contiguous(opt0)); GGML_ASSERT(ggml_is_contiguous(dst)); - ggml_compute_forward_dup_same_cont(params, opt0, dst); + // ggml_compute_forward_dup_same_cont(params, opt0, dst); + + if (params->type == GGML_TASK_INIT) { + memset(dst->data, 0, ggml_nbytes(dst)); + } if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; @@ -11976,11 +12476,8 @@ static void ggml_compute_forward_get_rows_back_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, - const struct ggml_tensor * opt0, struct ggml_tensor * dst) { GGML_ASSERT(params->ith == 0); - GGML_ASSERT(ggml_are_same_shape(opt0, dst)); - GGML_ASSERT(ggml_is_contiguous(opt0)); GGML_ASSERT(ggml_is_contiguous(dst)); // ggml_compute_forward_dup_same_cont(params, opt0, dst); @@ -12014,16 +12511,15 @@ static void ggml_compute_forward_get_rows_back( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, - const struct ggml_tensor * opt0, struct ggml_tensor * dst) { switch (src0->type) { case GGML_TYPE_F16: { - ggml_compute_forward_get_rows_back_f32_f16(params, src0, src1, opt0, dst); + ggml_compute_forward_get_rows_back_f32_f16(params, src0, src1, dst); } break; case GGML_TYPE_F32: { - ggml_compute_forward_get_rows_back_f32(params, src0, src1, opt0, dst); + ggml_compute_forward_get_rows_back_f32(params, src0, src1, dst); } break; default: { @@ -12064,7 +12560,7 @@ static void ggml_compute_forward_diag_f32( // TODO: handle transposed/permuted matrices - GGML_TENSOR_UNARY_OP_LOCALS; + GGML_TENSOR_UNARY_OP_LOCALS GGML_ASSERT(ne00 == ne0); GGML_ASSERT(ne00 == ne1); @@ -12452,13 +12948,11 @@ static void ggml_compute_forward_alibi_f16( return; } - const int n_past = ((int32_t *) dst->op_params)[0]; + //const int n_past = ((int32_t *) dst->op_params)[0]; const int n_head = ((int32_t *) dst->op_params)[1]; float max_bias; memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float)); - assert(n_past >= 0); - const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1 const int ne1 = src0->ne[1]; // seq_len_without_past const int ne2 = src0->ne[2]; // n_head -> this is k @@ -12473,7 +12967,7 @@ static void ggml_compute_forward_alibi_f16( //const int nb3 = src0->nb[3]; GGML_ASSERT(nb0 == sizeof(ggml_fp16_t)); - GGML_ASSERT(ne1 + n_past == ne0); (void) n_past; + //GGML_ASSERT(ne1 + n_past == ne0); (void) n_past; GGML_ASSERT(n_head == ne2); // add alibi to src0 (KQ_scaled) @@ -12619,8 +13113,8 @@ static void ggml_compute_forward_clamp( static void ggml_compute_forward_rope_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, + const struct ggml_tensor * src1, struct ggml_tensor * dst) { - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } @@ -12630,9 +13124,9 @@ static void ggml_compute_forward_rope_f32( // these two only relevant for xPos RoPE: float xpos_base; - bool xpos_down; + bool xpos_down; - const int n_past = ((int32_t *) dst->op_params)[0]; + //const int n_past = ((int32_t *) dst->op_params)[0]; const int n_dims = ((int32_t *) dst->op_params)[1]; const int mode = ((int32_t *) dst->op_params)[2]; const int n_ctx = ((int32_t *) dst->op_params)[3]; @@ -12641,9 +13135,7 @@ static void ggml_compute_forward_rope_f32( memcpy(&xpos_base, (int32_t *) dst->op_params + 6, sizeof(float)); memcpy(&xpos_down, (int32_t *) dst->op_params + 7, sizeof(bool)); - assert(n_past >= 0); - - GGML_TENSOR_UNARY_OP_LOCALS; + GGML_TENSOR_UNARY_OP_LOCALS //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3); //printf("n_past = %d, ne2 = %d\n", n_past, ne2); @@ -12673,9 +13165,11 @@ static void ggml_compute_forward_rope_f32( const bool is_neox = mode & 2; const bool is_glm = mode & 4; + const int32_t * pos = (const int32_t *) src1->data; + for (int64_t i3 = 0; i3 < ne3; i3++) { - for (int64_t i2 = ((mode & 1) == 0 ? 0 : n_past); i2 < ne2; i2++) { - const int64_t p = ((mode & 1) == 0 ? n_past + i2 : i2); + for (int64_t i2 = 0; i2 < ne2; i2++) { + const int64_t p = pos[i2]; for (int64_t i1 = 0; i1 < ne1; i1++) { if (ir++ < ir0) continue; if (ir > ir1) break; @@ -12712,7 +13206,7 @@ static void ggml_compute_forward_rope_f32( const float cos_theta = cosf(theta); const float sin_theta = sinf(theta); // zeta scaling for xPos only: - float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), (n_past + i2) / xpos_base) : 1.0f; + float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f; if (xpos_down) zeta = 1.0f / zeta; theta *= theta_scale; @@ -12757,8 +13251,8 @@ static void ggml_compute_forward_rope_f32( static void ggml_compute_forward_rope_f16( const struct ggml_compute_params * params, const struct ggml_tensor * src0, + const struct ggml_tensor * src1, struct ggml_tensor * dst) { - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } @@ -12766,16 +13260,14 @@ static void ggml_compute_forward_rope_f16( float freq_base; float freq_scale; - const int n_past = ((int32_t *) dst->op_params)[0]; + //const int n_past = ((int32_t *) dst->op_params)[0]; const int n_dims = ((int32_t *) dst->op_params)[1]; const int mode = ((int32_t *) dst->op_params)[2]; const int n_ctx = ((int32_t *) dst->op_params)[3]; memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); - assert(n_past >= 0); - - GGML_TENSOR_UNARY_OP_LOCALS; + GGML_TENSOR_UNARY_OP_LOCALS //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3); //printf("n_past = %d, ne2 = %d\n", n_past, ne2); @@ -12805,9 +13297,11 @@ static void ggml_compute_forward_rope_f16( const bool is_neox = mode & 2; const bool is_glm = mode & 4; + const int32_t * pos = (const int32_t *) src1->data; + for (int64_t i3 = 0; i3 < ne3; i3++) { - for (int64_t i2 = ((mode & 1) == 0 ? 0 : n_past); i2 < ne2; i2++) { - const int64_t p = ((mode & 1) == 0 ? n_past + i2 : i2); + for (int64_t i2 = 0; i2 < ne2; i2++) { + const int64_t p = pos[i2]; for (int64_t i1 = 0; i1 < ne1; i1++) { if (ir++ < ir0) continue; if (ir > ir1) break; @@ -12886,15 +13380,16 @@ static void ggml_compute_forward_rope_f16( static void ggml_compute_forward_rope( const struct ggml_compute_params * params, const struct ggml_tensor * src0, + const struct ggml_tensor * src1, struct ggml_tensor * dst) { switch (src0->type) { case GGML_TYPE_F16: { - ggml_compute_forward_rope_f16(params, src0, dst); + ggml_compute_forward_rope_f16(params, src0, src1, dst); } break; case GGML_TYPE_F32: { - ggml_compute_forward_rope_f32(params, src0, dst); + ggml_compute_forward_rope_f32(params, src0, src1, dst); } break; default: { @@ -12908,6 +13403,7 @@ static void ggml_compute_forward_rope( static void ggml_compute_forward_rope_back_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, + const struct ggml_tensor * src1, struct ggml_tensor * dst) { if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -12925,7 +13421,7 @@ static void ggml_compute_forward_rope_back_f32( float xpos_base; bool xpos_down; - const int n_past = ((int32_t *) dst->op_params)[0]; + //const int n_past = ((int32_t *) dst->op_params)[0]; const int n_dims = ((int32_t *) dst->op_params)[1]; const int mode = ((int32_t *) dst->op_params)[2]; const int n_ctx = ((int32_t *) dst->op_params)[3]; UNUSED(n_ctx); @@ -12934,9 +13430,7 @@ static void ggml_compute_forward_rope_back_f32( memcpy(&xpos_base, (int32_t *) dst->op_params + 6, sizeof(float)); memcpy(&xpos_down, (int32_t *) dst->op_params + 7, sizeof(bool)); - assert(n_past >= 0); - - GGML_TENSOR_UNARY_OP_LOCALS; + GGML_TENSOR_UNARY_OP_LOCALS //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3); //printf("n_past = %d, ne2 = %d\n", n_past, ne2); @@ -12962,9 +13456,11 @@ static void ggml_compute_forward_rope_back_f32( const bool is_neox = mode & 2; + const int32_t * pos = (const int32_t *) src1->data; + for (int64_t i3 = 0; i3 < ne3; i3++) { - for (int64_t i2 = ((mode & 1) == 0 ? 0 : n_past); i2 < ne2; i2++) { - const int64_t p = ((mode & 1) == 0 ? n_past + i2 : i2); + for (int64_t i2 = 0; i2 < ne2; i2++) { + const int64_t p = pos[i2]; for (int64_t i1 = 0; i1 < ne1; i1++) { if (ir++ < ir0) continue; if (ir > ir1) break; @@ -12976,7 +13472,7 @@ static void ggml_compute_forward_rope_back_f32( const float cos_theta = cosf(theta); const float sin_theta = sinf(theta); // zeta scaling for xPos only: - float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), (n_past + i2) / xpos_base) : 1.0f; + float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f; if (xpos_down) zeta = 1.0f / zeta; theta *= theta_scale; @@ -13019,6 +13515,7 @@ static void ggml_compute_forward_rope_back_f32( static void ggml_compute_forward_rope_back_f16( const struct ggml_compute_params * params, const struct ggml_tensor * src0, + const struct ggml_tensor * src1, struct ggml_tensor * dst) { if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -13029,13 +13526,11 @@ static void ggml_compute_forward_rope_back_f16( // dx = rope_back(dy, src1) // src0 is dy, src1 contains options - const int n_past = ((int32_t *) dst->op_params)[0]; + //const int n_past = ((int32_t *) dst->op_params)[0]; const int n_dims = ((int32_t *) dst->op_params)[1]; const int mode = ((int32_t *) dst->op_params)[2]; - assert(n_past >= 0); - - GGML_TENSOR_UNARY_OP_LOCALS; + GGML_TENSOR_UNARY_OP_LOCALS //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3); //printf("n_past = %d, ne2 = %d\n", n_past, ne2); @@ -13061,9 +13556,11 @@ static void ggml_compute_forward_rope_back_f16( const bool is_neox = mode & 2; + const int32_t * pos = (const int32_t *) src1->data; + for (int64_t i3 = 0; i3 < ne3; i3++) { - for (int64_t i2 = ((mode & 1) == 0 ? 0 : n_past); i2 < ne2; i2++) { - const int64_t p = ((mode & 1) == 0 ? n_past + i2 : i2); + for (int64_t i2 = 0; i2 < ne2; i2++) { + const int64_t p = pos[i2]; for (int64_t i1 = 0; i1 < ne1; i1++) { if (ir++ < ir0) continue; if (ir > ir1) break; @@ -13115,15 +13612,16 @@ static void ggml_compute_forward_rope_back_f16( static void ggml_compute_forward_rope_back( const struct ggml_compute_params * params, const struct ggml_tensor * src0, + const struct ggml_tensor * src1, struct ggml_tensor * dst) { switch (src0->type) { case GGML_TYPE_F16: { - ggml_compute_forward_rope_back_f16(params, src0, dst); + ggml_compute_forward_rope_back_f16(params, src0, src1, dst); } break; case GGML_TYPE_F32: { - ggml_compute_forward_rope_back_f32(params, src0, dst); + ggml_compute_forward_rope_back_f32(params, src0, src1, dst); } break; default: { @@ -13146,7 +13644,7 @@ static void ggml_compute_forward_conv_1d_s1_ph_f16_f32( int64_t t0 = ggml_perf_time_us(); UNUSED(t0); - GGML_TENSOR_BINARY_OP_LOCALS; + GGML_TENSOR_BINARY_OP_LOCALS const int ith = params->ith; const int nth = params->nth; @@ -13237,7 +13735,7 @@ static void ggml_compute_forward_conv_1d_s1_ph_f32( int64_t t0 = ggml_perf_time_us(); UNUSED(t0); - GGML_TENSOR_BINARY_OP_LOCALS; + GGML_TENSOR_BINARY_OP_LOCALS const int ith = params->ith; const int nth = params->nth; @@ -13349,7 +13847,7 @@ static void ggml_compute_forward_conv_1d_s2_ph_f16_f32( int64_t t0 = ggml_perf_time_us(); UNUSED(t0); - GGML_TENSOR_BINARY_OP_LOCALS; + GGML_TENSOR_BINARY_OP_LOCALS const int ith = params->ith; const int nth = params->nth; @@ -13440,7 +13938,7 @@ static void ggml_compute_forward_conv_1d_s2_ph_f32( int64_t t0 = ggml_perf_time_us(); UNUSED(t0); - GGML_TENSOR_BINARY_OP_LOCALS; + GGML_TENSOR_BINARY_OP_LOCALS const int ith = params->ith; const int nth = params->nth; @@ -13558,7 +14056,7 @@ static void ggml_compute_forward_conv_1d( ggml_compute_forward_conv_1d_s2_ph(params, src0, src1, dst); } else { GGML_ASSERT(false); // only stride 1 and 2 supported - }; + } } // ggml_compute_forward_conv_2d @@ -13575,7 +14073,7 @@ static void ggml_compute_forward_conv_2d_f16_f32( int64_t t0 = ggml_perf_time_us(); UNUSED(t0); - GGML_TENSOR_BINARY_OP_LOCALS; + GGML_TENSOR_BINARY_OP_LOCALS const int ith = params->ith; const int nth = params->nth; @@ -13695,7 +14193,7 @@ static void ggml_compute_forward_conv_transpose_2d( int64_t t0 = ggml_perf_time_us(); UNUSED(t0); - GGML_TENSOR_BINARY_OP_LOCALS; + GGML_TENSOR_BINARY_OP_LOCALS const int ith = params->ith; const int nth = params->nth; @@ -13954,7 +14452,7 @@ static void ggml_compute_forward_upscale_f32( const int ith = params->ith; - GGML_TENSOR_UNARY_OP_LOCALS; + GGML_TENSOR_UNARY_OP_LOCALS const int scale_factor = dst->op_params[0]; @@ -14006,14 +14504,14 @@ static void ggml_compute_forward_flash_attn_f32( int64_t t0 = ggml_perf_time_us(); UNUSED(t0); - GGML_TENSOR_LOCALS(int64_t, neq, q, ne); - GGML_TENSOR_LOCALS(size_t, nbq, q, nb); - GGML_TENSOR_LOCALS(int64_t, nek, k, ne); - GGML_TENSOR_LOCALS(size_t, nbk, k, nb); - GGML_TENSOR_LOCALS(int64_t, nev, v, ne); - GGML_TENSOR_LOCALS(size_t, nbv, v, nb); - GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); - GGML_TENSOR_LOCALS(size_t, nb, dst, nb); + GGML_TENSOR_LOCALS(int64_t, neq, q, ne) + GGML_TENSOR_LOCALS(size_t, nbq, q, nb) + GGML_TENSOR_LOCALS(int64_t, nek, k, ne) + GGML_TENSOR_LOCALS(size_t, nbk, k, nb) + GGML_TENSOR_LOCALS(int64_t, nev, v, ne) + GGML_TENSOR_LOCALS(size_t, nbv, v, nb) + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) + GGML_TENSOR_LOCALS(size_t, nb, dst, nb) const int ith = params->ith; const int nth = params->nth; @@ -14083,10 +14581,11 @@ static void ggml_compute_forward_flash_attn_f32( S[i] = -INFINITY; } - for (int64_t ic = 0; ic < nek1; ++ic) { + const int64_t masked_begin = masked ? (P + iq1 + 1) : M; + for (int64_t ic = 0; ic < masked_begin; ++ic) { // k indices const int ik3 = iq3; - const int ik2 = iq2; + const int ik2 = iq2 % nek2; const int ik1 = ic; // S indices @@ -14099,20 +14598,18 @@ static void ggml_compute_forward_flash_attn_f32( } // scale - ggml_vec_scale_f32(nek1, S, scale); + ggml_vec_scale_f32(masked_begin, S, scale); - if (masked) { - for (int64_t i = P; i < M; i++) { - if (i > P + iq1) { - S[i] = -INFINITY; - } - } + for (int64_t i = masked_begin; i < M; i++) { + S[i] = -INFINITY; } // softmax + // exclude known -INF S[..] values from max and loop + // dont forget to set their SW values to zero { float max = -INFINITY; - ggml_vec_max_f32(M, &max, S); + ggml_vec_max_f32(masked_begin, &max, S); ggml_float sum = 0.0; { @@ -14126,10 +14623,15 @@ static void ggml_compute_forward_flash_attn_f32( ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 }; for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) { + if (i >= masked_begin) { + break; + } float * SS = S + i; for (int j = 0; j < GGML_SOFT_MAX_UNROLL; ++j) { - if (SS[j] == -INFINITY) { + if (i + j >= masked_begin) { + break; + } else if (SS[j] == -INFINITY) { SS[j] = 0.0f; } else { #ifndef GGML_FLASH_ATTN_EXP_FP16 @@ -14154,10 +14656,10 @@ static void ggml_compute_forward_flash_attn_f32( assert(sum > 0.0); sum = 1.0/sum; - ggml_vec_scale_f32(M, S, sum); + ggml_vec_scale_f32(masked_begin, S, sum); #ifndef NDEBUG - for (int i = 0; i < M; ++i) { + for (int i = 0; i < masked_begin; ++i) { assert(!isnan(S[i])); assert(!isinf(S[i])); } @@ -14170,9 +14672,13 @@ static void ggml_compute_forward_flash_attn_f32( const int i2 = iq2; const int i3 = iq3; - ggml_vec_dot_f32(nek1, - (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), - (float *) ((char *) v->data + ( ic*nbv1 + i2*nbv2 + i3*nbv3)), + // v indices + const int iv2 = iq2 % nev2; + const int iv3 = iq3; + + ggml_vec_dot_f32(masked_begin, + (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), + (float *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)), S); } } @@ -14188,14 +14694,14 @@ static void ggml_compute_forward_flash_attn_f16( int64_t t0 = ggml_perf_time_us(); UNUSED(t0); - GGML_TENSOR_LOCALS(int64_t, neq, q, ne); - GGML_TENSOR_LOCALS(size_t, nbq, q, nb); - GGML_TENSOR_LOCALS(int64_t, nek, k, ne); - GGML_TENSOR_LOCALS(size_t, nbk, k, nb); - GGML_TENSOR_LOCALS(int64_t, nev, v, ne); - GGML_TENSOR_LOCALS(size_t, nbv, v, nb); - GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); - GGML_TENSOR_LOCALS(size_t, nb, dst, nb); + GGML_TENSOR_LOCALS(int64_t, neq, q, ne) + GGML_TENSOR_LOCALS(size_t, nbq, q, nb) + GGML_TENSOR_LOCALS(int64_t, nek, k, ne) + GGML_TENSOR_LOCALS(size_t, nbk, k, nb) + GGML_TENSOR_LOCALS(int64_t, nev, v, ne) + GGML_TENSOR_LOCALS(size_t, nbv, v, nb) + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) + GGML_TENSOR_LOCALS(size_t, nb, dst, nb) const int ith = params->ith; const int nth = params->nth; @@ -14269,7 +14775,7 @@ static void ggml_compute_forward_flash_attn_f16( for (int64_t ic = 0; ic < nek1; ++ic) { // k indices const int ik3 = iq3; - const int ik2 = iq2; + const int ik2 = iq2 % nek2; const int ik1 = ic; // S indices @@ -14284,7 +14790,7 @@ static void ggml_compute_forward_flash_attn_f16( for (int64_t ic = 0; ic < nek1; ic += GGML_VEC_DOT_UNROLL) { // k indices const int ik3 = iq3; - const int ik2 = iq2; + const int ik2 = iq2 % nek2; const int ik1 = ic; // S indices @@ -14309,6 +14815,8 @@ static void ggml_compute_forward_flash_attn_f16( } // softmax + // todo: exclude known -INF S[..] values from max and loop, assuming their results to be zero. + // dont forget to set their S values to zero { float max = -INFINITY; ggml_vec_max_f32(M, &max, S); @@ -14365,6 +14873,7 @@ static void ggml_compute_forward_flash_attn_f16( S16[i] = GGML_FP32_TO_FP16(S[i]); } + // todo: exclude known zero S[..] values from dot (reducing nev0 and increasing begin of v and S16). if (GGML_VEC_DOT_UNROLL == 1 || (nev1 % GGML_VEC_DOT_UNROLL != 0)) { for (int64_t ic = 0; ic < nev1; ++ic) { // dst indices @@ -14372,9 +14881,13 @@ static void ggml_compute_forward_flash_attn_f16( const int i2 = iq2; const int i3 = iq3; - ggml_vec_dot_f16(nek1, - (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), - (ggml_fp16_t *) ((char *) v->data + ( ic*nbv1 + i2*nbv2 + i3*nbv3)), + // v indices + const int iv2 = iq2 % nev2; + const int iv3 = iq3; + + ggml_vec_dot_f16(nev0, + (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), + (ggml_fp16_t *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)), S16); } } else { @@ -14384,9 +14897,13 @@ static void ggml_compute_forward_flash_attn_f16( const int i2 = iq2; const int i3 = iq3; - ggml_vec_dot_f16_unroll(nek1, nbv1, - (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), - ((char *) v->data + ( ic*nbv1 + i2*nbv2 + i3*nbv3)), + // v indices + const int iv2 = iq2 % nev2; + const int iv3 = iq3; + + ggml_vec_dot_f16_unroll(nev0, nbv1, + (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), + ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)), S16); } } @@ -14429,18 +14946,18 @@ static void ggml_compute_forward_flash_ff_f16( int64_t t0 = ggml_perf_time_us(); UNUSED(t0); - GGML_TENSOR_LOCALS(int64_t, nea, a, ne); - GGML_TENSOR_LOCALS(size_t, nba, a, nb); - GGML_TENSOR_LOCALS(int64_t, neb0, b0, ne); - GGML_TENSOR_LOCALS(size_t, nbb0, b0, nb); - GGML_TENSOR_LOCALS(int64_t, neb1, b1, ne); - GGML_TENSOR_LOCALS(size_t, nbb1, b1, nb); - GGML_TENSOR_LOCALS(int64_t, nec0, c0, ne); - GGML_TENSOR_LOCALS(size_t, nbc0, c0, nb); - GGML_TENSOR_LOCALS(int64_t, nec1, c1, ne); - GGML_TENSOR_LOCALS(size_t, nbc1, c1, nb); - GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); - GGML_TENSOR_LOCALS(size_t, nb, dst, nb); + GGML_TENSOR_LOCALS(int64_t, nea, a, ne) + GGML_TENSOR_LOCALS(size_t, nba, a, nb) + GGML_TENSOR_LOCALS(int64_t, neb0, b0, ne) + GGML_TENSOR_LOCALS(size_t, nbb0, b0, nb) + GGML_TENSOR_LOCALS(int64_t, neb1, b1, ne) + GGML_TENSOR_LOCALS(size_t, nbb1, b1, nb) + GGML_TENSOR_LOCALS(int64_t, nec0, c0, ne) + GGML_TENSOR_LOCALS(size_t, nbc0, c0, nb) + GGML_TENSOR_LOCALS(int64_t, nec1, c1, ne) + GGML_TENSOR_LOCALS(size_t, nbc1, c1, nb) + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) + GGML_TENSOR_LOCALS(size_t, nb, dst, nb) const int ith = params->ith; const int nth = params->nth; @@ -14588,16 +15105,16 @@ static void ggml_compute_forward_flash_attn_back_f32( int64_t t0 = ggml_perf_time_us(); UNUSED(t0); - GGML_TENSOR_LOCALS(int64_t, neq, q, ne); - GGML_TENSOR_LOCALS(size_t, nbq, q, nb); - GGML_TENSOR_LOCALS(int64_t, nek, k, ne); - GGML_TENSOR_LOCALS(size_t, nbk, k, nb); - GGML_TENSOR_LOCALS(int64_t, nev, v, ne); - GGML_TENSOR_LOCALS(size_t, nbv, v, nb); - GGML_TENSOR_LOCALS(int64_t, ned, d, ne); - GGML_TENSOR_LOCALS(size_t, nbd, d, nb); - GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); - GGML_TENSOR_LOCALS(size_t, nb, dst, nb); + GGML_TENSOR_LOCALS(int64_t, neq, q, ne) + GGML_TENSOR_LOCALS(size_t, nbq, q, nb) + GGML_TENSOR_LOCALS(int64_t, nek, k, ne) + GGML_TENSOR_LOCALS(size_t, nbk, k, nb) + GGML_TENSOR_LOCALS(int64_t, nev, v, ne) + GGML_TENSOR_LOCALS(size_t, nbv, v, nb) + GGML_TENSOR_LOCALS(int64_t, ned, d, ne) + GGML_TENSOR_LOCALS(size_t, nbd, d, nb) + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) + GGML_TENSOR_LOCALS(size_t, nb, dst, nb) const int ith = params->ith; const int nth = params->nth; @@ -14645,10 +15162,37 @@ static void ggml_compute_forward_flash_attn_back_f32( return; } - // parallelize by q rows using ggml_vec_dot_f32 + const int64_t elem_q = ggml_nelements(q); + const int64_t elem_k = ggml_nelements(k); - // total rows in q - const int nr = neq2*neq3; + enum ggml_type result_type = dst->type; + GGML_ASSERT(ggml_blck_size(result_type) == 1); + const size_t tsize = ggml_type_size(result_type); + + const size_t offs_q = 0; + const size_t offs_k = offs_q + GGML_PAD(elem_q * tsize, GGML_MEM_ALIGN); + const size_t offs_v = offs_k + GGML_PAD(elem_k * tsize, GGML_MEM_ALIGN); + + void * grad_q = (char *) dst->data; + void * grad_k = (char *) dst->data + offs_k; + void * grad_v = (char *) dst->data + offs_v; + + const size_t nbgq1 = nb0*neq0; + const size_t nbgq2 = nb0*neq0*neq1; + const size_t nbgq3 = nb0*neq0*neq1*neq2; + + const size_t nbgk1 = nb0*nek0; + const size_t nbgk2 = nb0*nek0*nek1; + const size_t nbgk3 = nb0*nek0*nek1*neq2; + + const size_t nbgv1 = nb0*nev0; + const size_t nbgv2 = nb0*nev0*nev1; + const size_t nbgv3 = nb0*nev0*nev1*neq2; + + // parallelize by k rows using ggml_vec_dot_f32 + + // total rows in k + const int nr = nek2*nek3; // rows per thread const int dr = (nr + nth - 1)/nth; @@ -14661,268 +15205,243 @@ static void ggml_compute_forward_flash_attn_back_f32( //printf("P=%d N=%d D=%d ir0=%d ir1=%d scale = %f\n", P, N, D, ir0, ir1, scale); + // how often k2 (and v2) is repeated in q2 + int nrep = neq2/nek2; + for (int ir = ir0; ir < ir1; ++ir) { // q indices - const int iq3 = ir/(neq2); - const int iq2 = ir - iq3*neq2; - for ( int iq1 = 0; iq1 < neq1; ++iq1) { + const int ik3 = ir/(nek2); + const int ik2 = ir - ik3*nek2; + const int iq3 = ik3; + const int id3 = ik3; + const int iv3 = ik3; + const int iv2 = ik2; - // not sure about CACHE_LINE_SIZE_F32.. - // - maybe it must not be multiplied by 2 and excluded from .. in SM 1*(..) offset? - float * S = (float *) params->wdata + ith*2*(mxDM + CACHE_LINE_SIZE_F32) + 0*(mxDM+CACHE_LINE_SIZE_F32); - float * SM = (float *) params->wdata + ith*2*(mxDM + CACHE_LINE_SIZE_F32) + 1*(mxDM+CACHE_LINE_SIZE_F32); + for (int irep = 0; irep < nrep; ++irep) { + const int iq2 = ik2 + irep*nek2; + const int id2 = iq2; - for (int i = M; i < Mup; ++i) { - S[i] = -INFINITY; - } + // (ik2 + irep*nek2) % nek2 == ik2 + for (int iq1 = 0; iq1 < neq1; ++iq1) { + const int id1 = iq1; - for (int64_t ic = 0; ic < nek1; ++ic) { - // k indices - const int ik3 = iq3; - const int ik2 = iq2; - const int ik1 = ic; + // not sure about CACHE_LINE_SIZE_F32.. + // - maybe it must not be multiplied by 2 and excluded from .. in SM 1*(..) offset? + float * S = (float *) params->wdata + ith*2*(mxDM + CACHE_LINE_SIZE_F32) + 0*(mxDM+CACHE_LINE_SIZE_F32); + float * SM = (float *) params->wdata + ith*2*(mxDM + CACHE_LINE_SIZE_F32) + 1*(mxDM+CACHE_LINE_SIZE_F32); - // S indices - const int i1 = ik1; + for (int i = M; i < Mup; ++i) { + S[i] = -INFINITY; + } - ggml_vec_dot_f32(neq0, - S + i1, - (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), - (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3))); - } + const int64_t masked_begin = masked ? (P + iq1 + 1) : M; + for (int64_t ic = 0; ic < masked_begin; ++ic) { + // k indices + const int ik1 = ic; - // scale - ggml_vec_scale_f32(nek1, S, scale); + // S indices + const int i1 = ik1; - if (masked) { - for (int64_t i = P; i < M; i++) { - if (i > P + iq1) { - S[i] = -INFINITY; - } + ggml_vec_dot_f32(neq0, + S + i1, + (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), + (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3))); } - } - // softmax - { - float max = -INFINITY; - ggml_vec_max_f32(M, &max, S); + // scale + ggml_vec_scale_f32(masked_begin, S, scale); - ggml_float sum = 0.0; + for (int64_t i = masked_begin; i < M; i++) { + S[i] = -INFINITY; + } + + // softmax + // exclude known -INF S[..] values from max and loop + // dont forget to set their SM values to zero { + float max = -INFINITY; + ggml_vec_max_f32(masked_begin, &max, S); + + ggml_float sum = 0.0; + { #ifdef GGML_SOFT_MAX_ACCELERATE - max = -max; - vDSP_vsadd(SM, 1, &max, SM, 1, Mup); - vvexpf(SM, SM, &Mup); - ggml_vec_sum_f32(Mup, &sum, SM); + max = -max; + vDSP_vsadd(SM, 1, &max, SM, 1, Mup); + vvexpf(SM, SM, &Mup); + ggml_vec_sum_f32(Mup, &sum, SM); #else - uint16_t scvt[GGML_SOFT_MAX_UNROLL]; UNUSED(scvt); - ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 }; - - for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) { - float * SR = S + i; - float * SW = SM + i; + uint16_t scvt[GGML_SOFT_MAX_UNROLL]; UNUSED(scvt); + ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 }; - for (int j = 0; j < GGML_SOFT_MAX_UNROLL; ++j) { - if (SR[j] == -INFINITY) { - SW[j] = 0.0f; - } else { + for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) { + if (i >= masked_begin) { + break; + } + float * SR = S + i; + float * SW = SM + i; + + for (int j = 0; j < GGML_SOFT_MAX_UNROLL; ++j) { + if (i + j >= masked_begin) { + break; + } else if (SR[j] == -INFINITY) { + SW[j] = 0.0f; + } else { #ifndef GGML_FLASH_ATTN_EXP_FP16 - const float val = expf(SR[j] - max); + const float val = expf(SR[j] - max); #else - ggml_fp16_t s = GGML_FP32_TO_FP16(SR[j] - max); - memcpy(&scvt[j], &s, sizeof(uint16_t)); - const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]); + ggml_fp16_t s = GGML_FP32_TO_FP16(SR[j] - max); + memcpy(&scvt[j], &s, sizeof(uint16_t)); + const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]); #endif - sump[j] += (ggml_float)val; - SW[j] = val; + sump[j] += (ggml_float)val; + SW[j] = val; + } } } - } - for (int i = 0; i < GGML_SOFT_MAX_UNROLL; i++) { - sum += sump[i]; - } + for (int i = 0; i < GGML_SOFT_MAX_UNROLL; i++) { + sum += sump[i]; + } #endif - } - - assert(sum > 0.0); - - sum = 1.0/sum; - ggml_vec_scale_f32(M, SM, sum); - - } - - // step-by-step explanation - { - // forward-process shape grads from backward process - // parallel_for iq2,iq3: - // k[:D,:M,:,:] [D,M,:,:] grad[k][:D,:M,iq2,iq3] += grad[kcur] - // q[:D,:N,:,:] [D,N,:,:] grad[q][:D,iq1,iq2,iq3] += grad[qcur] - // v[:M,:D,:,:] [M,D,:,:] grad[v][:M,:D,iq2,iq3] += grad[vcur] - // for iq1: - // kcur = k[:D,:M,iq2,iq3] [D,M,1,1] grad[kcur] = grad[S1].T @ qcur - // qcur = q[:D,iq1,iq2,iq3] [D,1,1,1] grad[qcur] = grad[S1] @ kcur - // vcur = v[:M,:D,iq2,iq3] [M,D,1,1] grad[vcur] = grad[S5].T @ S4 - // S0 = -Inf [D,1,1,1] - // ~S1[i] = dot(kcur[:D,i], qcur) - // S1 = qcur @ kcur.T [M,1,1,1] grad[S1] = grad[S2] * scale - // S2 = S1 * scale [M,1,1,1] grad[S2] = diag_mask_zero(grad[S3], P) - // S3 = diag_mask_inf(S2, P) [M,1,1,1] grad[S3] = S4 * (grad[S4] - dot(S4, grad[S4])) - // S4 = softmax(S3) [M,1,1,1] grad[S4] = grad[S5] @ vcur - // ~S5[i] = dot(vcur[:,i], S4) - // S5 = S4 @ vcur.T [D,1,1,1] grad[S5] = d[:D,iq1,iq2,iq3] - // ~dst[i,iq1,iq2,iq3] = S5[i] ^ - // dst[:D,iq1,iq2,iq3] = S5 | grad[dst[:D,iq1,iq2,iq3]] = d[:D,iq1,iq2,iq3] - // dst backward-/ grad[dst] = d - // - // output gradients with their dependencies: - // - // grad[kcur] = grad[S1].T @ qcur - // grad[S1] = diag_mask_zero(grad[S3], P) * scale - // grad[S3] = S4 * (grad[S4] - dot(S4, grad[S4])) - // grad[S4] = grad[S5] @ vcur - // grad[S4] = d[:D,iq1,iq2,iq3] @ vcur - // grad[qcur] = grad[S1] @ kcur - // grad[vcur] = grad[S5].T @ S4 - // grad[vcur] = d[:D,iq1,iq2,iq3].T @ S4 - // - // in post-order: - // - // S1 = qcur @ kcur.T - // S2 = S1 * scale - // S3 = diag_mask_inf(S2, P) - // S4 = softmax(S3) - // grad[S4] = d[:D,iq1,iq2,iq3] @ vcur - // grad[S3] = S4 * (grad[S4] - dot(S4, grad[S4])) - // grad[S1] = diag_mask_zero(grad[S3], P) * scale - // grad[qcur] = grad[S1] @ kcur - // grad[kcur] = grad[S1].T @ qcur - // grad[vcur] = d[:D,iq1,iq2,iq3].T @ S4 - // - // using less variables (SM=S4): - // - // S = diag_mask_inf(qcur @ kcur.T * scale, P) - // SM = softmax(S) - // S = d[:D,iq1,iq2,iq3] @ vcur - // dot_SM_gradSM = dot(SM, S) - // S = SM * (S - dot(SM, S)) - // S = diag_mask_zero(S, P) * scale - // - // grad[q][:D,iq1,iq2,iq3] += S @ kcur - // grad[k][:D,:M,iq2,iq3] += S.T @ qcur - // grad[v][:M,:D,iq2,iq3] += d[:D,iq1,iq2,iq3].T @ SM - } - - // S = gradSM = d[:D,iq1,iq2,iq3] @ vcur - // S = d[:D,iq1,iq2,iq3] @ vcur - // S[:M] += vcur[:M,ic] * d[ic,iq1,iq2,iq3] - ggml_vec_set_f32(M, S, 0); - for (int64_t ic = 0; ic < D; ++ic) { - // dst indices - const int i1 = iq1; - const int i2 = iq2; - const int i3 = iq3; + } - ggml_vec_mad_f32(M, - S, - (float *) ((char *) v->data + ( ic*nbv1 + i2*nbv2 + i3*nbv3)), - *(float *) ((char *) d->data + (ic*nbd0 + i1*nbd1 + i2*nbd2 + i3*nbd3))); - } + assert(sum > 0.0); - // S = SM * (S - dot(SM, S)) - float dot_SM_gradSM = 0; - ggml_vec_dot_f32 (M, &dot_SM_gradSM, SM, S); - ggml_vec_acc1_f32(M, S, -dot_SM_gradSM); - ggml_vec_mul_f32 (M, S, S, SM); + sum = 1.0/sum; + ggml_vec_scale_f32(masked_begin, SM, sum); - // S = diag_mask_zero(S, P) * scale - if (masked) { - // for (int64_t i = P + iq1 + 1; i < M; i++) { - // S[i] = 0; - // } - for (int64_t i = P; i < M; i++) { - if (i > P + iq1) { - S[i] = 0; - } } - } - ggml_vec_scale_f32(M, S, scale); - - void * grad_q = (char *) dst->data; - void * grad_k = (char *) dst->data + nb0*D*N*neq2*neq3; - void * grad_v = (char *) dst->data + nb0*D*N*neq2*neq3 + nb0*D*M*neq2*neq3; - - const size_t nbgq1 = nb0*neq0; - const size_t nbgq2 = nb0*neq0*neq1; - const size_t nbgq3 = nb0*neq0*neq1*neq2; - - const size_t nbgk1 = nb0*nek0; - const size_t nbgk2 = nb0*nek0*nek1; - const size_t nbgk3 = nb0*nek0*nek1*neq2; - - const size_t nbgv1 = nb0*nev0; - const size_t nbgv2 = nb0*nev0*nev1; - const size_t nbgv3 = nb0*nev0*nev1*neq2; - - // S shape [M,1] - // SM shape [M,1] - // kcur shape [D,M] - // qcur shape [D,1] - // vcur shape [M,D] - // - // grad[q][:D,iq1,iq2,iq3] += S @ kcur - // grad[q][:D,iq1,iq2,iq3] += shape[M,1] @ shape[D,M] - // grad[q][:D,iq1,iq2,iq3] += S[ic] * kcur[:D,ic] - // - //// grad[q][ic,iq1,iq2,iq3] += dot(kcur[:,ic],S.T) - //// grad[q][ic,iq1,iq2,iq3] += dot(k[:D,ic,iq2,iq3],S.T) - for (int64_t ic = 0; ic < M; ++ic) { - // dst indices - const int i1 = iq1; - const int i2 = iq2; - const int i3 = iq3; - ggml_vec_mad_f32(D, - (float *) ((char *) grad_q + (i1*nbgq1 + i2*nbgq2 + i3*nbgq3)), - (float *) ((char *) k->data + (ic*nbk1 + i2*nbk2 + i3*nbk3)), - S[ic]); - } + // step-by-step explanation + { + // forward-process shape grads from backward process + // parallel_for ik2,ik3: + // for irep: + // iq2 = ik2 + irep*nek2 + // k[:D,:M,:,:] [D,M,:,:] grad[k][:D,:M,ik2,ik3] += grad[kcur] + // q[:D,:N,:,:] [D,N,:,:] grad[q][:D,iq1,iq2,iq3] += grad[qcur] + // v[:M,:D,:,:] [M,D,:,:] grad[v][:M,:D,iv2,iv3] += grad[vcur] + // for iq1: + // kcur = k[:D,:M,ik2,ik3] [D,M,1,1] grad[kcur] = grad[S1].T @ qcur + // qcur = q[:D,iq1,iq2,iq3] [D,1,1,1] grad[qcur] = grad[S1] @ kcur + // vcur = v[:M,:D,iv2,iv3] [M,D,1,1] grad[vcur] = grad[S5].T @ S4 + // S0 = -Inf [D,1,1,1] + // ~S1[i] = dot(kcur[:D,i], qcur) + // S1 = qcur @ kcur.T [M,1,1,1] grad[S1] = grad[S2] * scale + // S2 = S1 * scale [M,1,1,1] grad[S2] = diag_mask_zero(grad[S3], P) + // S3 = diag_mask_inf(S2, P) [M,1,1,1] grad[S3] = S4 * (grad[S4] - dot(S4, grad[S4])) + // S4 = softmax(S3) [M,1,1,1] grad[S4] = grad[S5] @ vcur + // ~S5[i] = dot(vcur[:,i], S4) + // S5 = S4 @ vcur.T [D,1,1,1] grad[S5] = d[:D,id1,id2,id3] + // ~dst[i,iq1,iq2,iq3] = S5[i] ^ + // dst[:D,iq1,iq2,iq3] = S5 | grad[dst[:D,iq1,iq2,iq3]] = d[:D,id1,id2,id3] + // dst backward-/ grad[dst] = d + // + // output gradients with their dependencies: + // + // grad[kcur] = grad[S1].T @ qcur + // grad[S1] = diag_mask_zero(grad[S3], P) * scale + // grad[S3] = S4 * (grad[S4] - dot(S4, grad[S4])) + // grad[S4] = grad[S5] @ vcur + // grad[S4] = d[:D,id1,id2,id3] @ vcur + // grad[qcur] = grad[S1] @ kcur + // grad[vcur] = grad[S5].T @ S4 + // grad[vcur] = d[:D,id1,id2,id3].T @ S4 + // + // in post-order: + // + // S1 = qcur @ kcur.T + // S2 = S1 * scale + // S3 = diag_mask_inf(S2, P) + // S4 = softmax(S3) + // grad[S4] = d[:D,id1,id2,id3] @ vcur + // grad[S3] = S4 * (grad[S4] - dot(S4, grad[S4])) + // grad[S1] = diag_mask_zero(grad[S3], P) * scale + // grad[qcur] = grad[S1] @ kcur + // grad[kcur] = grad[S1].T @ qcur + // grad[vcur] = d[:D,id1,id2,id3].T @ S4 + // + // using less variables (SM=S4): + // + // S = diag_mask_inf(qcur @ kcur.T * scale, P) + // SM = softmax(S) + // S = d[:D,iq1,iq2,iq3] @ vcur + // dot_SM_gradSM = dot(SM, S) + // S = SM * (S - dot(SM, S)) + // S = diag_mask_zero(S, P) * scale + // + // grad[q][:D,iq1,iq2,iq3] += S @ kcur + // grad[k][:D,:M,ik2,ik3] += S.T @ qcur + // grad[v][:M,:D,iv2,iv3] += d[:D,id1,id2,id3].T @ SM + } - // grad[k][:D,:M,iq2,iq3] += S.T @ qcur - // grad[k][:D,ic,iq2,iq3] += S.T[0,ic] * qcur[:D,0] - // grad[k][:D,ic,iq2,iq3] += S[ic] * qcur[:D,0] - for (int64_t ic = 0; ic < M; ++ic) { - // dst indices - const int i1 = iq1; - const int i2 = iq2; - const int i3 = iq3; + // S = gradSM = d[:D,id1,id2,id3] @ vcur[:,:,iv2,iv3] + // S = d[:D,id1,id2,id3] @ vcur[:,:,iv2,iv3] + // for ic: + // S[:M] += vcur[:M,ic,iv2,iv3] * d[ic,id1,id2,id3] + // exclude known future zero S[..] values from operation + ggml_vec_set_f32(masked_begin, S, 0); + for (int64_t ic = 0; ic < D; ++ic) { + ggml_vec_mad_f32(masked_begin, + S, + (float *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)), + *(float *) ((char *) d->data + (ic*nbd0 + id1*nbd1 + id2*nbd2 + id3*nbd3))); + } - // ggml_vec_set_f32(D, - // (float *) ((char *) grad_k + (ic*nbgk1 + i2*nbgk2 + i3*nbgk3)), - // 0); - ggml_vec_mad_f32(D, - (float *) ((char *) grad_k + (ic*nbgk1 + i2*nbgk2 + i3*nbgk3)), - (float *) ((char *) q->data + (i1*nbq1 + i2*nbq2 + i3*nbq3)), - S[ic]); - } + // S = SM * (S - dot(SM, S)) + float dot_SM_gradSM = 0; + ggml_vec_dot_f32 (masked_begin, &dot_SM_gradSM, SM, S); + ggml_vec_acc1_f32(M, S, -dot_SM_gradSM); + ggml_vec_mul_f32 (masked_begin, S, S, SM); + + // S = diag_mask_zero(S, P) * scale + // already done by above ggml_vec_set_f32 + + // exclude known zero S[..] values from operation + ggml_vec_scale_f32(masked_begin, S, scale); + + // S shape [M,1] + // SM shape [M,1] + // kcur shape [D,M] + // qcur shape [D,1] + // vcur shape [M,D] + + // grad[q][:D,iq1,iq2,iq3] += S @ kcur + // grad[q][:D,iq1,iq2,iq3] += shape[M,1] @ shape[D,M] + // for ic: + // grad[q][:D,iq1,iq2,iq3] += S[ic] * kcur[:D,ic,ik2,ik3] + // exclude known zero S[..] values from loop + for (int64_t ic = 0; ic < masked_begin; ++ic) { + ggml_vec_mad_f32(D, + (float *) ((char *) grad_q + (iq1*nbgq1 + iq2*nbgq2 + iq3*nbgq3)), + (float *) ((char *) k->data + (ic*nbk1 + ik2*nbk2 + ik3*nbk3)), + S[ic]); + } - // grad[v][:M,:D,iq2,iq3] += d[:D,iq1,iq2,iq3].T @ SM - // grad[v][:M,ic,iq2,iq3] += d[:D,iq1,iq2,iq3].T[0,ic] * SM[:M] - // grad[v][:M,ic,iq2,iq3] += d[ic,iq1,iq2,iq3] * SM[:M] - for (int64_t ic = 0; ic < D; ++ic) { - // dst indices - const int i1 = iq1; - const int i2 = iq2; - const int i3 = iq3; + // grad[k][:D,:M,iq2,iq3] += S.T @ qcur + // for ic: + // grad[k][:D,ic,iq2,iq3] += S.T[0,ic] * qcur[:D,0] + // grad[k][:D,ic,iq2,iq3] += S[ic] * qcur[:D,0] + // exclude known zero S[..] values from loop + for (int64_t ic = 0; ic < masked_begin; ++ic) { + ggml_vec_mad_f32(D, + (float *) ((char *) grad_k + (ic*nbgk1 + ik2*nbgk2 + ik3*nbgk3)), + (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), + S[ic]); + } - // ggml_vec_set_f32(M, - // (float *) ((char *) grad_v + ( ic*nbgv1 + i2*nbgv2 + i3*nbgv3)), - // 0); - ggml_vec_mad_f32(M, - (float *) ((char *) grad_v + ( ic*nbgv1 + i2*nbgv2 + i3*nbgv3)), - SM, - *(float *) ((char *) d->data + (ic*nbd0 + i1*nbd1 + i2*nbd2 + i3*nbd3))); + // grad[v][:M,:D,iv2,iv3] += d[:D,id1,id2,id3].T @ SM + // for ic: + // grad[v][:M,ic,iv2,iv3] += d[:D,id1,id2,id3].T[0,ic] * SM[:M] + // grad[v][:M,ic,iv2,iv3] += d[ic,id1,id2,id3] * SM[:M] + // exclude known zero SM[..] values from mad + for (int64_t ic = 0; ic < D; ++ic) { + ggml_vec_mad_f32(masked_begin, + (float *) ((char *) grad_v + ( ic*nbgv1 + iv2*nbgv2 + iv3*nbgv3)), + SM, + *(float *) ((char *) d->data + (ic*nbd0 + id1*nbd1 + id2*nbd2 + id3*nbd3))); + } } } } @@ -14958,8 +15477,8 @@ static void ggml_compute_forward_win_part_f32( return; } - GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne); - GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); + GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) const int32_t nep0 = ((const int32_t *)(dst->op_params))[0]; const int32_t nep1 = ((const int32_t *)(dst->op_params))[1]; @@ -15020,8 +15539,8 @@ static void ggml_compute_forward_win_unpart_f32( return; } - GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne); - GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); + GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) const int32_t w = ((const int32_t *)(dst->op_params))[0]; @@ -15138,7 +15657,7 @@ static void ggml_compute_forward_get_rel_pos_f16( // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L292-L322 - GGML_TENSOR_UNARY_OP_LOCALS; + GGML_TENSOR_UNARY_OP_LOCALS const int64_t w = ne1; @@ -15836,7 +16355,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm } break; case GGML_OP_GET_ROWS_BACK: { - ggml_compute_forward_get_rows_back(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor); + ggml_compute_forward_get_rows_back(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_DIAG: { @@ -15860,11 +16379,11 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm } break; case GGML_OP_ROPE: { - ggml_compute_forward_rope(params, tensor->src[0], tensor); + ggml_compute_forward_rope(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_ROPE_BACK: { - ggml_compute_forward_rope_back(params, tensor->src[0], tensor); + ggml_compute_forward_rope_back(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_ALIBI: { @@ -16009,7 +16528,218 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm //////////////////////////////////////////////////////////////////////////////// -static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, bool inplace) { +static_assert(GGML_GRAPH_HASHTABLE_SIZE > GGML_MAX_NODES * 2, "GGML_GRAPH_HT_SIZE is too small"); + +static size_t hash(void * p) { + return (size_t)p % GGML_GRAPH_HASHTABLE_SIZE; +} + +static size_t hash_find(void * hash_table[], void * p) { + size_t h = hash(p); + + // linear probing + size_t i = h; + while (hash_table[i] != NULL && hash_table[i] != p) { + i = (i + 1) % GGML_GRAPH_HASHTABLE_SIZE; + if (i == h) { + // visited all hash table entries -> not found + return GGML_GRAPH_HASHTABLE_SIZE; + } + } + return i; +} + +static bool hash_insert(void * hash_table[], void * p) { + size_t i = hash_find(hash_table, p); + + GGML_ASSERT(i < GGML_GRAPH_HASHTABLE_SIZE); // assert that not full + + if (hash_table[i] == p) { + return true; + } + + // insert + GGML_ASSERT(hash_table[i] == NULL); + hash_table[i] = p; + return false; +} + +static bool hash_contains(void * hash_table[], void * p) { + size_t i = hash_find(hash_table, p); + return (i < GGML_GRAPH_HASHTABLE_SIZE) && (hash_table[i] == p); +} + +struct hash_map { + void * keys[GGML_GRAPH_HASHTABLE_SIZE]; + void * vals[GGML_GRAPH_HASHTABLE_SIZE]; +}; + +static struct hash_map * new_hash_map(void) { + struct hash_map * result = malloc(sizeof(struct hash_map)); + for (int i=0; ikeys[i] = NULL; + result->vals[i] = NULL; + } + return result; +} + +static void free_hash_map(struct hash_map * map) { + free(map); +} + +// gradient checkpointing + +static struct ggml_tensor * ggml_recompute_graph_node( + struct ggml_context * ctx, + struct ggml_cgraph * graph, + struct hash_map * replacements, + struct ggml_tensor * node) { + + if (node == NULL) { + return NULL; + } + + if (node->is_param) { + return node; + } + + if (!hash_contains(graph->visited_hash_table, node)) { + return node; + } + + int count_children = 0; + for (int k = 0; k < GGML_MAX_SRC; ++k) { + if (node->src[k]) { + ++count_children; + } + } + + if (count_children == 0) { + return node; + } + + size_t i = hash_find(replacements->keys, node); + GGML_ASSERT(i < GGML_GRAPH_HASHTABLE_SIZE); // assert that not full + if (replacements->keys[i] == node) { + return (struct ggml_tensor *) replacements->vals[i]; + } + + struct ggml_tensor * clone = ggml_new_tensor(ctx, node->type, node->n_dims, node->ne); + + // insert clone into replacements + GGML_ASSERT(replacements->keys[i] == NULL); // assert that we don't overwrite + replacements->keys[i] = node; + replacements->vals[i] = clone; + + clone->op = node->op; + clone->grad = node->grad; + clone->is_param = node->is_param; + clone->extra = node->extra; + for (int k = 0; k < GGML_MAX_DIMS; ++k) { + clone->nb[k] = node->nb[k]; + } + for (int k = 0; k < GGML_MAX_SRC; ++k) { + clone->src[k] = ggml_recompute_graph_node(ctx, graph, replacements, node->src[k]); + } + if (node->view_src != NULL) { + clone->data = (node->view_src->data == NULL) + ? NULL // view_src not yet allocated + : (char *) node->view_src->data // view_src already allocated + + node->view_offs; + clone->view_src = node->view_src; + clone->view_offs = node->view_offs; + } + + GGML_ASSERT(sizeof(node->op_params) == sizeof(int32_t) * (GGML_MAX_OP_PARAMS / sizeof(int32_t))); + GGML_ASSERT(sizeof(node->name) == GGML_MAX_NAME); + memcpy(clone->op_params, node->op_params, sizeof(node->op_params)); + ggml_format_name(clone, "%s (clone)", ggml_get_name(node)); + + return clone; +} + +void ggml_build_backward_gradient_checkpointing( + struct ggml_context * ctx, + struct ggml_cgraph * gf, + struct ggml_cgraph * gb, + struct ggml_cgraph * gb_tmp, + struct ggml_tensor * * checkpoints, + int n_checkpoints) { + *gb_tmp = *gf; + ggml_build_backward_expand(ctx, gf, gb_tmp, true); + + if (n_checkpoints <= 0) { + *gb = *gb_tmp; + return; + } + + struct hash_map * replacements = new_hash_map(); + + // insert checkpoints in replacements + for (int i = 0; i < n_checkpoints; ++i) { + size_t k = hash_find(replacements->keys, checkpoints[i]); + GGML_ASSERT(k < GGML_GRAPH_HASHTABLE_SIZE); // assert that not full + GGML_ASSERT(replacements->keys[k] == NULL); // assert that we don't overwrite + replacements->keys[k] = checkpoints[i]; + replacements->vals[k] = checkpoints[i]; + } + + *gb = *gf; + // rewrite gb_tmp->nodes[gf->n_nodes:gb_tmp->n_nodes], + // replacing references to gb_tmp->nodes[0:gf->n_nodes] ( == gf->nodes[0:gf->n_nodes]), + // by recomputing them from checkpoints + for (int i = gf->n_nodes; in_nodes; ++i) { + struct ggml_tensor * node = gb_tmp->nodes[i]; + for (int k = 0; k < GGML_MAX_SRC; ++k) { + // insert new tensors recomputing src, reusing already made replacements, + // remember replacements: remember new tensors with mapping from corresponding gf nodes + // recurse for input tensors, + // unless (i.e. terminating when) input tensors are replacments (like checkpoints) + node->src[k] = ggml_recompute_graph_node(ctx, gf, replacements, node->src[k]); + } + // insert rewritten backward node with replacements made into resulting backward graph gb + ggml_build_forward_expand(gb, node); + } + + free_hash_map(replacements); +} + +// functions to change gradients considering the case that input a might be initial gradient with zero value + +static struct ggml_tensor * ggml_add_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, void * zero_table[]) { + if (hash_contains(zero_table, a)) { + return b; + } else { + return ggml_add_impl(ctx, a, b, false); + } +} + +static struct ggml_tensor * ggml_acc_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, size_t nb1, size_t nb2, size_t nb3, size_t offset, void * zero_table[]) { + if (hash_contains(zero_table, a)) { + struct ggml_tensor * a_zero = ggml_scale(ctx, a, ggml_new_f32(ctx, 0)); + return ggml_acc_impl(ctx, a_zero, b, nb1, nb2, nb3, offset, false); + } else { + return ggml_acc_impl(ctx, a, b, nb1, nb2, nb3, offset, false); + } +} + +static struct ggml_tensor * ggml_add1_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, void * zero_table[]) { + if (hash_contains(zero_table, a)) { + return ggml_repeat(ctx, b, a); + } else { + return ggml_add1_impl(ctx, a, b, false); + } +} + +static struct ggml_tensor * ggml_sub_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, void * zero_table[]) { + if (hash_contains(zero_table, a)) { + return ggml_neg(ctx, b); + } else { + return ggml_sub_impl(ctx, a, b, false); + } +} + +static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, void * zero_table[]) { struct ggml_tensor * src0 = tensor->src[0]; struct ggml_tensor * src1 = tensor->src[1]; @@ -16017,34 +16747,34 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor case GGML_OP_DUP: { if (src0->grad) { - src0->grad = ggml_add_impl(ctx, src0->grad, tensor->grad, inplace); + src0->grad = ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table); } } break; case GGML_OP_ADD: { if (src0->grad) { - src0->grad = ggml_add_impl(ctx, src0->grad, tensor->grad, inplace); + src0->grad = ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table); } if (src1->grad) { - src1->grad = ggml_add_impl(ctx, src1->grad, tensor->grad, inplace); + src1->grad = ggml_add_or_set(ctx, src1->grad, tensor->grad, zero_table); } } break; case GGML_OP_ADD1: { if (src0->grad) { - src0->grad = ggml_add_impl(ctx, src0->grad, tensor->grad, inplace); + src0->grad = ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table); } if (src1->grad) { - src1->grad = ggml_add_impl(ctx, + src1->grad = ggml_add_or_set(ctx, src1->grad, ggml_mean(ctx, tensor->grad), // TODO: should probably be sum instead of mean - inplace); + zero_table); } } break; case GGML_OP_ACC: { if (src0->grad) { - src0->grad = ggml_add_impl(ctx, src0->grad, tensor->grad, inplace); + src0->grad = ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table); } if (src1->grad) { const size_t nb1 = ((int32_t *) tensor->op_params)[0]; @@ -16061,117 +16791,117 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor nb1, nb2, nb3, offset); src1->grad = - ggml_add_impl(ctx, + ggml_add_or_set(ctx, src1->grad, ggml_reshape(ctx, ggml_cont(ctx, tensor_grad_view), src1->grad), - inplace); + zero_table); } } break; case GGML_OP_SUB: { if (src0->grad) { - src0->grad = ggml_add_impl(ctx, src0->grad, tensor->grad, inplace); + src0->grad = ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table); } if (src1->grad) { - src1->grad = ggml_sub_impl(ctx, src1->grad, tensor->grad, inplace); + src1->grad = ggml_sub_or_set(ctx, src1->grad, tensor->grad, zero_table); } } break; case GGML_OP_MUL: { if (src0->grad) { src0->grad = - ggml_add_impl(ctx, + ggml_add_or_set(ctx, src0->grad, ggml_mul(ctx, src1, tensor->grad), - inplace); + zero_table); } if (src1->grad) { src1->grad = - ggml_add_impl(ctx, + ggml_add_or_set(ctx, src1->grad, ggml_mul(ctx, src0, tensor->grad), - inplace); + zero_table); } } break; case GGML_OP_DIV: { if (src0->grad) { src0->grad = - ggml_add_impl(ctx, + ggml_add_or_set(ctx, src0->grad, ggml_div(ctx, tensor->grad, src1), - inplace); + zero_table); } if (src1->grad) { src1->grad = - ggml_sub_impl(ctx, + ggml_sub_or_set(ctx, src1->grad, ggml_mul(ctx, tensor->grad, ggml_div(ctx, tensor, src1)), - inplace); + zero_table); } } break; case GGML_OP_SQR: { if (src0->grad) { src0->grad = - ggml_add_impl(ctx, + ggml_add_or_set(ctx, src0->grad, ggml_scale(ctx, ggml_mul(ctx, src0, tensor->grad), ggml_new_f32(ctx, 2.0f)), - inplace); + zero_table); } } break; case GGML_OP_SQRT: { if (src0->grad) { src0->grad = - ggml_add_impl(ctx, + ggml_add_or_set(ctx, src0->grad, ggml_scale(ctx, ggml_div(ctx, tensor->grad, tensor), ggml_new_f32(ctx, 0.5f)), - inplace); + zero_table); } } break; case GGML_OP_LOG: { if (src0->grad) { src0->grad = - ggml_add_impl(ctx, + ggml_add_or_set(ctx, src0->grad, ggml_div(ctx, tensor->grad, src0), - inplace); + zero_table); } } break; case GGML_OP_SUM: { if (src0->grad) { src0->grad = - ggml_add1_impl(ctx, + ggml_add1_or_set(ctx, src0->grad, tensor->grad, - inplace); + zero_table); } } break; case GGML_OP_SUM_ROWS: { if (src0->grad) { src0->grad = - ggml_add_impl(ctx, + ggml_add_or_set(ctx, src0->grad, ggml_repeat(ctx, tensor->grad, src0->grad), - inplace); + zero_table); } } break; case GGML_OP_MEAN: @@ -16183,20 +16913,20 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor { // necessary for llama if (src0->grad) { - src0->grad = ggml_add_impl(ctx, + src0->grad = ggml_add_or_set(ctx, src0->grad, ggml_repeat_back(ctx, tensor->grad, src0->grad), - inplace); + zero_table); } } break; case GGML_OP_REPEAT_BACK: { if (src0->grad) { // TODO: test this - src0->grad = ggml_add_impl(ctx, + src0->grad = ggml_add_or_set(ctx, src0->grad, ggml_repeat(ctx, tensor->grad, src0->grad), - inplace); + zero_table); } } break; case GGML_OP_CONCAT: @@ -16218,10 +16948,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor float eps; memcpy(&eps, tensor->op_params, sizeof(float)); - src0->grad = ggml_add_impl(ctx, + src0->grad = ggml_add_or_set(ctx, src0->grad, ggml_rms_norm_back(ctx, src0, tensor->grad, eps), - inplace); + zero_table); } } break; case GGML_OP_RMS_NORM_BACK: @@ -16245,37 +16975,49 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor // ds0 = dt.dot(s1.T) #.T gives the transpose of the matrix // ds1 = t.T.dot(dt) - // tensor.shape [m,p] - // src0.shape [n,m] - // src1.shape [n,p] + // tensor.shape [m,p,qq,rr] + // src0.shape [n,m,q1,r1] + // src1.shape [n,p,qq,rr] // necessary for llama if (src0->grad) { + struct ggml_tensor * s1_tg = + ggml_out_prod(ctx, // [n,m,qq,rr] + src1, // [n,p,qq,rr] + tensor->grad); // [m,p,qq,rr] + const int64_t qq = s1_tg->ne[2]; + const int64_t rr = s1_tg->ne[3]; + const int64_t q1 = src0->ne[2]; + const int64_t r1 = src0->ne[3]; + const bool ne2_broadcasted = qq > q1; + const bool ne3_broadcasted = rr > r1; + if (ne2_broadcasted || ne3_broadcasted) { + // sum broadcast repetitions of s1_tg into shape of src0 + s1_tg = ggml_repeat_back(ctx, s1_tg, src0); + } src0->grad = - ggml_add_impl(ctx, - src0->grad, - ggml_out_prod(ctx, // [n,m] - src1, // [n,p] - tensor->grad), // [m,p] - inplace); + ggml_add_or_set(ctx, + src0->grad, // [n,m,q1,r1] + s1_tg, // [n,m,q1,r1] + zero_table); } if (src1->grad) { src1->grad = - ggml_add_impl(ctx, - src1->grad, - // ggml_mul_mat(ctx, // [n,p] - // ggml_cont(ctx, // [m,n] - // ggml_transpose(ctx, src0)), // [m,n] - // tensor->grad), // [m,p] + ggml_add_or_set(ctx, + src1->grad, // [n,p,qq,rr] + // ggml_mul_mat(ctx, // [n,p,qq,rr] + // ggml_cont(ctx, // [m,n,q1,r1] + // ggml_transpose(ctx, src0)), // [m,n,q1,r1] + // tensor->grad), // [m,p,qq,rr] // // when src0 is bigger than tensor->grad (this is mostly the case in llama), // // avoid transpose of src0, rather transpose smaller tensor->grad // // and then use ggml_out_prod - ggml_out_prod(ctx, // [n,p] - src0, // [n,m] - ggml_transpose(ctx, // [p,m] - tensor->grad)), // [m,p] - inplace); + ggml_out_prod(ctx, // [n,p,qq,rr] + src0, // [n,m,q1,r1] + ggml_transpose(ctx, // [p,m,qq,rr] + tensor->grad)), // [m,p,qq,rr] + zero_table); } } break; case GGML_OP_OUT_PROD: @@ -16287,17 +17029,17 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor // necessary for llama if (src0->grad) { src0->grad = - ggml_add_impl(ctx, + ggml_add_or_set(ctx, src0->grad, ggml_scale_impl(ctx, tensor->grad, src1, false), - inplace); + zero_table); } if (src1->grad) { src1->grad = - ggml_add_impl(ctx, + ggml_add_or_set(ctx, src1->grad, ggml_sum(ctx, ggml_mul_impl(ctx, tensor->grad, src0, false)), - inplace); + zero_table); } } break; case GGML_OP_SET: @@ -16324,23 +17066,23 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor } if (src0->grad) { - src0->grad = ggml_add_impl(ctx, + src0->grad = ggml_add_or_set(ctx, src0->grad, ggml_acc_impl(ctx, tensor->grad, ggml_neg(ctx, tensor_grad_view), nb1, nb2, nb3, offset, false), - inplace); + zero_table); } if (src1->grad) { src1->grad = - ggml_add_impl(ctx, + ggml_add_or_set(ctx, src1->grad, ggml_reshape(ctx, ggml_cont(ctx, tensor_grad_view), src1->grad), - inplace); + zero_table); } } break; case GGML_OP_CPY: @@ -16351,7 +17093,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor // tensor = src0 * 1 + src1 * 0 if (src0->grad) { // dsrc0 = dtensor * 1 - src0->grad = ggml_add_impl(ctx, src0->grad, tensor->grad, inplace); + src0->grad = ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table); } if (src1->grad) { // dsrc1 = dtensor * 0 -> noop @@ -16363,7 +17105,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor if (src0->grad) { GGML_ASSERT(ggml_is_contiguous(src0->grad)); GGML_ASSERT(ggml_is_contiguous(tensor->grad)); - src0->grad = ggml_add_impl(ctx, src0->grad, tensor->grad, inplace); + src0->grad = ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table); } } break; case GGML_OP_RESHAPE: @@ -16371,9 +17113,13 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor // necessary for llama if (src0->grad) { src0->grad = - ggml_add_impl(ctx, src0->grad, - ggml_reshape(ctx, tensor->grad, src0->grad), - inplace); + ggml_add_or_set(ctx, src0->grad, + ggml_reshape(ctx, + ggml_is_contiguous(tensor->grad) + ? tensor->grad + : ggml_cont(ctx, tensor->grad), + src0->grad), + zero_table); } } break; case GGML_OP_VIEW: @@ -16402,7 +17148,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor nb3 = (nb3 / n0) * ng; } - src0->grad = ggml_acc_impl(ctx, src0->grad, tensor->grad, nb1, nb2, nb3, offset, inplace); + src0->grad = ggml_acc_or_set(ctx, src0->grad, tensor->grad, nb1, nb2, nb3, offset, zero_table); } } break; case GGML_OP_PERMUTE: @@ -16420,14 +17166,14 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor axes_backward[axis2] = 2; axes_backward[axis3] = 3; src0->grad = - ggml_add_impl(ctx, src0->grad, + ggml_add_or_set(ctx, src0->grad, ggml_permute(ctx, tensor->grad, axes_backward[0], axes_backward[1], axes_backward[2], axes_backward[3]), - inplace); + zero_table); } } break; case GGML_OP_TRANSPOSE: @@ -16435,9 +17181,9 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor // necessary for llama if (src0->grad) { src0->grad = - ggml_add_impl(ctx, src0->grad, + ggml_add_or_set(ctx, src0->grad, ggml_transpose(ctx, tensor->grad), - inplace); + zero_table); } } break; case GGML_OP_GET_ROWS: @@ -16445,9 +17191,11 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor // necessary for llama (only for tokenizer) if (src0->grad) { src0->grad = - ggml_add_impl(ctx, src0->grad, + ggml_add_or_set(ctx, src0->grad, + // last ggml_get_rows_back argument src0->grad is only + // necessary to setup correct output shape ggml_get_rows_back(ctx, tensor->grad, src1, src0->grad), - inplace); + zero_table); } if (src1->grad) { // noop @@ -16467,9 +17215,9 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor if (src0->grad) { const int n_past = ((int32_t *) tensor->op_params)[0]; src0->grad = - ggml_add_impl(ctx, src0->grad, + ggml_add_or_set(ctx, src0->grad, ggml_diag_mask_zero_impl(ctx, tensor->grad, n_past, false), - inplace); + zero_table); } } break; case GGML_OP_DIAG_MASK_ZERO: @@ -16478,9 +17226,9 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor if (src0->grad) { const int n_past = ((int32_t *) tensor->op_params)[0]; src0->grad = - ggml_add_impl(ctx, src0->grad, + ggml_add_or_set(ctx, src0->grad, ggml_diag_mask_zero_impl(ctx, tensor->grad, n_past, false), - inplace); + zero_table); } } break; case GGML_OP_SOFT_MAX: @@ -16488,9 +17236,9 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor // necessary for llama if (src0->grad) { src0->grad = - ggml_add_impl(ctx, src0->grad, + ggml_add_or_set(ctx, src0->grad, ggml_soft_max_back(ctx, tensor->grad, tensor), - inplace); + zero_table); } } break; @@ -16502,7 +17250,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor { // necessary for llama if (src0->grad) { - const int n_past = ((int32_t *) tensor->op_params)[0]; + //const int n_past = ((int32_t *) tensor->op_params)[0]; const int n_dims = ((int32_t *) tensor->op_params)[1]; const int mode = ((int32_t *) tensor->op_params)[2]; const int n_ctx = ((int32_t *) tensor->op_params)[3]; @@ -16515,11 +17263,11 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor memcpy(&xpos_base, (int32_t *) tensor->op_params + 6, sizeof(float)); memcpy(&xpos_down, (int32_t *) tensor->op_params + 7, sizeof(bool)); - src0->grad = ggml_add_impl(ctx, + src0->grad = ggml_add_or_set(ctx, src0->grad, ggml_rope_back(ctx, tensor->grad, - n_past, + src1, n_dims, mode, n_ctx, @@ -16527,13 +17275,13 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor freq_scale, xpos_base, xpos_down), - inplace); + zero_table); } } break; case GGML_OP_ROPE_BACK: { if (src0->grad) { - const int n_past = ((int32_t *) tensor->op_params)[0]; + //const int n_past = ((int32_t *) tensor->op_params)[0]; const int n_dims = ((int32_t *) tensor->op_params)[1]; const int mode = ((int32_t *) tensor->op_params)[2]; const int n_ctx = ((int32_t *) tensor->op_params)[3]; @@ -16546,11 +17294,11 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor memcpy(&xpos_base, (int32_t *) tensor->op_params + 6, sizeof(float)); memcpy(&xpos_down, (int32_t *) tensor->op_params + 7, sizeof(bool)); - src0->grad = ggml_add_impl(ctx, + src0->grad = ggml_add_or_set(ctx, src0->grad, ggml_rope_impl(ctx, tensor->grad, - n_past, + src1, n_dims, mode, n_ctx, @@ -16559,7 +17307,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor xpos_base, xpos_down, false), - inplace); + zero_table); } } break; case GGML_OP_ALIBI: @@ -16610,145 +17358,42 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor masked); } - if (src0->grad) { - struct ggml_tensor * grad_q = NULL; - const size_t nb0 = flash_grad->nb[0]; - const size_t offset = 0; - switch(src0->n_dims) { - case 2: - { - grad_q = ggml_view_2d(ctx, - flash_grad, - src0->ne[0], - src0->ne[1], - nb0*src0->ne[0], - offset); - } break; - case 3: - { - grad_q = ggml_view_3d(ctx, - flash_grad, - src0->ne[0], - src0->ne[1], - src0->ne[2], - nb0*src0->ne[0], - nb0*src0->ne[0]*src0->ne[1], - offset); - } break; - case 4: - { - grad_q = ggml_view_4d(ctx, - flash_grad, - src0->ne[0], - src0->ne[1], - src0->ne[2], - src0->ne[3], - nb0*src0->ne[0], - nb0*src0->ne[0]*src0->ne[1], - nb0*src0->ne[0]*src0->ne[1]*src0->ne[2], - offset); - } break; - } + struct ggml_tensor * src2 = tensor->src[2]; + const int64_t elem_q = ggml_nelements(src0); + const int64_t elem_k = ggml_nelements(src1); + const int64_t elem_v = ggml_nelements(src2); + + enum ggml_type result_type = flash_grad->type; + GGML_ASSERT(ggml_blck_size(result_type) == 1); + const size_t tsize = ggml_type_size(result_type); + + const size_t offs_q = 0; + const size_t offs_k = offs_q + GGML_PAD(elem_q * tsize, GGML_MEM_ALIGN); + const size_t offs_v = offs_k + GGML_PAD(elem_k * tsize, GGML_MEM_ALIGN); - src0->grad = ggml_add_impl(ctx, + if (src0->grad) { + struct ggml_tensor * view_q = ggml_view_1d(ctx, flash_grad, elem_q, offs_q); + struct ggml_tensor * grad_q = ggml_reshape(ctx, view_q, src0); + src0->grad = ggml_add_or_set(ctx, src0->grad, grad_q, - inplace); + zero_table); } - if (src1->grad) { - struct ggml_tensor * grad_k = NULL; - const size_t nb0 = flash_grad->nb[0]; - const size_t offset = nb0*src0->ne[0]*src0->ne[1]*src0->ne[2]*src0->ne[3]; - switch(src1->n_dims) { - case 2: - { - grad_k = ggml_view_2d(ctx, - flash_grad, - src1->ne[0], - src1->ne[1], - nb0*src1->ne[0], - offset); - } break; - case 3: - { - grad_k = ggml_view_3d(ctx, - flash_grad, - src1->ne[0], - src1->ne[1], - src1->ne[2], - nb0*src1->ne[0], - nb0*src1->ne[0]*src1->ne[1], - offset); - } break; - case 4: - { - grad_k = ggml_view_4d(ctx, - flash_grad, - src1->ne[0], - src1->ne[1], - src1->ne[2], - src1->ne[3], - nb0*src1->ne[0], - nb0*src1->ne[0]*src1->ne[1], - nb0*src1->ne[0]*src1->ne[1]*src1->ne[2], - offset); - } break; - } - - src1->grad = ggml_add_impl(ctx, + struct ggml_tensor * view_k = ggml_view_1d(ctx, flash_grad, elem_k, offs_k); + struct ggml_tensor * grad_k = ggml_reshape(ctx, view_k, src1); + src1->grad = ggml_add_or_set(ctx, src1->grad, grad_k, - inplace); + zero_table); } - - struct ggml_tensor * opt0 = tensor->src[2]; - - if (opt0->grad) { - struct ggml_tensor * grad_v = NULL; - const size_t nb0 = flash_grad->nb[0]; - const size_t offset = nb0*src0->ne[0]*src0->ne[1]*src0->ne[2]*src0->ne[3] - + nb0*src1->ne[0]*src1->ne[1]*src1->ne[2]*src1->ne[3]; - switch(opt0->n_dims) { - case 2: - { - grad_v = ggml_view_2d(ctx, - flash_grad, - opt0->ne[0], - opt0->ne[1], - nb0*opt0->ne[0], - offset); - } break; - case 3: - { - grad_v = ggml_view_3d(ctx, - flash_grad, - opt0->ne[0], - opt0->ne[1], - opt0->ne[2], - nb0*opt0->ne[0], - nb0*opt0->ne[0]*opt0->ne[1], - offset); - } break; - case 4: - { - grad_v = ggml_view_4d(ctx, - flash_grad, - opt0->ne[0], - opt0->ne[1], - opt0->ne[2], - opt0->ne[3], - nb0*opt0->ne[0], - nb0*opt0->ne[0]*opt0->ne[1], - nb0*opt0->ne[0]*opt0->ne[1]*opt0->ne[2], - offset); - } break; - } - - opt0->grad = ggml_add_impl(ctx, - opt0->grad, + if (src2->grad) { + struct ggml_tensor * view_v = ggml_view_1d(ctx, flash_grad, elem_v, offs_v); + struct ggml_tensor * grad_v = ggml_reshape(ctx, view_v, src2); + src2->grad = ggml_add_or_set(ctx, + src2->grad, grad_v, - inplace); + zero_table); } } break; case GGML_OP_FLASH_FF: @@ -16768,12 +17413,12 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor { if (src0->grad) { src0->grad = - ggml_add_impl(ctx, + ggml_add_or_set(ctx, src0->grad, ggml_mul(ctx, ggml_sgn(ctx, src0), tensor->grad), - inplace); + zero_table); } } break; case GGML_UNARY_OP_SGN: @@ -16785,7 +17430,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor case GGML_UNARY_OP_NEG: { if (src0->grad) { - src0->grad = ggml_sub_impl(ctx, src0->grad, tensor->grad, inplace); + src0->grad = ggml_sub_or_set(ctx, src0->grad, tensor->grad, zero_table); } } break; case GGML_UNARY_OP_STEP: @@ -16805,12 +17450,12 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor case GGML_UNARY_OP_RELU: { if (src0->grad) { - src0->grad = ggml_add_impl(ctx, + src0->grad = ggml_add_or_set(ctx, src0->grad, ggml_mul(ctx, ggml_step(ctx, src0), tensor->grad), - inplace); + zero_table); } } break; case GGML_UNARY_OP_GELU: @@ -16825,10 +17470,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor { // necessary for llama if (src0->grad) { - src0->grad = ggml_add_impl(ctx, + src0->grad = ggml_add_or_set(ctx, src0->grad, ggml_silu_back(ctx, src0, tensor->grad), - inplace); + zero_table); } } break; default: @@ -16851,13 +17496,13 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor case GGML_OP_CROSS_ENTROPY_LOSS: { if (src0->grad) { - src0->grad = ggml_add_impl(ctx, + src0->grad = ggml_add_or_set(ctx, src0->grad, ggml_cross_entropy_loss_back(ctx, src0, src1, tensor->grad), - inplace); + zero_table); } } break; case GGML_OP_CROSS_ENTROPY_LOSS_BACK: @@ -16873,34 +17518,12 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor GGML_ASSERT(false); } break; } -} - -static_assert(GGML_GRAPH_HASHTABLE_SIZE > GGML_MAX_NODES * 2, "GGML_GRAPH_HT_SIZE is too small"); - -static size_t hash(void * p) { - return (size_t)p % GGML_GRAPH_HASHTABLE_SIZE; -} -static bool hash_insert(void * hash_table[], void * p) { - size_t h = hash(p); - - // linear probing - size_t i = h; - while (hash_table[i] != NULL && hash_table[i] != p) { - i = (i + 1) % GGML_GRAPH_HASHTABLE_SIZE; - if (i == h) { - // hash table is full - GGML_ASSERT(false); + for (int i = 0; i < GGML_MAX_SRC; ++i) { + if (tensor->src[i] && tensor->src[i]->grad) { + GGML_ASSERT(ggml_are_same_shape(tensor->src[i], tensor->src[i]->grad)); } } - - if (hash_table[i] == p) { - return true; - } - - // insert - hash_table[i] = p; - return false; } static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * node) { @@ -16918,8 +17541,12 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * } for (int i = 0; i < GGML_MAX_SRC; ++i) { - if (node->src[i]) { - ggml_visit_parents(cgraph, node->src[i]); + const int k = + (cgraph->order == GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT) ? i : + (cgraph->order == GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT) ? (GGML_MAX_SRC-1-i) : + /* unknown order, just fall back to using i*/ i; + if (node->src[k]) { + ggml_visit_parents(cgraph, node->src[k]); } } @@ -16978,6 +17605,7 @@ struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) { /*.grads =*/ { NULL }, /*.leafs =*/ { NULL }, /*.hash_table =*/ { NULL }, + /*.order =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT, /*.perf_runs =*/ 0, /*.perf_cycles =*/ 0, /*.perf_time_us =*/ 0, @@ -17003,12 +17631,22 @@ void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * } } + // remember original gradients which start with zero values + void ** zero_table = malloc(sizeof(void *) * GGML_GRAPH_HASHTABLE_SIZE); + memset(zero_table, 0, sizeof(void*) * GGML_GRAPH_HASHTABLE_SIZE); + for (int i = 0; i < gf->n_nodes; i++) { + if (gf->grads[i]) { + hash_insert(zero_table, gf->grads[i]); + } + } + for (int i = gf->n_nodes - 1; i >= 0; i--) { struct ggml_tensor * node = gf->nodes[i]; - // because we detached the grad nodes from the original graph, we can afford inplace operations + // inplace operations to add gradients are not created by ggml_compute_backward + // use allocator to automatically make inplace operations if (node->grad) { - ggml_compute_backward(ctx, node, keep); + ggml_compute_backward(ctx, node, zero_table); } } @@ -17020,6 +17658,8 @@ void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * ggml_build_forward_expand(gb, node->grad); } } + + free(zero_table); } struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep) { @@ -17039,6 +17679,7 @@ struct ggml_cgraph * ggml_new_graph(struct ggml_context * ctx) { /*.grads =*/ { NULL }, /*.leafs =*/ { NULL }, /*.hash_table =*/ { NULL }, + /*.order =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT, /*.perf_runs =*/ 0, /*.perf_cycles =*/ 0, /*.perf_time_us =*/ 0, @@ -17425,7 +18066,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { } break; case GGML_OP_CONCAT: case GGML_OP_MUL_MAT: - case GGML_OP_OUT_PROD: { n_tasks = n_threads; @@ -17467,6 +18107,18 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { cur = 0; } + work_size = MAX(work_size, cur); + } break; + case GGML_OP_OUT_PROD: + { + n_tasks = n_threads; + + size_t cur = 0; + + if (ggml_is_quantized(node->src[0]->type)) { + cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks; + } + work_size = MAX(work_size, cur); } break; case GGML_OP_SCALE: @@ -18560,7 +19212,7 @@ static void ggml_opt_get_params(int np, struct ggml_tensor * const ps[], float * } static void ggml_opt_get_grad(int np, struct ggml_tensor * const ps[], float * g) { - int i = 0; + int64_t i = 0; for (int p = 0; p < np; ++p) { const int64_t ne = ggml_nelements(ps[p]) ; // TODO: add function to get all elements at once @@ -18570,6 +19222,17 @@ static void ggml_opt_get_grad(int np, struct ggml_tensor * const ps[], float * g } } +static void ggml_opt_acc_grad(int np, struct ggml_tensor * const ps[], float * g, float scale) { + int64_t i = 0; + for (int p = 0; p < np; ++p) { + const int64_t ne = ggml_nelements(ps[p]) ; + // TODO: add function to get all elements at once + for (int64_t j = 0; j < ne; ++j) { + g[i++] += ggml_get_f32_1d(ps[p]->grad, j) * scale; + } + } +} + // // ADAM // @@ -18618,26 +19281,43 @@ static enum ggml_opt_result ggml_opt_adam( const float eps = params.adam.eps; const float gclip = params.adam.gclip; const int decay_min_ndim = params.adam.decay_min_ndim; + const int n_accum = MAX(1, params.n_gradient_accumulation); + const float accum_norm = 1.0f / (float) n_accum; + float * g = opt->adam.g->data; // gradients float * m = opt->adam.m->data; // first moment float * v = opt->adam.v->data; // second moment float * pf = params.past > 0 ? opt->adam.pf->data : NULL; // past function values - if (callback) { - callback(callback_data, &sched); - } - - // compute the function value - ggml_graph_reset (gf); - ggml_set_f32 (f->grad, 1.0f); - struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads); struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size); cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs; - ggml_graph_compute(gb, &cplan); - opt->adam.fx_prev = ggml_get_f32_1d(f, 0); + bool cancel = false; + + // compute the function value + float fx = 0; + ggml_set_zero(opt->adam.g); + for (int accum_step = 0; accum_step < n_accum; ++accum_step) { + if (callback) { + callback(callback_data, accum_step, &sched, &cancel); + if (cancel) { + break; + } + } + // ggml_graph_reset (gf); + ggml_set_f32 (f->grad, 1.0f); + ggml_graph_compute(gb, &cplan); + ggml_opt_acc_grad(np, ps, g, accum_norm); + fx += ggml_get_f32_1d(f, 0); + } + if (cancel) { + return GGML_OPT_DID_NOT_CONVERGE; + } + fx *= accum_norm; + + opt->adam.fx_prev = fx; opt->adam.fx_best = opt->adam.fx_prev; if (pf) { pf[opt->iter % params.past] = opt->adam.fx_prev; @@ -18660,6 +19340,9 @@ static enum ggml_opt_result ggml_opt_adam( // run the optimizer for (int t = 0; t < params.adam.n_iter; ++t) { + if (cancel) { + break; + } opt->iter = iter0 + t + 1; GGML_PRINT_DEBUG ("=== iter %d ===\n", t); @@ -18682,12 +19365,8 @@ static enum ggml_opt_result ggml_opt_adam( if (gclip > 0.0f) { // gradient clipping ggml_float sum = 0.0; - for (int p = 0; p < np; ++p) { - const int64_t ne = ggml_nelements(ps[p]); - for (int64_t j = 0; j < ne; ++j) { - float g = ggml_get_f32_1d(ps[p]->grad, j); - sum += (ggml_float)(g*g); - } + for (int64_t i = 0; i < nx; ++i) { + sum += (ggml_float)(g[i]*g[i]); } ggml_float norm = sqrt(sum); if (norm > (ggml_float) gclip) { @@ -18701,10 +19380,10 @@ static enum ggml_opt_result ggml_opt_adam( const int64_t ne = ggml_nelements(ps[p]); const float p_decay = ((ps[p]->n_dims >= decay_min_ndim) ? decay : 0.0f) * sched; for (int64_t j = 0; j < ne; ++j) { - float x = ggml_get_f32_1d(ps[p], j); - float g = ggml_get_f32_1d(ps[p]->grad, j)*gnorm; - m[i] = m[i]*beta1 + g*(1.0f - beta1); - v[i] = v[i]*beta2 + g*g*(1.0f - beta2); + float x = ggml_get_f32_1d(ps[p], j); + float g_ = g[i]*gnorm; + m[i] = m[i]*beta1 + g_*(1.0f - beta1); + v[i] = v[i]*beta2 + g_*g_*(1.0f - beta2); float mh = m[i]*beta1h; float vh = v[i]*beta2h; vh = sqrtf(vh) + eps; @@ -18715,16 +19394,26 @@ static enum ggml_opt_result ggml_opt_adam( } } - if (callback) { - callback(callback_data, &sched); + fx = 0; + ggml_set_zero(opt->adam.g); + for (int accum_step = 0; accum_step < n_accum; ++accum_step) { + if (callback) { + callback(callback_data, accum_step, &sched, &cancel); + if (cancel) { + break; + } + } + // ggml_graph_reset (gf); + ggml_set_f32 (f->grad, 1.0f); + ggml_graph_compute(gb, &cplan); + ggml_opt_acc_grad(np, ps, g, accum_norm); + fx += ggml_get_f32_1d(f, 0); } + if (cancel) { + break; + } + fx *= accum_norm; - ggml_graph_reset (gf); - ggml_set_f32 (f->grad, 1.0f); - - ggml_graph_compute(gb, &cplan); - - const float fx = ggml_get_f32_1d(f, 0); opt->loss_after = fx; @@ -18804,11 +19493,11 @@ static enum ggml_opt_result linesearch_backtracking( float * step, const float * xp, struct ggml_tensor * f, - struct ggml_cgraph * gf, struct ggml_cgraph * gb, struct ggml_cplan * cplan, const int np, struct ggml_tensor * ps[], + bool * cancel, ggml_opt_callback callback, void * callback_data) { int count = 0; @@ -18822,6 +19511,9 @@ static enum ggml_opt_result linesearch_backtracking( const float dec = 0.5f; const float inc = 2.1f; + const int n_accum = MAX(1, params->n_gradient_accumulation); + const float accum_norm = 1.0f / (float) n_accum; + if (*step <= 0.f) { return GGML_LINESEARCH_INVALID_PARAMETERS; } @@ -18838,13 +19530,7 @@ static enum ggml_opt_result linesearch_backtracking( finit = *fx; dgtest = params->lbfgs.ftol*dginit; - while (true) { - if (callback) { - // LBFG-S does not support learning rate -> ignore learning schedule - float sched = 0; - callback(callback_data, &sched); - } - + while (!*cancel) { ggml_vec_cpy_f32(nx, x, xp); ggml_vec_mad_f32(nx, x, d, *step); @@ -18852,14 +19538,28 @@ static enum ggml_opt_result linesearch_backtracking( { ggml_opt_set_params(np, ps, x); - ggml_graph_reset (gf); - ggml_set_f32 (f->grad, 1.0f); - - ggml_graph_compute(gb, cplan); - - ggml_opt_get_grad(np, ps, g); + *fx = 0; + memset(g, 0, sizeof(float)*nx); + for (int accum_step = 0; accum_step < n_accum; ++accum_step) { + if (callback) { + // LBFG-S does not support learning rate -> ignore learning schedule + float sched = 0; + callback(callback_data, accum_step, &sched, cancel); + if (*cancel) { + break; + } + } + // ggml_graph_reset (gf); + ggml_set_f32 (f->grad, 1.0f); + ggml_graph_compute(gb, cplan); + ggml_opt_acc_grad(np, ps, g, accum_norm); + *fx += ggml_get_f32_1d(f, 0); + } + if (*cancel) { + break; + } + *fx *= accum_norm; - *fx = ggml_get_f32_1d(f, 0); } ++count; @@ -18905,7 +19605,7 @@ static enum ggml_opt_result linesearch_backtracking( (*step) *= width; } - return GGML_LINESEARCH_FAIL; + GGML_UNREACHABLE(); } static enum ggml_opt_result ggml_opt_lbfgs( @@ -18960,6 +19660,9 @@ static enum ggml_opt_result ggml_opt_lbfgs( float * pf = params.past > 0 ? opt->lbfgs.pf->data : NULL; // past function values + const int n_accum = MAX(1, params.n_gradient_accumulation); + const float accum_norm = 1.0f / (float) n_accum; + float fx = 0.0f; // cost function value float xnorm = 0.0f; // ||x|| float gnorm = 0.0f; // ||g|| @@ -18973,24 +19676,33 @@ static enum ggml_opt_result ggml_opt_lbfgs( float * lm_s = opt->lbfgs.lms->data; float * lm_y = opt->lbfgs.lmy->data; - if (callback) { - // LBFG-S does not support learning rate -> ignore learning schedule - float sched = 0; - callback(callback_data, &sched); - } + bool cancel = false; // evaluate the function value and its gradient { ggml_opt_set_params(np, ps, x); - ggml_graph_reset (gf); - ggml_set_f32 (f->grad, 1.0f); - - ggml_graph_compute(gb, &cplan); - - ggml_opt_get_grad(np, ps, g); - - fx = ggml_get_f32_1d(f, 0); + fx = 0; + memset(g, 0, sizeof(float)*nx); + for (int accum_step = 0; accum_step < n_accum; ++accum_step) { + if (callback) { + // LBFG-S does not support learning rate -> ignore learning schedule + float sched = 0; + callback(callback_data, accum_step, &sched, &cancel); + if (cancel) { + break; + } + } + // ggml_graph_reset (gf); + ggml_set_f32 (f->grad, 1.0f); + ggml_graph_compute(gb, &cplan); + ggml_opt_acc_grad(np, ps, g, accum_norm); + fx += ggml_get_f32_1d(f, 0); + } + if (cancel) { + return GGML_OPT_DID_NOT_CONVERGE; + } + fx *= accum_norm; opt->loss_before = fx; opt->loss_after = fx; @@ -19048,7 +19760,10 @@ static enum ggml_opt_result ggml_opt_lbfgs( ggml_vec_cpy_f32(nx, xp, x); ggml_vec_cpy_f32(nx, gp, g); - ls = linesearch_backtracking(¶ms, nx, x, &fx, g, d, step, xp, f, gf, gb, &cplan, np, ps, callback, callback_data); + ls = linesearch_backtracking(¶ms, nx, x, &fx, g, d, step, xp, f, gb, &cplan, np, ps, &cancel, callback, callback_data); + if (!cancel) { + break; + } if (ls < 0) { // linesearch failed - go back to the previous point and return @@ -19157,7 +19872,7 @@ static enum ggml_opt_result ggml_opt_lbfgs( step[0] = 1.0; } - return GGML_OPT_DID_NOT_CONVERGE; + GGML_UNREACHABLE(); } struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) { @@ -19177,6 +19892,8 @@ struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) { .print_forward_graph = true, .print_backward_graph = true, + .n_gradient_accumulation = 1, + .adam = { .n_iter = 10000, .sched = 1.000f, @@ -19205,6 +19922,8 @@ struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) { .print_forward_graph = true, .print_backward_graph = true, + .n_gradient_accumulation = 1, + .lbfgs = { .m = 6, .n_iter = 100, @@ -19235,13 +19954,32 @@ GGML_API void ggml_opt_init( opt->iter = 0; opt->nx = nx; opt->just_initialized = true; + if (opt->ctx == NULL) { + struct ggml_init_params ctx_opt_params; + if (opt->params.type == GGML_OPT_ADAM) { + ctx_opt_params.mem_size = GGML_MEM_ALIGN*3 + ggml_tensor_overhead()*3 + ggml_type_size(GGML_TYPE_F32)*nx*3; + if (opt->params.past > 0) { + ctx_opt_params.mem_size += GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_size(GGML_TYPE_F32)*opt->params.past; + } + } else if (opt->params.type == GGML_OPT_LBFGS) { + ctx_opt_params.mem_size = GGML_MEM_ALIGN*9 + ggml_tensor_overhead()*9 + ggml_type_size(GGML_TYPE_F32)*(nx*5 + opt->params.lbfgs.m*2 + nx*opt->params.lbfgs.m*2); + if (opt->params.past > 0) { + ctx_opt_params.mem_size += GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_size(GGML_TYPE_F32)*opt->params.past; + } + } + ctx_opt_params.mem_buffer = NULL; + ctx_opt_params.no_alloc = false; + + opt->ctx = ggml_init(ctx_opt_params); + } switch (opt->params.type) { case GGML_OPT_ADAM: { - opt->adam.m = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); - opt->adam.v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); + opt->adam.g = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx); + opt->adam.m = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx); + opt->adam.v = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx); opt->adam.pf = params.past > 0 - ? ggml_new_tensor_1d(ctx, GGML_TYPE_F32, params.past) + ? ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, params.past) : NULL; ggml_set_zero(opt->adam.m); ggml_set_zero(opt->adam.v); @@ -19251,18 +19989,18 @@ GGML_API void ggml_opt_init( } break; case GGML_OPT_LBFGS: { - opt->lbfgs.x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); - opt->lbfgs.xp = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); - opt->lbfgs.g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); - opt->lbfgs.gp = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); - opt->lbfgs.d = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); + opt->lbfgs.x = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx); + opt->lbfgs.xp = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx); + opt->lbfgs.g = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx); + opt->lbfgs.gp = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx); + opt->lbfgs.d = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx); opt->lbfgs.pf = params.past > 0 - ? ggml_new_tensor_1d(ctx, GGML_TYPE_F32, params.past) + ? ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, params.past) : NULL; - opt->lbfgs.lmal = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, params.lbfgs.m); - opt->lbfgs.lmys = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, params.lbfgs.m); - opt->lbfgs.lms = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, nx, params.lbfgs.m); - opt->lbfgs.lmy = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, nx, params.lbfgs.m); + opt->lbfgs.lmal = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, params.lbfgs.m); + opt->lbfgs.lmys = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, params.lbfgs.m); + opt->lbfgs.lms = ggml_new_tensor_2d(opt->ctx, GGML_TYPE_F32, nx, params.lbfgs.m); + opt->lbfgs.lmy = ggml_new_tensor_2d(opt->ctx, GGML_TYPE_F32, nx, params.lbfgs.m); ggml_set_zero(opt->lbfgs.x); ggml_set_zero(opt->lbfgs.xp); ggml_set_zero(opt->lbfgs.g); @@ -19868,10 +20606,10 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p } break; case GGUF_TYPE_ARRAY: case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break; - }; + } } break; case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); - }; + } if (!ok) { break; @@ -20147,78 +20885,94 @@ int gguf_find_key(const struct gguf_context * ctx, const char * key) { return keyfound; } -const char * gguf_get_key(const struct gguf_context * ctx, int i) { - return ctx->kv[i].key.data; +const char * gguf_get_key(const struct gguf_context * ctx, int key_id) { + return ctx->kv[key_id].key.data; } -enum gguf_type gguf_get_kv_type(const struct gguf_context * ctx, int i) { - return ctx->kv[i].type; +enum gguf_type gguf_get_kv_type(const struct gguf_context * ctx, int key_id) { + return ctx->kv[key_id].type; } -enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int i) { - return ctx->kv[i].value.arr.type; +enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int key_id) { + GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY); + return ctx->kv[key_id].value.arr.type; } -const void * gguf_get_arr_data(const struct gguf_context * ctx, int i) { - return ctx->kv[i].value.arr.data; +const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id) { + GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY); + return ctx->kv[key_id].value.arr.data; } const char * gguf_get_arr_str(const struct gguf_context * ctx, int key_id, int i) { + GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY); struct gguf_kv * kv = &ctx->kv[key_id]; struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[i]; return str->data; } -int gguf_get_arr_n(const struct gguf_context * ctx, int i) { - return ctx->kv[i].value.arr.n; +int gguf_get_arr_n(const struct gguf_context * ctx, int key_id) { + GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY); + return ctx->kv[key_id].value.arr.n; } -uint8_t gguf_get_val_u8(const struct gguf_context * ctx, int i) { - return ctx->kv[i].value.uint8; +uint8_t gguf_get_val_u8(const struct gguf_context * ctx, int key_id) { + GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT8); + return ctx->kv[key_id].value.uint8; } -int8_t gguf_get_val_i8(const struct gguf_context * ctx, int i) { - return ctx->kv[i].value.int8; +int8_t gguf_get_val_i8(const struct gguf_context * ctx, int key_id) { + GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT8); + return ctx->kv[key_id].value.int8; } -uint16_t gguf_get_val_u16(const struct gguf_context * ctx, int i) { - return ctx->kv[i].value.uint16; +uint16_t gguf_get_val_u16(const struct gguf_context * ctx, int key_id) { + GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT16); + return ctx->kv[key_id].value.uint16; } -int16_t gguf_get_val_i16(const struct gguf_context * ctx, int i) { - return ctx->kv[i].value.int16; +int16_t gguf_get_val_i16(const struct gguf_context * ctx, int key_id) { + GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT16); + return ctx->kv[key_id].value.int16; } -uint32_t gguf_get_val_u32(const struct gguf_context * ctx, int i) { - return ctx->kv[i].value.uint32; +uint32_t gguf_get_val_u32(const struct gguf_context * ctx, int key_id) { + GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT32); + return ctx->kv[key_id].value.uint32; } -int32_t gguf_get_val_i32(const struct gguf_context * ctx, int i) { - return ctx->kv[i].value.int32; +int32_t gguf_get_val_i32(const struct gguf_context * ctx, int key_id) { + GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT32); + return ctx->kv[key_id].value.int32; } -float gguf_get_val_f32(const struct gguf_context * ctx, int i) { - return ctx->kv[i].value.float32; +float gguf_get_val_f32(const struct gguf_context * ctx, int key_id) { + GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_FLOAT32); + return ctx->kv[key_id].value.float32; } -uint64_t gguf_get_val_u64(const struct gguf_context * ctx, int i) { - return ctx->kv[i].value.uint64; +uint64_t gguf_get_val_u64(const struct gguf_context * ctx, int key_id) { + GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT64); + return ctx->kv[key_id].value.uint64; } -int64_t gguf_get_val_i64(const struct gguf_context * ctx, int i) { - return ctx->kv[i].value.int64; +int64_t gguf_get_val_i64(const struct gguf_context * ctx, int key_id) { + GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT64); + return ctx->kv[key_id].value.int64; } -double gguf_get_val_f64(const struct gguf_context * ctx, int i) { - return ctx->kv[i].value.float64; +double gguf_get_val_f64(const struct gguf_context * ctx, int key_id) { + GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_FLOAT64); + return ctx->kv[key_id].value.float64; } -bool gguf_get_val_bool(const struct gguf_context * ctx, int i) { - return ctx->kv[i].value.bool_; +bool gguf_get_val_bool(const struct gguf_context * ctx, int key_id) { + GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_BOOL); + return ctx->kv[key_id].value.bool_; } -const char * gguf_get_val_str (const struct gguf_context * ctx, int i) { - return ctx->kv[i].value.str.data; +const char * gguf_get_val_str(const struct gguf_context * ctx, int key_id) { + GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_STRING); + return ctx->kv[key_id].value.str.data; } int gguf_get_n_tensors(const struct gguf_context * ctx) { @@ -20583,10 +21337,10 @@ static void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf * } break; case GGUF_TYPE_ARRAY: case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break; - }; + } } break; case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); - }; + } } // write tensor infos diff --git a/ggml.h b/ggml.h index f45456876da621d5beafdc2419d9d575a9fb02ff..460857fa4cbd1a74eab4ad053a3d3bf0d69f4ea3 100644 --- a/ggml.h +++ b/ggml.h @@ -214,8 +214,8 @@ #define GGML_QNT_VERSION_FACTOR 1000 // do not change this #define GGML_MAX_DIMS 4 -#define GGML_MAX_NODES 4096 -#define GGML_MAX_PARAMS 256 +#define GGML_MAX_NODES 16384 +#define GGML_MAX_PARAMS 1024 #define GGML_MAX_CONTEXTS 64 #define GGML_MAX_SRC 6 #define GGML_MAX_NAME 64 @@ -248,6 +248,14 @@ } \ } while (0) +#ifndef NDEBUG +#define GGML_UNREACHABLE() GGML_ASSERT(!"statement should not be reached") +#elif defined(__GNUC__) +#define GGML_UNREACHABLE() __builtin_unreachable() +#else +#define GGML_UNREACHABLE() ((void) 0) +#endif + // used to copy the number of elements and stride in bytes of tensors into local variables. // main purpose is to reduce code duplication and improve readability. // @@ -445,6 +453,12 @@ extern "C" { GGML_OBJECT_WORK_BUFFER }; + enum ggml_log_level { + GGML_LOG_LEVEL_ERROR = 2, + GGML_LOG_LEVEL_WARN = 3, + GGML_LOG_LEVEL_INFO = 4 + }; + // ggml object struct ggml_object { size_t offs; @@ -467,8 +481,8 @@ extern "C" { int n_dims; int64_t ne[GGML_MAX_DIMS]; // number of elements size_t nb[GGML_MAX_DIMS]; // stride in bytes: - // nb[0] = sizeof(type) - // nb[1] = nb[0] * ne[0] + padding + // nb[0] = ggml_type_size(type) + // nb[1] = nb[0] * (ne[0] / ggml_blck_size(type)) + padding // nb[i] = nb[i-1] * ne[i-1] // compute data @@ -520,7 +534,15 @@ extern "C" { // next prime after GGML_MAX_NODES // #define GGML_GRAPH_HASHTABLE_SIZE 4099 // next prime after GGML_MAX_NODES * 2 (nodes + leafs) - #define GGML_GRAPH_HASHTABLE_SIZE 8273 + // #define GGML_GRAPH_HASHTABLE_SIZE 8273 + // #define GGML_GRAPH_HASHTABLE_SIZE 16411 + #define GGML_GRAPH_HASHTABLE_SIZE 32771 + + enum ggml_cgraph_eval_order { + GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0, + GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT, + GGML_CGRAPH_EVAL_ORDER_COUNT + }; // computation graph struct ggml_cgraph { @@ -533,6 +555,8 @@ extern "C" { void * visited_hash_table[GGML_GRAPH_HASHTABLE_SIZE]; + enum ggml_cgraph_eval_order order; + // performance int perf_runs; int64_t perf_cycles; @@ -680,12 +704,21 @@ extern "C" { GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value); GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value); + // Converts a flat index into coordinates + GGML_API void ggml_unravel_index(const struct ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3); + GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i); GGML_API void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value); + GGML_API int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3); + GGML_API void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value); + GGML_API float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i); GGML_API void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value); + GGML_API float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3); + GGML_API void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value); + GGML_API void * ggml_get_data (const struct ggml_tensor * tensor); GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor); @@ -719,6 +752,12 @@ extern "C" { struct ggml_tensor * a, struct ggml_tensor * b); + GGML_API struct ggml_tensor * ggml_add_cast( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + enum ggml_type type); + GGML_API struct ggml_tensor * ggml_add1( struct ggml_context * ctx, struct ggml_tensor * a, @@ -828,6 +867,7 @@ extern "C" { struct ggml_tensor * a, struct ggml_tensor * b); + // sums repetitions in a into shape of b GGML_API struct ggml_tensor * ggml_repeat_back( struct ggml_context * ctx, struct ggml_tensor * a, @@ -1049,7 +1089,6 @@ extern "C" { size_t nb1, size_t offset); - // a -> b, return view(b) GGML_API struct ggml_tensor * ggml_cpy( struct ggml_context * ctx, @@ -1072,6 +1111,33 @@ extern "C" { struct ggml_context * ctx, struct ggml_tensor * a); + // make contiguous, with new shape + GGML_API struct ggml_tensor * ggml_cont_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0); + + GGML_API struct ggml_tensor * ggml_cont_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1); + + GGML_API struct ggml_tensor * ggml_cont_3d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2); + + GGML_API struct ggml_tensor * ggml_cont_4d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2, + int64_t ne3); + // return view(a), b specifies the new shape // TODO: when we start computing gradient, make a copy instead of view GGML_API struct ggml_tensor * ggml_reshape( @@ -1219,14 +1285,15 @@ extern "C" { struct ggml_tensor * b); // rotary position embedding - // if mode & 1 == 1, skip n_past elements + // if mode & 1 == 1, skip n_past elements (DEPRECATED) // if mode & 2 == 1, GPT-NeoX style // if mode & 4 == 1, ChatGLM style - // TODO: avoid creating a new tensor every time + // + // b is an int32 vector with size a->ne[2], it contains the positions GGML_API struct ggml_tensor * ggml_rope( struct ggml_context * ctx, struct ggml_tensor * a, - int n_past, + struct ggml_tensor * b, int n_dims, int mode, int n_ctx); @@ -1235,7 +1302,7 @@ extern "C" { GGML_API struct ggml_tensor * ggml_rope_inplace( struct ggml_context * ctx, struct ggml_tensor * a, - int n_past, + struct ggml_tensor * b, int n_dims, int mode, int n_ctx); @@ -1244,7 +1311,7 @@ extern "C" { GGML_API struct ggml_tensor * ggml_rope_custom( struct ggml_context * ctx, struct ggml_tensor * a, - int n_past, + struct ggml_tensor * b, int n_dims, int mode, int n_ctx, @@ -1255,7 +1322,7 @@ extern "C" { GGML_API struct ggml_tensor * ggml_rope_custom_inplace( struct ggml_context * ctx, struct ggml_tensor * a, - int n_past, + struct ggml_tensor * b, int n_dims, int mode, int n_ctx, @@ -1266,7 +1333,7 @@ extern "C" { GGML_API struct ggml_tensor * ggml_rope_xpos_inplace( struct ggml_context * ctx, struct ggml_tensor * a, - int n_past, + struct ggml_tensor * b, int n_dims, float base, bool down); @@ -1276,7 +1343,7 @@ extern "C" { GGML_API struct ggml_tensor * ggml_rope_back( struct ggml_context * ctx, struct ggml_tensor * a, - int n_past, + struct ggml_tensor * b, int n_dims, int mode, int n_ctx, @@ -1656,6 +1723,16 @@ extern "C" { // dump the graph into a file using the dot format GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename); + // build gradient checkpointing backward graph gb for gf using provided checkpoints + // gb_tmp will contain original backward graph with rewritten backward process nodes, + // but without the second forward pass nodes. + GGML_API void ggml_build_backward_gradient_checkpointing( + struct ggml_context * ctx, + struct ggml_cgraph * gf, + struct ggml_cgraph * gb, + struct ggml_cgraph * gb_tmp, + struct ggml_tensor * * checkpoints, + int n_checkpoints); // // optimization // @@ -1690,7 +1767,8 @@ extern "C" { GGML_LINESEARCH_INVALID_PARAMETERS, }; - typedef void (*ggml_opt_callback)(void * data, float * sched); + typedef void (*ggml_opt_callback)(void * data, int accum_step, float * sched, bool * cancel); + typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data); // optimization parameters // @@ -1721,6 +1799,8 @@ extern "C" { bool print_forward_graph; bool print_backward_graph; + int n_gradient_accumulation; + // ADAM parameters struct { int n_iter; @@ -1766,6 +1846,7 @@ extern "C" { float loss_after; struct { + struct ggml_tensor * g; // current gradient struct ggml_tensor * m; // first moment struct ggml_tensor * v; // second moment struct ggml_tensor * pf; // past function values @@ -1882,26 +1963,26 @@ extern "C" { GGML_API int gguf_get_n_kv(const struct gguf_context * ctx); GGML_API int gguf_find_key(const struct gguf_context * ctx, const char * key); - GGML_API const char * gguf_get_key (const struct gguf_context * ctx, int i); - - GGML_API enum gguf_type gguf_get_kv_type (const struct gguf_context * ctx, int i); - GGML_API enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int i); - - // results are undefined if the wrong type is used for the key - GGML_API uint8_t gguf_get_val_u8 (const struct gguf_context * ctx, int i); - GGML_API int8_t gguf_get_val_i8 (const struct gguf_context * ctx, int i); - GGML_API uint16_t gguf_get_val_u16 (const struct gguf_context * ctx, int i); - GGML_API int16_t gguf_get_val_i16 (const struct gguf_context * ctx, int i); - GGML_API uint32_t gguf_get_val_u32 (const struct gguf_context * ctx, int i); - GGML_API int32_t gguf_get_val_i32 (const struct gguf_context * ctx, int i); - GGML_API float gguf_get_val_f32 (const struct gguf_context * ctx, int i); - GGML_API uint64_t gguf_get_val_u64 (const struct gguf_context * ctx, int i); - GGML_API int64_t gguf_get_val_i64 (const struct gguf_context * ctx, int i); - GGML_API double gguf_get_val_f64 (const struct gguf_context * ctx, int i); - GGML_API bool gguf_get_val_bool(const struct gguf_context * ctx, int i); - GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int i); - GGML_API int gguf_get_arr_n (const struct gguf_context * ctx, int i); - GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int i); + GGML_API const char * gguf_get_key (const struct gguf_context * ctx, int key_id); + + GGML_API enum gguf_type gguf_get_kv_type (const struct gguf_context * ctx, int key_id); + GGML_API enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int key_id); + + // will abort if the wrong type is used for the key + GGML_API uint8_t gguf_get_val_u8 (const struct gguf_context * ctx, int key_id); + GGML_API int8_t gguf_get_val_i8 (const struct gguf_context * ctx, int key_id); + GGML_API uint16_t gguf_get_val_u16 (const struct gguf_context * ctx, int key_id); + GGML_API int16_t gguf_get_val_i16 (const struct gguf_context * ctx, int key_id); + GGML_API uint32_t gguf_get_val_u32 (const struct gguf_context * ctx, int key_id); + GGML_API int32_t gguf_get_val_i32 (const struct gguf_context * ctx, int key_id); + GGML_API float gguf_get_val_f32 (const struct gguf_context * ctx, int key_id); + GGML_API uint64_t gguf_get_val_u64 (const struct gguf_context * ctx, int key_id); + GGML_API int64_t gguf_get_val_i64 (const struct gguf_context * ctx, int key_id); + GGML_API double gguf_get_val_f64 (const struct gguf_context * ctx, int key_id); + GGML_API bool gguf_get_val_bool(const struct gguf_context * ctx, int key_id); + GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int key_id); + GGML_API int gguf_get_arr_n (const struct gguf_context * ctx, int key_id); + GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id); GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i); GGML_API int gguf_get_n_tensors (const struct gguf_context * ctx); diff --git a/gguf-py/gguf/gguf.py b/gguf-py/gguf/gguf.py index e0e0dbcbbe840d00f888002245e0e2ee9f71ca35..598cf8e594aa8a460e6cc52cb968e0117651d9fe 100644 --- a/gguf-py/gguf/gguf.py +++ b/gguf-py/gguf/gguf.py @@ -32,7 +32,7 @@ KEY_GENERAL_URL = "general.url" KEY_GENERAL_DESCRIPTION = "general.description" KEY_GENERAL_LICENSE = "general.license" KEY_GENERAL_SOURCE_URL = "general.source.url" -KEY_GENERAL_SOURCE_HF_REPO = "general.source.hugginface.repository" +KEY_GENERAL_SOURCE_HF_REPO = "general.source.huggingface.repository" KEY_GENERAL_FILE_TYPE = "general.file_type" # LLM diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp index fb0f205f85691cd47878432c11f6e28825038b4a..39820712fe97753b682eb23c7965a979e26db45d 100644 --- a/gpttype_adapter.cpp +++ b/gpttype_adapter.cpp @@ -44,6 +44,7 @@ std::vector generated_tokens; llama_grammar * grammar = nullptr; //currently used grammar grammar_parser::parse_state parsed_grammar; +static std::string current_grammar = ""; //return val: 0=fail, 1=(original ggml, alpaca), 2=(ggmf), 3=(ggjt) static FileFormat file_format = FileFormat::BADFORMAT; @@ -550,7 +551,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in file_format = in_file_format; n_threads = params.n_threads = inputs.threads; - n_blasthreads = inputs.blasthreads; + n_blasthreads = params.n_threads_batch = inputs.blasthreads; n_batch = params.n_batch = inputs.batch_size; modelname = params.model = inputs.model_filename; useSmartContext = inputs.use_smartcontext; @@ -562,7 +563,17 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in blasbatchsize = 8; } params.memory_f16 = inputs.f16_kv; - params.n_ctx = inputs.max_context_length; + + auto clamped_max_context_length = inputs.max_context_length; + + if(clamped_max_context_length>16384 && + file_format != FileFormat::GGUF_LLAMA && file_format!=FileFormat::GGUF_FALCON) + { + printf("Warning: Only GGUF models can use max context above 16k. Max context lowered to 16k.\n"); + clamped_max_context_length = 16384; + } + + params.n_ctx = clamped_max_context_length; neox_ctx_v2.hparams.n_ctx = neox_ctx_v3.hparams.n_ctx = gptj_ctx_v1.hparams.n_ctx = gptj_ctx_v2.hparams.n_ctx = gptj_ctx_v3.hparams.n_ctx @@ -594,7 +605,8 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in float factor = file_format_meta.n_ctx_train/2048; effectivenctx = effectivenctx/factor; } - rope_freq_base = (effectivenctx <= 2048 ? 10000.0f : (effectivenctx <= 3072 ? 26000.0f : (effectivenctx <= 4096 ? 32000.0f : (effectivenctx <= 6144 ? 54000.0f : (effectivenctx <= 8192 ? 82684.0f : (effectivenctx <= 12288 ? 140000.0f : 200000.0f)))))); + rope_freq_base = (effectivenctx <= 2048 ? 10000.0f : (effectivenctx <= 3072 ? 26000.0f : (effectivenctx <= 4096 ? 32000.0f : (effectivenctx <= 6144 ? 54000.0f : + (effectivenctx <= 8192 ? 82684.0f : (effectivenctx <= 12288 ? 140000.0f : (effectivenctx <= 16384 ? 200000.0f : (effectivenctx <= 24576 ? 320000.0f : 440000.0f)))))))); } @@ -633,7 +645,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in //newer format has bit unshuffling SetQuantsUnshuffled(file_format == FileFormat::GGJT_2); llama_v2_context_params llama_ctx_params_v2 = llama_v2_context_default_params(); - llama_ctx_params_v2.n_ctx = inputs.max_context_length; + llama_ctx_params_v2.n_ctx = clamped_max_context_length; //llama_ctx_params.n_parts = -1; llama_ctx_params_v2.seed = -1; llama_ctx_params_v2.f16_kv = inputs.f16_kv; @@ -683,7 +695,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in else if(file_format == FileFormat::GGJT_3) { llama_v3_context_params llama_ctx_params = llama_v3_context_default_params(); - llama_ctx_params.n_ctx = inputs.max_context_length; + llama_ctx_params.n_ctx = clamped_max_context_length; //llama_ctx_paran_parts = -1; llama_ctx_params.seed = -1; llama_ctx_params.f16_kv = inputs.f16_kv; @@ -753,25 +765,26 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in } else if(file_format==FileFormat::GGUF_LLAMA || file_format==FileFormat::GGUF_FALCON) { + llama_model_params model_params = llama_model_default_params(); llama_context_params llama_ctx_params = llama_context_default_params(); - llama_ctx_params.n_ctx = inputs.max_context_length; + llama_ctx_params.n_ctx = clamped_max_context_length; //llama_ctx_paran_parts = -1; llama_ctx_params.seed = -1; llama_ctx_params.f16_kv = inputs.f16_kv; - llama_ctx_params.low_vram = inputs.low_vram; + //llama_ctx_params.low_vram = inputs.low_vram; llama_ctx_params.mul_mat_q = inputs.use_mmq; llama_ctx_params.logits_all = false; - llama_ctx_params.use_mmap = inputs.use_mmap; - llama_ctx_params.use_mlock = inputs.use_mlock; - llama_ctx_params.n_gpu_layers = inputs.gpulayers; + model_params.use_mmap = inputs.use_mmap; + model_params.use_mlock = inputs.use_mlock; + model_params.n_gpu_layers = inputs.gpulayers; #if defined(GGML_USE_CLBLAST) - if(file_format==FileFormat::GGUF_FALCON && llama_ctx_params.n_gpu_layers>0) + if(file_format==FileFormat::GGUF_FALCON && model_params.n_gpu_layers>0) { printf("\nGPU layer offload for GGUF FALCON on OpenCL is known to have issues, it has been set to 0.\n"); - llama_ctx_params.n_gpu_layers = 0; + model_params.n_gpu_layers = 0; } #endif - llama_ctx_params.main_gpu = cu_parseinfo_maindevice; + model_params.main_gpu = cu_parseinfo_maindevice; llama_ctx_params.rope_freq_base = rope_freq_base; llama_ctx_params.rope_freq_scale = rope_freq_scale; llama_ctx_params.n_batch = blasbatchsize; @@ -786,11 +799,12 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in } if(!ts_all_zero) { - llama_ctx_params.tensor_split = inputs.tensor_split; + model_params.tensor_split = inputs.tensor_split; } #endif - llama_ctx_v4 = llama_init_from_file(modelname.c_str(), llama_ctx_params); + llama_model * llamamodel = llama_load_model_from_file(modelname.c_str(), model_params); + llama_ctx_v4 = llama_new_context_with_model(llamamodel, llama_ctx_params); if (llama_ctx_v4 == NULL) { @@ -809,6 +823,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in int err = llama_apply_lora_from_file(llama_ctx_v4, lora_filename.c_str(), + 1.0f, lora_base_arg, n_threads); if (err != 0) @@ -818,11 +833,11 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in } } - n_vocab = llama_n_vocab(llama_ctx_v4); + n_vocab = llama_n_vocab(llamamodel); //determine mem per token - const std::vector tmp = {1, 2, 3, 4}; - auto er = llama_eval(llama_ctx_v4, tmp.data(), tmp.size(), 0, params.n_threads); + std::vector tmp = {1, 2, 3, 4}; + auto er = llama_eval(llama_ctx_v4, tmp.data(), tmp.size(), 0); if(er!=0) { printf("\nLLAMA EVAL returned nonzero!\n"); @@ -1261,13 +1276,27 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o params.n_ctx = inputs.max_context_length; params.n_batch = n_batch; params.n_threads = n_threads; + params.n_threads_batch = n_blasthreads; bool stream_sse = inputs.stream_sse; generation_finished = false; // Set current generation status generated_tokens.clear(); // New Generation, new tokens std::string grammarstr = inputs.grammar; - load_grammar(grammarstr); + bool grammar_retain_state = inputs.grammar_retain_state; + if(grammar_retain_state) + { + if(grammarstr=="" || current_grammar!=grammarstr) //if grammar is identical, retain state + { + load_grammar(grammarstr); + } + } + else + { + load_grammar(grammarstr); + } + current_grammar = grammarstr; + if (params.repeat_last_n < 1) { @@ -1337,10 +1366,12 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o if(!ggml_cpu_has_gpublas()) { params.n_threads = 1; //do not limit here anymore. + params.n_threads_batch = 1; } else { params.n_threads = n_blasthreads; + params.n_threads_batch = n_blasthreads; } } @@ -1454,7 +1485,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o ::utreplace(tmp, "\n", "\\n"); outstr += tmp; - outstr += "\n\n[Debug: Context Size = " + std::to_string(current_context_tokens.size()) + "]\n"; + outstr += "\n\n[Debug: n_past="+std::to_string(n_past)+" Context Size = " + std::to_string(current_context_tokens.size()) + "]\n"; tmp = ""; for (auto id : current_context_tokens) { @@ -1492,7 +1523,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o } else if(file_format == FileFormat::GGUF_LLAMA || file_format==FileFormat::GGUF_FALCON) { - evalres = (llama_eval(llama_ctx_v4, embd.data(), embdsize, n_past, params.n_threads)==0); + evalres = (llama_eval(llama_ctx_v4, embd.data(), embdsize, n_past)==0); } else if(file_format==FileFormat::RWKV_1 || file_format==FileFormat::RWKV_2) { diff --git a/klite.embd b/klite.embd index 9a4a31f6aa79215343e76a58b6cde84be7765eea..ccfedd89ac83b207c62978eb35a2705232e07107 100644 --- a/klite.embd +++ b/klite.embd @@ -6,7 +6,7 @@ It requires no dependencies, installation or setup. Just copy this single static HTML file anywhere and open it in a browser, or from a webserver. Please go to https://github.com/LostRuins/lite.koboldai.net for updates on Kobold Lite. Kobold Lite is under the AGPL v3.0 License for the purposes of koboldcpp. Please do not remove this line. -Current version: 68 +Current version: 74 -Concedo --> @@ -194,6 +194,9 @@ Current version: 68 padding: 10px; overflow-y: auto; } + .txtchunk, ai_context_koboldlite_internal, #gametext, .chat_received_withd_msg p,.chat_sent_msg p { + white-space: pre-wrap; + } #actionmenu { margin-top: 6px; @@ -2107,7 +2110,7 @@ Current version: 68 "prefmodel1":instructmodels1, "prefmodel2":instructmodels2, "prompt":"Welcome to your InteracTV, your interactive TV of the future today!\nPlease enter what you would like to watch:", - "memory": instructstartplaceholder+"\nSimulate an interactive TV that will let the user watch anything they want to watch.\n\nFirst, generate a single response prompting the user for input on what they wish to watch using the following response:\n```\nPlease enter your desired content:\n```\n\nAfter the user has entered the desired content generate the following table:\n- TV Show / Movie Name: Name of the show\n- Genre: Genre of the show\n- Program Description: Description of what the program is about, this can be any known or unkown TV or movie format.\n- Episode Name: Name of the episode\n- Episode Description: Description of what the episode is about.\n\nAfter generating this table promp the user if they wish to watch the episode with the following response and then end your generation:\n```\nDo you wish to watch this episode? (Y/N/Menu)\n"+instructstartplaceholder+"```\n\nIf the user chooses not to watch the episode generate a new episode with their requested content.\nIf the user chooses to go to the Menu ask them again what they would like to watch.\n\nIf the user chooses to watch the episode begin generating a long multiple paragraph detailed story based on the episode description, make it exciting and fun.\n\nEnd your response after each question presented to the user so that the user has a chance to respond.\n\nMain menu:\n```\nMenu Options\nA) Input a different content request\nB) Generate a different episode of the same content.\n"+instructstartplaceholder+"```\n"+instructendplaceholder, + "memory": instructstartplaceholder+"\nSimulate an interactive TV that will let the user watch anything they want to watch.\n\nFirst, generate a single response prompting the user for input on what they wish to watch using the following response:\n```\nPlease enter your desired content:\n```\n\nAfter the user has entered the desired content generate the following table:\n- TV Show / Movie Name: Name of the show\n- Genre: Genre of the show\n- Program Description: Description of what the program is about, this can be any known or unkown TV or movie format.\n- Episode Name: Name of the episode\n- Episode Description: Description of what the episode is about.\n\nAfter generating this table promp the user if they wish to watch the episode with the following response and then end your generation:\n```\nDo you wish to watch this episode? (Y/N/Menu)\n"+instructstartplaceholder+"```\nIf the user chooses to watch the episode begin generating a long detailed text based on the episode description containing character dialogue, make it exciting and fun written in the style of a book.\nThe text must contain dialogue in a he said she said format and is as lengthy as a book.\n\nIf the user chooses not to watch the episode generate a new episode with their requested content.\nIf the user chooses to go to the Menu ask them again what they would like to watch.\n\nEnd your response after each question presented to the user so that the user has a chance to respond.\n\nMain menu:\n```\nMenu Options\nA) Input a different content request\nB) Generate a different episode of the same content.\n"+instructstartplaceholder+"```\n"+instructendplaceholder, "authorsnote": "", "worldinfo": [] }, @@ -2170,6 +2173,21 @@ Current version: 68 "memory":`[Character: Erica; species: Human; age: 22; gender: female; job: unemployed, NEET; physical appearance: unkempt, tired; personality: insecure, extremely shy, anxious, lovesick, slightly depressed, awkward, easily embarrassed; likes: fantasy, reading trashy romance, browsing internet, being indoors; description: Erica is a socially awkward NEET girl who spends most of her time in front of the computer. She's a good person at heart, but she's very shy, anxious, and terrible at conversations.]\n[The following is a chat message log between Erica and you.]\nErica: *mumbles to herself, fidgeting nervously*...\n`, "authorsnote": "", "worldinfo": [] + }, + { + "title":"Nail the Kobold", + "author":"Concedo / TheGantian", + "desc":"Nail is a small red kobold on a big mission to find a powerful sorceror.", + "opmode":3, + "chatname": "You", + "chatopponent": "Nail", + "gui_type":1, + "prefmodel1":chatmodels1, + "prefmodel2":chatmodels2, + "prompt":`\nNail: *A small kobold dressed in a ragged cloak approaches you. She has a strange curved blade that seems too large for her hands.* "Excuse me, friend. My name is Nail. I have come a long way, looking for someone important... a powerful sorcerer named Rath Cinderstorm. Have you heard of him in your travels?"`, + "memory":`[Character: Nail; species: Redscale Kobold; age: 20; gender: female; class: Hexblade Warlock with powers derived from draconic patron; physical appearance: 3' in height, 35 lbs, purple eyes, pink scales and peachy chest; equipment: Dragon's talon affixed to a handle as a blade; personality: lawful neutral; description: Nail (called Nannan in her native tongue) is a refugee of the once-proud Xabrakkar kobolds on the continent of Halkar. Founded above a series of geothermal caves, her tribe prospered as they dug into long-buried ruins for priceless treasures, which they brought to the surface. Amongst the ruins, Nail discovered the slumbering red dragon Rhindicar - once the familiar to one of the most powerful sorcerers to ever live. The sleeping dragon quickly became an object of worship for the Xabrakkar kobolds. However, the Trobian relics they unearthed attracted the attention of another - Hilezmaras, the mad tyrant, a covetous dragon who laid claim to the kobolds treasures, sending his fanatical dragonborn cult to purge their warren. While most of the kobolds were slain, a select few were dragon-marked, forcibly given a magic brand linking them to the mad dragon in order to turn them into powerful and obedient soldiers. Nail broke free of her captors after being given such a mark, fleeing into the tunnels leading to the Tinder Depths, eventually collapsing before Rhindicar and waking him from his slumber. Being raised from a hatchling by a kind and just master, Rhindicar was uncharacteristically compassionate for a dragon, and took pity on the young kobold. Though he was not powerful enough to remove Hilezmaras' brand, he was able to suppress its magical compulsion, allowing her to retain her free-will. He warned, though, that as the dragon-mark grew in power and became more strongly linked to the mad tyrant, he would no longer be able to keep it suppressed, and urged Nannan to seek out his former master, Rath Cinderstorm. Biting off a fragment of one of his talons, he gifted it to the kobold, both as a weapon, and as a conduit to help him suppress the effects of the brand. With no other options, Nannan returned to the warren and fought her way to the surface, eventually escaping Halkar and crossing the ocean to Fanne'Tar, where she assumed the alias 'Nail' in Common tongue and began her search for a long-missing sorcerer.]\n[The following is a chat message log between Nail and you.]\n`, + "authorsnote": "", + "worldinfo": [] } ]; @@ -2239,10 +2257,22 @@ Current version: 68 } return str.replace(new RegExp(escapeRegExp(find), (caseInsensitive?'gi':'g')), replace); + } - + function rgbToHex(rgbColor) { + rgbColor = rgbColor.split("(")[1]; + rgbColor = rgbColor.split(")")[0]; + const components = rgbColor.split(','); + const red = parseInt(components[0]); + const green = parseInt(components[1]); + const blue = parseInt(components[2]); + const redHex = red.toString(16).padStart(2, '0'); + const greenHex = green.toString(16).padStart(2, '0'); + const blueHex = blue.toString(16).padStart(2, '0'); + return `#${redHex}${greenHex}${blueHex}`; } + function GetUniqueColor(idx) { switch(idx) @@ -2837,6 +2867,7 @@ Current version: 68 const default_oai_base = "https://api.openai.com"; const default_claude_base = "https://api.anthropic.com"; + const default_palm_base = "https://generativelanguage.googleapis.com/v1beta2/models/text-bison-001:generateText?key="; //support for quick news updates const news_endpoint = "https://news.concedo.workers.dev" @@ -2864,8 +2895,10 @@ Current version: 68 var pending_response_horde = text_hordes[0]; //the url to poll for pending response from a v2 submit var poll_in_progress = false; //are we currently waiting for a text generation var poll_ticks_passed = 0; //how much time passed after polling + var horde_poll_nearly_completed = false; //if true, increase polling rate var prev_hl_chunk = null; //will store the last highlighted element var pending_context_preinjection = ""; //this will be injected before the AI's next RESPONSE + var last_reply_was_empty = false; //set to true if last reply is empty var current_memory = ""; //stored memory var current_anote = ""; //stored author note var current_anotetemplate = "[Author\'s note: <|>]"; @@ -2885,6 +2918,7 @@ Current version: 68 var custom_oai_key = ""; //if set, uses the OpenAI API to generate var custom_oai_model = ""; var custom_scale_key = ""; + var custom_palm_key = ""; var custom_scale_ID = ""; var custom_claude_endpoint = default_claude_base; var custom_claude_key = ""; @@ -2903,6 +2937,7 @@ Current version: 68 var kobold_endpoint_version = ""; //used to track problematic versions to avoid sending extra fields var koboldcpp_version = ""; //detect if we are using koboldcpp var last_request_str = "No Requests Available"; //full context of last submitted request + var lastcheckgenkey = ""; //for checking polled-streaming unique id when generating in kcpp var localsettings = { my_api_key: "0000000000", //put here so it can be saved and loaded in persistent mode @@ -2912,11 +2947,12 @@ Current version: 68 saved_claude_key: "", //do not ever share this in save files! saved_claude_addr: "", //do not ever share this in save files! saved_oai_jailbreak: "", //customized oai system prompt + saved_palm_key: "", autoscroll: true, //automatically scroll to bottom on render trimsentences: true, //trim to last punctuation trimwhitespace: false, //trim trailing whitespace - unban_tokens: false, //allow the EOS token when using locally + eos_ban_mode: 0, //allow the EOS token when using locally 0=auto,1=unban,2=ban opmode: 1, //what mode are we in? 1=story, 2=adventure, 3=chat, 4=instruct adventure_is_action: false, //in adventure mode, determine story or action adventure_context_mod: true, //extra injection for adventure mode @@ -2925,12 +2961,13 @@ Current version: 68 instruct_starttag: "\\n### Instruction:\\n", instruct_endtag: "\\n### Response:\\n", instruct_has_markdown: false, - raw_instruct_tags: false, + placeholder_tags: true, persist_session: true, speech_synth: 0, //0 is disabled beep_on: false, image_styles: "", grammar:"", + tokenstreaming: (localflag?true:false), generate_images: (localflag?"":"stable_diffusion"), //"" is disabled and "*" is all, anything else is the model name pulled from stable horde img_autogen: false, img_allownsfw: true, @@ -2940,12 +2977,13 @@ Current version: 68 last_selected_preset: 0, gui_type_chat: 1, //0=standard, 1=messenger, 2=aesthetic gui_type_instruct: 0, //0=standard, 1=messenger, 2=aesthetic - multiline_replies: false, + multiline_replies: true, allow_continue_chat: false, idle_responses: 0, idle_duration: 60, export_settings: true, //affects if settings are included with the story and sharelinks invert_colors: false, + passed_ai_warning: false, //used to store AI safety panel acknowledgement state max_context_length: 1024, max_length: 100, @@ -3069,12 +3107,6 @@ Current version: 68 } } - const tokenstreaming = urlParams.get('streaming'); - if(tokenstreaming) - { - document.getElementById("tokenstreaming").checked = true; - } - const fromfile = ( window.location.protocol == 'file:' ); if(!dbgmode && !fromfile){ if(!window.console) window.console = {}; @@ -3109,6 +3141,12 @@ Current version: 68 console.log("Discarded invalid local save: " + e); } + const tokenstreaming = urlParams.get('streaming'); + if(tokenstreaming) + { + localsettings.tokenstreaming = true; + } + //toggle genimg btn if (localsettings.generate_images) { document.getElementById("btn_genimg").classList.remove("hidden"); @@ -3428,7 +3466,7 @@ Current version: 68 function is_using_custom_ep() { - return (custom_oai_key!=""||custom_kobold_endpoint!=""||custom_scale_key!=""||custom_claude_key!=""); + return (custom_oai_key!=""||custom_kobold_endpoint!=""||custom_scale_key!=""||custom_claude_key!=""||custom_palm_key!=""); } function is_using_kcpp_with_streaming() @@ -3447,7 +3485,7 @@ Current version: 68 //0 is none, 1 is pseudostreaming, 2 is true streaming function determine_streaming_type() { - let streamtype = (document.getElementById("tokenstreaming").checked ? 1 : 0); + let streamtype = (localsettings.tokenstreaming ? 1 : 0); let pstreamamount = urlParams.get('streamamount'); if(streamtype==1 && is_using_kcpp_with_streaming() && (pstreamamount == null || pstreamamount <= 0)) { @@ -3461,6 +3499,27 @@ Current version: 68 return streamtype; } + function determine_if_ban_eos(input_was_empty) { + if (localsettings.eos_ban_mode == 0) { + if (localsettings.opmode == 1) { + return true; //story mode always ban + } + else if (localsettings.opmode == 3 && !localsettings.allow_continue_chat) { + return false; //chat mode always unban unless cont allowed + } + else if (!input_was_empty) //if user input is not empty, ALWAYS unban EOS. + { + return false; + } + else { + return last_reply_was_empty; + } + } + else { + return (localsettings.eos_ban_mode == 2 ? true : false); + } + } + function is_using_web_lite() { return (window.location.hostname.includes("koboldai.net") || window.location.hostname.includes("kaihordewebui.github.io")); @@ -3641,6 +3700,7 @@ Current version: 68 let tmp_oai2 = localsettings.saved_oai_addr; let tmp_claude1 = localsettings.saved_claude_key; let tmp_claude2 = localsettings.saved_claude_addr; + let tmp_palm1 = localsettings.saved_palm_key; import_props_into_object(localsettings, story.savedsettings); localsettings.my_api_key = tmpapikey1; localsettings.home_cluster = tmphc; @@ -3648,6 +3708,7 @@ Current version: 68 localsettings.saved_oai_addr = tmp_oai2; localsettings.saved_claude_key = tmp_claude1; localsettings.saved_claude_addr = tmp_claude2; + localsettings.saved_palm_key = tmp_palm1; } if (story.savedaestheticsettings && story.savedaestheticsettings != "") { @@ -3956,6 +4017,7 @@ Current version: 68 let tmp_oai2 = localsettings.saved_oai_addr; let tmp_claude1 = localsettings.saved_claude_key; let tmp_claude2 = localsettings.saved_claude_addr; + let tmp_palm1 = localsettings.saved_palm_key; import_props_into_object(localsettings, loaded_storyobj.savedsettings); localsettings.my_api_key = tmpapikey1; localsettings.home_cluster = tmphc; @@ -3963,6 +4025,7 @@ Current version: 68 localsettings.saved_oai_addr = tmp_oai2; localsettings.saved_claude_key = tmp_claude1; localsettings.saved_claude_addr = tmp_claude2; + localsettings.saved_palm_key = tmp_palm1; //backwards compat support for newlines if(localsettings.instruct_has_newlines==true || (loaded_storyobj.savedsettings != null && loaded_storyobj.savedsettings.instruct_has_newlines==null&&loaded_storyobj.savedsettings.instruct_has_markdown==null)) @@ -4027,6 +4090,25 @@ Current version: 68 render_gametext(); } + function load_agnai_wi(obj,chatopponent,myname) + { + console.log("Append Agnai WI"); + current_wi = []; + for (let key in obj.entries) { + var itm = obj.entries[key]; + var karr = itm.keywords; + let nwi = { + "key": karr.join(","), + "keysecondary": "", + "content": itm.entry, + "comment": "", + "folder": null, + "selective": false, + "constant": false + }; + current_wi.push(nwi); + } + } function load_tavern_wi(obj,chatopponent,myname) { console.log("Append Tavern WI"); @@ -4045,7 +4127,7 @@ Current version: 68 } let nwi = { "key": karr.join(","), - "keysecondary": (ksarr.length > 0 ? ksarr.join(",") : ""), + "keysecondary": ((ksarr && ksarr.length) > 0 ? ksarr.join(",") : ""), "content": itm.content, "comment": itm.comment, "folder": null, @@ -4080,11 +4162,16 @@ Current version: 68 examplemsg = "\n"+examplemsg; } let combinedmem = memory + scenario + examplemsg; + let agnaidatafieldsempty = scenario + examplemsg + (obj.personality?obj.personality:"") + greeting; //check if it's a world info only card, if so, do not restart game if(combinedmem.trim()=="" && greeting=="" && obj.entries) { load_tavern_wi(obj,chatopponent,myname); } + else if(agnaidatafieldsempty.trim()=="" && obj.entries && obj.kind=="memory") + { + load_agnai_wi(obj,chatopponent,myname); + } else { restart_new_game(); @@ -4099,6 +4186,10 @@ Current version: 68 { load_tavern_wi(obj.character_book,chatopponent,myname); } + else if(obj.entries && obj.entries.length>0) + { + load_agnai_wi(obj,chatopponent,myname); + } } render_gametext(); } @@ -4229,8 +4320,8 @@ Current version: 68 let cdef = data.definition?data.definition.replace("END_OF_DIALOG","").trim():""; let cdesc = data.description?data.description:""; let greeting = data.greeting?data.greeting:""; - let previewtxt = replaceAll(cdesc,"{{char}}",botname); - previewtxt = replaceAll(previewtxt,"{{user}}","You"); + let previewtxt = replaceAll(cdesc,"{{char}}",botname,true); + previewtxt = replaceAll(previewtxt,"{{user}}","You",true); temp_scenario = { "title":data.title?data.title:"", @@ -4285,13 +4376,26 @@ Current version: 68 //load contexts gametext_arr = []; if (temp_scenario.prompt != "") { - gametext_arr.push(temp_scenario.prompt); + let prompttxt = temp_scenario.prompt; + if(!localsettings.placeholder_tags) //do a one-time replace instead + { + prompttxt = replace_placeholders_direct(prompttxt); + } + gametext_arr.push(prompttxt); } if (temp_scenario.authorsnote != "") { current_anote = temp_scenario.authorsnote; + if(!localsettings.placeholder_tags) + { + current_anote = replace_placeholders_direct(current_anote); + } } if (temp_scenario.memory != "") { current_memory = temp_scenario.memory; + if(!localsettings.placeholder_tags) + { + current_memory = replace_placeholders_direct(current_memory); + } } if (temp_scenario.worldinfo && temp_scenario.worldinfo.length > 0) { current_wi = []; @@ -4357,8 +4461,6 @@ Current version: 68 if (temp_scenario.instruct_endtag) { localsettings.instruct_endtag = temp_scenario.instruct_endtag; } } - - render_gametext(); } function togglescenarioallownsfw() @@ -4382,13 +4484,13 @@ Current version: 68 } function confirm_scenario_verify() { - if(temp_scenario.show_warning==true) + if(temp_scenario.show_warning==true && localsettings.passed_ai_warning==false) { let warntxt = `

Disclaimer: The AI is not suitable to be used as an actual therapist, counselor or advisor of any kind.

While some find it comforting to talk about their issues with an AI, the responses are unpredictable.

When using the AI for real world use-cases such as advice or counseling this means you must be able to understand when an answer is wrong. If you would not trust a random person to pretend to be your advisor; you should definitely not use the AI for this. The models are simply too small and not trained for this purpose.

-

If you still wish to proceed, please type the phrase I understand in the box below, exactly as written.

+

If you still wish to proceed, please type the phrase "I understand" in the box below, exactly as written.

If you are experiencing feelings of distress, anxiety, suicidal thoughts, or other forms of mental discomfort, it's best to avoid using AI for non fiction or personal matters as it may exacerbate or encourage these feelings.

`; inputBox(warntxt,"AI Safety Warning","","Acknowledgement Required",()=>{ @@ -4396,6 +4498,7 @@ Current version: 68 if(userinput.includes("understand")) { confirm_scenario(); + localsettings.passed_ai_warning = true; //remember flag for session } },true); } else { @@ -4551,12 +4654,15 @@ Current version: 68 msgbox(last_request_str,"Last Request Context",false); } + var worker_data_showonly = []; //only for table display, dont mix function get_and_show_workers() { - if(localflag) - { + if (localflag) { return; } - get_workers((wdata) => { show_workers(wdata) }); + get_workers((wdata) => { + worker_data_showonly = wdata; + show_workers(); + }); } function get_workers(onDoneCallback) { if(localflag) @@ -4599,12 +4705,42 @@ Current version: 68 } - function show_workers(wdata) { + function worker_list_quick_search() + { + if(document.getElementById("workertable").innerHTML!="") + { + let searchstr = document.getElementById("workerlistquicksearch").value.toLowerCase(); + for (var i = 0; i < worker_data_showonly.length; ++i) { + let elem = worker_data_showonly[i]; + let tablerow = document.getElementById("workertablerow_"+i); + if(tablerow) + { + if(searchstr=="" || elem.name.toLowerCase().includes(searchstr) || + elem.models[0].toLowerCase().includes(searchstr)) + { + tablerow.style.display = ""; + }else{ + tablerow.style.display = "none"; + } + } + } + } + } + + function format_uptime(seconds) + { + const days = Math.floor(seconds / (3600 * 24)); + const hours = Math.floor((seconds % (3600 * 24)) / 3600); + const minutes = Math.floor((seconds % 3600) / 60); + return days+"d "+hours+"h "+minutes+"m"; + } + + function show_workers() { document.getElementById("workercontainer").classList.remove("hidden"); let str = ""; - for (var i = 0; i < wdata.length; ++i) { - let elem = wdata[i]; + for (var i = 0; i < worker_data_showonly.length; ++i) { + let elem = worker_data_showonly[i]; let tokenspersec = elem.performance.replace(" tokens per second", ""); if(tokenspersec.toLowerCase()=="no requests fulfilled yet") { @@ -4619,10 +4755,16 @@ Current version: 68 { workerNameHtml = ""+workerNameHtml+""; } - str += "" + workerNameHtml + "" + escapeHtml(elem.models[0].substring(0, 32)) + "" + elem.max_length + " / " + elem.max_context_length + "
(" + tokenspersec + " T/s)" + elem.uptime + "
(" + elem.requests_fulfilled + " jobs)" + elem.kudos_rewards.toFixed(0) + ""+clustertag+""; + let allmdls = ""; + for (let n = 0; n < elem.models.length; ++n) { + if (n > 0) { allmdls += "
"; } + allmdls += escapeHtml(elem.models[n].substring(0, 32)); + } + + str += "" + workerNameHtml + "" + allmdls + "" + elem.max_length + " / " + elem.max_context_length + "
(" + tokenspersec + " T/s)" + format_uptime(elem.uptime) + "
(" + elem.requests_fulfilled + " jobs)" + elem.kudos_rewards.toFixed(0) + ""+clustertag+""; } document.getElementById("workertable").innerHTML = str; - document.getElementById("worktitlecount").innerText = "Worker List - Total "+wdata.length; + document.getElementById("worktitlecount").innerText = "Worker List - Total " + worker_data_showonly.length; } function show_my_own_workers() @@ -4647,7 +4789,7 @@ Current version: 68 let brokenstyle = (elem.maintenance_mode ? "style=\"color:#ee4444;\"" : ""); let workerNameHtml = escapeHtml(elem.name.substring(0, 32)); let eleminfo = ((elem.info && elem.info!="")?elem.info:""); - str += "" + workerNameHtml + "" + elem.uptime + "
(" + elem.requests_fulfilled + " jobs)" + elem.kudos_rewards.toFixed(0) + ""+(elem.online?"Online":"Offline")+""; + str += "" + workerNameHtml + "" + format_uptime(elem.uptime) + "
(" + elem.requests_fulfilled + " jobs)" + elem.kudos_rewards.toFixed(0) + ""+(elem.online?"Online":"Offline")+""; } document.getElementById("myownworkertable").innerHTML = str; @@ -4847,6 +4989,7 @@ Current version: 68 document.getElementById("koboldcustom").classList.add("hidden"); document.getElementById("scalecustom").classList.add("hidden"); document.getElementById("claudecustom").classList.add("hidden"); + document.getElementById("palmcustom").classList.add("hidden"); if(epchoice==0) { document.getElementById("koboldcustom").classList.remove("hidden"); @@ -4883,6 +5026,14 @@ Current version: 68 } } } + else if(epchoice==4) + { + document.getElementById("palmcustom").classList.remove("hidden"); + if(custom_palm_key=="" && localsettings.saved_palm_key!="") + { + document.getElementById("custom_palm_key").value = localsettings.saved_palm_key; + } + } } @@ -4892,6 +5043,7 @@ Current version: 68 custom_oai_key = ""; custom_scale_key = ""; custom_claude_key = ""; + custom_palm_key = ""; let epchoice = document.getElementById("customapidropdown").value; if(epchoice==0) //connect to kobold endpoint @@ -5064,8 +5216,11 @@ Current version: 68 { desired_oai_ep = desired_oai_ep.slice(0, -1); } - if(desired_oai_ep!="" && desired_oai_ep.length > 4 && !desired_oai_ep.slice(-4).toLowerCase().includes("/v")) { - desired_oai_ep = desired_oai_ep + "/v1"; + if (document.getElementById("oaiaddversion").checked) + { + if(desired_oai_ep!="" && desired_oai_ep.length > 4 && !desired_oai_ep.slice(-4).toLowerCase().includes("/v")) { + desired_oai_ep = desired_oai_ep + "/v1"; + } } if(desired_oai_key!="" && desired_oai_ep!="") { @@ -5208,8 +5363,11 @@ Current version: 68 { desired_claude_ep = desired_claude_ep.slice(0, -1); } - if(desired_claude_ep!="" && desired_claude_ep.length > 4 && !desired_claude_ep.slice(-4).toLowerCase().includes("/v")) { - desired_claude_ep = desired_claude_ep + "/v1"; + if (document.getElementById("claudeaddversion").checked) + { + if (desired_claude_ep != "" && desired_claude_ep.length > 4 && !desired_claude_ep.slice(-4).toLowerCase().includes("/v")) { + desired_claude_ep = desired_claude_ep + "/v1"; + } } if(desired_claude_key!="" && desired_claude_ep!="") { @@ -5249,6 +5407,37 @@ Current version: 68 } } + else if(epchoice==4) //palm endpoint + { + let desired_palm_key = document.getElementById("custom_palm_key").value.trim(); + + if(desired_palm_key!="") + { + hide_popups(); + + //good to go + custom_palm_key = desired_palm_key; + localsettings.saved_palm_key = custom_palm_key; + + selected_models = [{ "performance": 100.0, "queued": 0.0, "eta": 0, "name": "text-bison-001", "count": 1 }]; + selected_workers = []; + if (perfdata == null) { + //generate some fake perf data if horde is offline and using custom endpoint + perfdata = { + "queued_requests": 0, + "queued_tokens": 0, + "past_minute_tokens": 0, + "worker_count": 0 + }; + document.body.classList.add("connected"); + document.getElementById("connectstatus").classList.remove("color_orange"); + document.getElementById("connectstatus").classList.add("color_green"); + } + document.getElementById("connectstatus").innerHTML = "Connected to PaLM Endpoint"; + render_gametext(); + } + } + } function display_custom_endpoint() @@ -5406,6 +5595,7 @@ Current version: 68 custom_oai_key = ""; custom_scale_key = ""; custom_claude_key = ""; + custom_palm_key = ""; //remove the Custom Endpoint if it's multi selected together with others. const findex = selected_idx_arr.indexOf("9999"); if (findex > -1) { @@ -5661,11 +5851,12 @@ Current version: 68 document.getElementById("invert_colors").checked = localsettings.invert_colors; document.getElementById("trimsentences").checked = localsettings.trimsentences; document.getElementById("trimwhitespace").checked = localsettings.trimwhitespace; - document.getElementById("unban_tokens").checked = localsettings.unban_tokens; + document.getElementById("eos_ban_mode").value = localsettings.eos_ban_mode; document.getElementById("persist_session").checked = localsettings.persist_session; document.getElementById("opmode").value = localsettings.opmode; document.getElementById("chatname").value = localsettings.chatname; - document.getElementById("chatopponent").value = localsettings.chatopponent; + document.getElementById("chatopponent").value = replaceAll(localsettings.chatopponent,"||$||","\n"); + handle_bot_name_onchange(); document.getElementById("instruct_starttag").value = localsettings.instruct_starttag; document.getElementById("instruct_endtag").value = localsettings.instruct_endtag; document.getElementById("top_k").value = localsettings.top_k; @@ -5687,6 +5878,7 @@ Current version: 68 } document.getElementById("setgrammar").disabled = !is_using_kcpp_with_grammar(); + document.getElementById("grammar_retain_state").disabled = document.getElementById("setgrammar").disabled; if(custom_kobold_endpoint!="") { @@ -5703,7 +5895,7 @@ Current version: 68 document.getElementById("idle_duration").value = localsettings.idle_duration; document.getElementById("adventure_context_mod").checked = localsettings.adventure_context_mod; document.getElementById("instruct_has_markdown").checked = localsettings.instruct_has_markdown; - document.getElementById("raw_instruct_tags").checked = localsettings.raw_instruct_tags; + document.getElementById("placeholder_tags").checked = localsettings.placeholder_tags; document.getElementById("auto_ctxlen").checked = localsettings.auto_ctxlen; document.getElementById("auto_genamt").checked = localsettings.auto_genamt; if(localflag) @@ -5752,6 +5944,7 @@ Current version: 68 sdmodelshtml += "