Illumotion committed on
Commit 46c2bfc
1 Parent(s): d257d3f

Upload folder using huggingface_hub

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full list.
Files changed (50):
  1. .gitignore +25 -12
  2. CMakeLists.txt +26 -25
  3. Dockerfile +3 -3
  4. Makefile +69 -20
  5. Package.swift +4 -2
  6. README.md +84 -6
  7. Remote-Link.cmd +18 -2
  8. class.py +15 -13
  9. colab.ipynb +4 -6
  10. common/CMakeLists.txt +2 -0
  11. common/common.cpp +137 -60
  12. common/common.h +21 -9
  13. common/log.h +37 -37
  14. common/train.cpp +1496 -0
  15. common/train.h +230 -0
  16. convert-falcon-hf-to-gguf.py +0 -6
  17. convert-starcoder-hf-to-gguf.py +0 -6
  18. convert.py +1 -1
  19. examples/CMakeLists.txt +4 -0
  20. examples/baby-llama/baby-llama.cpp +79 -147
  21. examples/batched/CMakeLists.txt +5 -0
  22. examples/batched/README.md +44 -0
  23. examples/batched/batched.cpp +255 -0
  24. examples/beam-search/beam-search.cpp +4 -3
  25. examples/embd-input/embd-input-lib.cpp +10 -10
  26. examples/embd-input/embd-input-test.cpp +1 -1
  27. examples/embedding/embedding.cpp +11 -10
  28. examples/export-lora/CMakeLists.txt +5 -0
  29. examples/export-lora/README.md +26 -0
  30. examples/export-lora/export-lora.cpp +474 -0
  31. examples/finetune/CMakeLists.txt +5 -0
  32. examples/finetune/README.md +90 -0
  33. examples/finetune/convert-finetune-checkpoint-to-gguf.py +489 -0
  34. examples/finetune/finetune.cpp +1940 -0
  35. examples/gptneox-wip/falcon-main.cpp +2 -2
  36. examples/gptneox-wip/gptneox-main.cpp +2 -2
  37. examples/llama-bench/README.md +271 -0
  38. examples/llama-bench/llama-bench.cpp +116 -51
  39. examples/main/README.md +2 -2
  40. examples/main/main.cpp +36 -39
  41. examples/make-ggml.py +19 -14
  42. examples/parallel/CMakeLists.txt +8 -0
  43. examples/parallel/README.md +3 -0
  44. examples/parallel/parallel.cpp +380 -0
  45. examples/perplexity/README.md +18 -0
  46. examples/perplexity/perplexity.cpp +70 -46
  47. examples/quantize-stats/quantize-stats.cpp +9 -8
  48. examples/quantize/README.md +41 -0
  49. examples/quantize/quantize.cpp +1 -0
  50. examples/save-load-state/save-load-state.cpp +12 -18
.gitignore CHANGED
@@ -12,6 +12,9 @@
.vs/
.vscode/

+ lcov-report/
+ gcovr-report/
+
build*/
out/
tmp/
@@ -19,24 +22,34 @@ tmp/
models/*
models-mnt

- /main
- /quantize
- /quantize-stats
- /result
- /perplexity
- /embedding
- /train-text-from-scratch
- /convert-llama2c-to-ggml
- /simple
- /benchmark-matmult
- /vdot
- /server
/Pipfile
+ /baby-llama
+ /beam-search
+ /benchmark-matmult
+ /convert-llama2c-to-ggml
/embd-input-test
+ /embedding
/gguf
/gguf-llama-simple
/libllama.so
/llama-bench
+ /main
+ /metal
+ /perplexity
+ /q8dot
+ /quantize
+ /quantize-stats
+ /result
+ /save-load-state
+ /server
+ /simple
+ /batched
+ /export-lora
+ /finetune
+ /speculative
+ /parallel
+ /train-text-from-scratch
+ /vdot
build-info.h
arm_neon.h
compile_commands.json
CMakeLists.txt CHANGED
@@ -197,37 +197,38 @@ endif()

if (LLAMA_ALL_WARNINGS)
if (NOT MSVC)
- set(c_flags
- -Wall
- -Wextra
- -Wpedantic
- -Wcast-qual
- -Wdouble-promotion
- -Wshadow
- -Wstrict-prototypes
- -Wpointer-arith
- -Wmissing-prototypes
- -Werror=implicit-int
- -Wno-unused-function
- )
- set(cxx_flags
- -Wall
- -Wextra
- -Wpedantic
- -Wcast-qual
- -Wmissing-declarations
- -Wno-unused-function
- -Wno-multichar
- )
- if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
- # g++ only
- set(cxx_flags ${cxx_flags} -Wno-format-truncation -Wno-array-bounds)
+ set(warning_flags -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function)
+ set(c_flags -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int
+ -Werror=implicit-function-declaration)
+ set(cxx_flags -Wmissing-declarations -Wmissing-noreturn)
+
+ if (CMAKE_C_COMPILER_ID MATCHES "Clang")
+ set(warning_flags ${warning_flags} -Wunreachable-code-break -Wunreachable-code-return)
+ set(cxx_flags ${cxx_flags} -Wmissing-prototypes -Wextra-semi)
+
+ if (
+ (CMAKE_C_COMPILER_ID STREQUAL "Clang" AND CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL 3.8.0) OR
+ (CMAKE_C_COMPILER_ID STREQUAL "AppleClang" AND CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL 7.3.0)
+ )
+ set(c_flags ${c_flags} -Wdouble-promotion)
+ endif()
+ elseif (CMAKE_C_COMPILER_ID STREQUAL "GNU")
+ set(c_flags ${c_flags} -Wdouble-promotion)
+ set(cxx_flags ${cxx_flags} -Wno-array-bounds)
+
+ if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 7.1.0)
+ set(cxx_flags ${cxx_flags} -Wno-format-truncation)
+ endif()
+ if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 8.1.0)
+ set(cxx_flags ${cxx_flags} -Wextra-semi)
+ endif()
endif()
else()
# todo : msvc
endif()

add_compile_options(
+ ${warning_flags}
"$<$<COMPILE_LANGUAGE:C>:${c_flags}>"
"$<$<COMPILE_LANGUAGE:CXX>:${cxx_flags}>"
)
Dockerfile CHANGED
@@ -4,8 +4,8 @@ COPY . .
RUN apt update \
&& apt install build-essential wget libopenblas-dev make -y \
&& make LLAMA_OPENBLAS=1 \
- && wget https://huggingface.co/TheBloke/Pygmalion-2-7B-GGUF/resolve/main/pygmalion-2-7b.Q4_0.gguf \
+ && wget https://huggingface.co/notstoic/pygmalion-13b-ggml/resolve/main/pygmalion-13b-ggml-q4_0.bin \
&& apt remove build-essential wget make -y \
- && rm -fr *.bat convert-* ci docs examples otherarchs tests
+ && rm -fr *.bat convert-* ci docs examples otherarchs tests

- ENTRYPOINT ["python", "koboldcpp.py", "pygmalion-2-7b.Q4_0.gguf", "--port", "7860"]
+ ENTRYPOINT ["python", "koboldcpp.py", "pygmalion-13b-ggml-q4_0.bin", "--port", "7860", "--smartcontext", "--stream"]
Makefile CHANGED
@@ -39,10 +39,15 @@ endif
#

# keep standard at C11 and C++11
- CFLAGS = -I. -I./include -I./include/CL -I./otherarch -I./otherarch/tools -Ofast -DNDEBUG -std=c11 -fPIC -DGGML_USE_K_QUANTS -DLOG_DISABLE_LOGS -D_GNU_SOURCE
- CXXFLAGS = -I. -I./common -I./include -I./include/CL -I./otherarch -I./otherarch/tools -Ofast -DNDEBUG -std=c++11 -fPIC -DGGML_USE_K_QUANTS -DLOG_DISABLE_LOGS -D_GNU_SOURCE
+ CFLAGS = -I. -I./include -I./include/CL -I./otherarch -I./otherarch/tools -Ofast -DNDEBUG -std=c11 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE
+ CXXFLAGS = -I. -I./common -I./include -I./include/CL -I./otherarch -I./otherarch/tools -Ofast -DNDEBUG -std=c++11 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE
LDFLAGS =

+ ifndef LLAMA_NO_K_QUANTS
+ CFLAGS += -DGGML_USE_K_QUANTS
+ CXXFLAGS += -DGGML_USE_K_QUANTS
+ endif
+
# these are used on windows, to build some libraries with extra old device compatibility
SIMPLECFLAGS =
FULLCFLAGS =
@@ -285,19 +290,17 @@ ifeq ($(OS),Windows_NT)
endif
else
DEFAULT_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o $@.so $(LDFLAGS)
- FAILSAFE_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o $@.so $(LDFLAGS)
+
ifdef LLAMA_OPENBLAS
OPENBLAS_BUILD = $(CXX) $(CXXFLAGS) $^ $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS)
- NOAVX2_BUILD = $(CXX) $(CXXFLAGS) $^ $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS)
endif
ifdef LLAMA_CLBLAST
ifeq ($(UNAME_S),Darwin)
- CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ -lclblast -framework OpenCL $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS)
+ CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ -lclblast -framework OpenCL $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS)
else
- CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ -lclblast -lOpenCL $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS)
+ CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ -lclblast -lOpenCL $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS)
endif
endif
-
ifdef LLAMA_CUBLAS
CUBLAS_BUILD = $(CXX) $(CXXFLAGS) $(CUBLAS_FLAGS) $^ -shared -o $@.so $(CUBLASLD_FLAGS) $(LDFLAGS)
endif
@@ -351,12 +354,20 @@ ggml_cublas.o: ggml.c ggml.h ggml-cuda.h k_quants.h
	$(CC) $(CFLAGS) $(FULLCFLAGS) $(CUBLAS_FLAGS) $(HIPFLAGS) -c $< -o $@

#quants K
+ KQ1 =
+ KQ2 =
+ KQ3 =
+ ifndef LLAMA_NO_K_QUANTS
+ KQ1 = k_quants.o
+ KQ2 = k_quants_noavx2.o
+ KQ3 = k_quants_failsafe.o
k_quants.o: k_quants.c k_quants.h ggml.h ggml-cuda.h
	$(CC) $(CFLAGS) $(FULLCFLAGS) -c $< -o $@
k_quants_noavx2.o: k_quants.c k_quants.h ggml.h ggml-cuda.h
	$(CC) $(CFLAGS) $(SIMPLECFLAGS) -c $< -o $@
k_quants_failsafe.o: k_quants.c k_quants.h ggml.h ggml-cuda.h
	$(CC) $(CFLAGS) $(NONECFLAGS) -c $< -o $@
+ endif # LLAMA_NO_K_QUANTS

#there's no intrinsics or special gpu ops used here, so we can have a universal object
ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h
@@ -416,7 +427,7 @@ gpttype_adapter_cublas.o: $(GPTTYPE_ADAPTER)
clean:
	rm -vf *.o main quantize_llama quantize_gpt2 quantize_gptj quantize_neox quantize_mpt quantize-stats perplexity embedding benchmark-matmult save-load-state gguf gguf.exe main.exe quantize_llama.exe quantize_gptj.exe quantize_gpt2.exe quantize_neox.exe quantize_mpt.exe koboldcpp_default.dll koboldcpp_openblas.dll koboldcpp_failsafe.dll koboldcpp_noavx2.dll koboldcpp_clblast.dll koboldcpp_cublas.dll koboldcpp_hipblas.dll koboldcpp_default.so koboldcpp_openblas.so koboldcpp_failsafe.so koboldcpp_noavx2.so koboldcpp_clblast.so koboldcpp_cublas.so koboldcpp_hipblas.so

- main: examples/main/main.cpp build-info.h ggml.o k_quants.o ggml-alloc.o llama.o common.o console.o grammar-parser.o $(OBJS)
+ main: examples/main/main.cpp build-info.h ggml.o $(KQ1) ggml-alloc.o llama.o common.o console.o grammar-parser.o $(OBJS)
	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
	@echo
	@echo '==== Run ./main -h for help. ===='
@@ -425,31 +436,69 @@ main: examples/main/main.cpp build-info.h ggml.o k_quants.o ggml-alloc.o llama.o
gguf: examples/gguf/gguf.cpp build-info.h ggml.o llama.o $(OBJS)
	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

+
#generated libraries
- koboldcpp_default: ggml.o ggml_v2.o ggml_v1.o expose.o common.o gpttype_adapter.o k_quants.o ggml-alloc.o grammar-parser.o $(OBJS)
+ koboldcpp_default: ggml.o ggml_v2.o ggml_v1.o expose.o common.o gpttype_adapter.o $(KQ1) ggml-alloc.o grammar-parser.o $(OBJS)
	$(DEFAULT_BUILD)
- koboldcpp_openblas: ggml_openblas.o ggml_v2_openblas.o ggml_v1.o expose.o common.o gpttype_adapter.o k_quants.o ggml-alloc.o grammar-parser.o $(OBJS)
+
+ ifdef OPENBLAS_BUILD
+ koboldcpp_openblas: ggml_openblas.o ggml_v2_openblas.o ggml_v1.o expose.o common.o gpttype_adapter.o $(KQ1) ggml-alloc.o grammar-parser.o $(OBJS)
	$(OPENBLAS_BUILD)
- koboldcpp_failsafe: ggml_failsafe.o ggml_v2_failsafe.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_failsafe.o k_quants_failsafe.o ggml-alloc.o grammar-parser.o $(OBJS)
+ else
+ koboldcpp_openblas:
+ 	$(DONOTHING)
+ endif
+
+ ifdef FAILSAFE_BUILD
+ koboldcpp_failsafe: ggml_failsafe.o ggml_v2_failsafe.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_failsafe.o $(KQ3) ggml-alloc.o grammar-parser.o $(OBJS)
	$(FAILSAFE_BUILD)
- koboldcpp_noavx2: ggml_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_failsafe.o k_quants_noavx2.o ggml-alloc.o grammar-parser.o $(OBJS)
+ else
+ koboldcpp_failsafe:
+ 	$(DONOTHING)
+ endif
+
+ ifdef NOAVX2_BUILD
+ koboldcpp_noavx2: ggml_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_failsafe.o $(KQ2) ggml-alloc.o grammar-parser.o $(OBJS)
	$(NOAVX2_BUILD)
- koboldcpp_clblast: ggml_clblast.o ggml_v2_clblast.o ggml_v1.o expose.o common.o gpttype_adapter_clblast.o ggml-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o k_quants.o ggml-alloc.o grammar-parser.o $(OBJS)
+ else
+ koboldcpp_noavx2:
+ 	$(DONOTHING)
+ endif
+
+ ifdef CLBLAST_BUILD
+ koboldcpp_clblast: ggml_clblast.o ggml_v2_clblast.o ggml_v1.o expose.o common.o gpttype_adapter_clblast.o ggml-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o $(KQ1) ggml-alloc.o grammar-parser.o $(OBJS)
	$(CLBLAST_BUILD)
- koboldcpp_cublas: ggml_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o common.o gpttype_adapter_cublas.o k_quants.o ggml-alloc.o grammar-parser.o $(CUBLAS_OBJS) $(OBJS)
+ else
+ koboldcpp_clblast:
+ 	$(DONOTHING)
+ endif
+
+ ifdef CUBLAS_BUILD
+ koboldcpp_cublas: ggml_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o common.o gpttype_adapter_cublas.o $(KQ1) ggml-alloc.o grammar-parser.o $(CUBLAS_OBJS) $(OBJS)
	$(CUBLAS_BUILD)
- koboldcpp_hipblas: ggml_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o common.o gpttype_adapter_cublas.o k_quants.o ggml-alloc.o grammar-parser.o $(HIP_OBJS) $(OBJS)
+ else
+ koboldcpp_cublas:
+ 	$(DONOTHING)
+ endif
+
+ ifdef HIPBLAS_BUILD
+ koboldcpp_hipblas: ggml_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o common.o gpttype_adapter_cublas.o $(KQ1) ggml-alloc.o grammar-parser.o $(HIP_OBJS) $(OBJS)
	$(HIPBLAS_BUILD)
+ else
+ koboldcpp_hipblas:
+ 	$(DONOTHING)
+ endif

- quantize_llama: examples/quantize/quantize.cpp ggml.o llama.o k_quants.o ggml-alloc.o
+ # tools
+ quantize_llama: examples/quantize/quantize.cpp ggml.o llama.o $(KQ1) ggml-alloc.o
	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
- quantize_gptj: ggml.o llama.o k_quants.o ggml-alloc.o otherarch/tools/gptj_quantize.cpp otherarch/tools/common-ggml.cpp
+ quantize_gptj: ggml.o llama.o $(KQ1) ggml-alloc.o otherarch/tools/gptj_quantize.cpp otherarch/tools/common-ggml.cpp
	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
- quantize_gpt2: ggml.o llama.o k_quants.o ggml-alloc.o otherarch/tools/gpt2_quantize.cpp otherarch/tools/common-ggml.cpp
+ quantize_gpt2: ggml.o llama.o $(KQ1) ggml-alloc.o otherarch/tools/gpt2_quantize.cpp otherarch/tools/common-ggml.cpp
	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
- quantize_neox: ggml.o llama.o k_quants.o ggml-alloc.o otherarch/tools/neox_quantize.cpp otherarch/tools/common-ggml.cpp
+ quantize_neox: ggml.o llama.o $(KQ1) ggml-alloc.o otherarch/tools/neox_quantize.cpp otherarch/tools/common-ggml.cpp
	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
- quantize_mpt: ggml.o llama.o k_quants.o ggml-alloc.o otherarch/tools/mpt_quantize.cpp otherarch/tools/common-ggml.cpp
+ quantize_mpt: ggml.o llama.o $(KQ1) ggml-alloc.o otherarch/tools/mpt_quantize.cpp otherarch/tools/common-ggml.cpp
	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
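The library targets above now produce one shared object per backend, and a backend whose *_BUILD command was never defined falls through to a `$(DONOTHING)` rule instead of breaking the build. A minimal Python sketch (not part of this commit; the probing order and file layout are assumptions) of how a launcher such as koboldcpp.py can then pick whichever backend library actually got built:

```python
# Hypothetical sketch: select one of the per-backend libraries the Makefile emits.
# Library names come from the Makefile above; the preference order is an assumption.
import ctypes
import os
import platform

def pick_backend_library(prefer_gpu: bool = True) -> str:
    ext = ".dll" if platform.system() == "Windows" else ".so"
    # Targets skipped via $(DONOTHING) never produce a file, so os.path.exists filters them out.
    candidates = ["koboldcpp_cublas", "koboldcpp_clblast", "koboldcpp_openblas",
                  "koboldcpp_default", "koboldcpp_noavx2", "koboldcpp_failsafe"]
    if not prefer_gpu:
        candidates = candidates[2:] + candidates[:2]
    for name in candidates:
        path = os.path.abspath(name + ext)
        if os.path.exists(path):
            return path
    raise FileNotFoundError("no koboldcpp backend library found; run make first")

if __name__ == "__main__":
    lib_path = pick_backend_library()
    handle = ctypes.CDLL(lib_path)  # the real exported entry points live in expose.cpp
    print("loaded", lib_path, handle)
```

In the real project the exports are declared and called through ctypes inside koboldcpp.py; this sketch only illustrates why one library is built (or stubbed out) per acceleration backend.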
Package.swift CHANGED
@@ -10,7 +10,7 @@ let platforms: [SupportedPlatform]? = [
.tvOS(.v14)
]
let exclude: [String] = []
- let additionalSources: [String] = ["ggml-metal.m"]
+ let additionalSources: [String] = ["ggml-metal.m", "ggml-metal.metal"]
let additionalSettings: [CSetting] = [
.unsafeFlags(["-fno-objc-arc"]),
.define("GGML_SWIFT"),
@@ -44,7 +44,9 @@ let package = Package(
cSettings: [
.unsafeFlags(["-Wno-shorten-64-to-32"]),
.define("GGML_USE_K_QUANTS"),
- .define("GGML_USE_ACCELERATE")
+ .define("GGML_USE_ACCELERATE"),
+ .define("ACCELERATE_NEW_LAPACK"),
+ .define("ACCELERATE_LAPACK_ILP64")
] + additionalSettings,
linkerSettings: [
.linkedFramework("Accelerate")
README.md CHANGED
@@ -1,6 +1,84 @@
- ---
- sdk: docker
- emoji: 🚀
- colorFrom: yellow
- colorTo: blue
- ---
+ # koboldcpp
+
+ KoboldCpp is an easy-to-use AI text-generation software for GGML models. It's a single self contained distributable from Concedo, that builds off llama.cpp, and adds a versatile Kobold API endpoint, additional format support, backward compatibility, as well as a fancy UI with persistent stories, editing tools, save formats, memory, world info, author's note, characters, scenarios and everything Kobold and Kobold Lite have to offer.
+
+ ![Preview](media/preview.png)
+
+ ## Usage
+ - **[Download the latest .exe release here](https://github.com/LostRuins/koboldcpp/releases/latest)** or clone the git repo.
+ - Windows binaries are provided in the form of **koboldcpp.exe**, which is a pyinstaller wrapper for a few **.dll** files and **koboldcpp.py**. If you feel concerned, you may prefer to rebuild it yourself with the provided makefiles and scripts.
+ - Weights are not included, you can use the official llama.cpp `quantize.exe` to generate them from your official weight files (or download them from other places such as [TheBloke's Huggingface](https://huggingface.co/TheBloke).
+ - To run, execute **koboldcpp.exe** or drag and drop your quantized `ggml_model.bin` file onto the .exe, and then connect with Kobold or Kobold Lite. If you're not on windows, then run the script **KoboldCpp.py** after compiling the libraries.
+ - Launching with no command line arguments displays a GUI containing a subset of configurable settings. Generally you dont have to change much besides the `Presets` and `GPU Layers`. Read the `--help` for more info about each settings.
+ - By default, you can connect to http://localhost:5001
+ - You can also run it using the command line `koboldcpp.exe [ggml_model.bin] [port]`. For info, please check `koboldcpp.exe --help`
+ - Default context size to small? Try `--contextsize 3072` to 1.5x your context size! without much perplexity gain. Note that you'll have to increase the max context in the Kobold Lite UI as well (click and edit the number text field).
+ - Big context too slow? Try the `--smartcontext` flag to reduce prompt processing frequency. Also, you can try to run with your GPU using CLBlast, with `--useclblast` flag for a speedup
+ - Want even more speedup? Combine `--useclblast` with `--gpulayers` to offload entire layers to the GPU! **Much faster, but uses more VRAM**. Experiment to determine number of layers to offload, and reduce by a few if you run out of memory.
+ - If you are having crashes or issues, you can try turning off BLAS with the `--noblas` flag. You can also try running in a non-avx2 compatibility mode with `--noavx2`. Lastly, you can try turning off mmap with `--nommap`.
+
+ For more information, be sure to run the program with the `--help` flag.
+
+ ## OSX and Linux
+ - You will have to compile your binaries from source. A makefile is provided, simply run `make`
+ - If you want you can also link your own install of OpenBLAS manually with `make LLAMA_OPENBLAS=1`
+ - Alternatively, if you want you can also link your own install of CLBlast manually with `make LLAMA_CLBLAST=1`, for this you will need to obtain and link OpenCL and CLBlast libraries.
+ - For Arch Linux: Install `cblas` `openblas` and `clblast`.
+ - For Debian: Install `libclblast-dev` and `libopenblas-dev`.
+ - For a full featured build, do `make LLAMA_OPENBLAS=1 LLAMA_CLBLAST=1 LLAMA_CUBLAS=1`
+ - After all binaries are built, you can run the python script with the command `koboldcpp.py [ggml_model.bin] [port]`
+ - Note: Many OSX users have found that the using Accelerate is actually faster than OpenBLAS. To try, you may wish to run with `--noblas` and compare speeds.
+
+ ## Compiling on Windows
+ - You're encouraged to use the .exe released, but if you want to compile your binaries from source at Windows, the easiest way is:
+ - Use the latest release of w64devkit (https://github.com/skeeto/w64devkit). Be sure to use the "vanilla one", not i686 or other different stuff. If you try they will conflit with the precompiled libs!
+ - Make sure you are using the w64devkit integrated terminal, then run 'make' at the KoboldCpp source folder. This will create the .dll files.
+ - If you want to generate the .exe file, make sure you have the python module PyInstaller installed with pip ('pip install PyInstaller').
+ - Run the script make_pyinstaller.bat at a regular terminal (or Windows Explorer).
+ - The koboldcpp.exe file will be at your dist folder.
+ - If you wish to use your own version of the additional Windows libraries (OpenCL, CLBlast and OpenBLAS), you can do it with:
+ - OpenCL - tested with https://github.com/KhronosGroup/OpenCL-SDK . If you wish to compile it, follow the repository instructions. You will need vcpkg.
+ - CLBlast - tested with https://github.com/CNugteren/CLBlast . If you wish to compile it you will need to reference the OpenCL files. It will only generate the ".lib" file if you compile using MSVC.
+ - OpenBLAS - tested with https://github.com/xianyi/OpenBLAS .
+ - Move the respectives .lib files to the /lib folder of your project, overwriting the older files.
+ - Also, replace the existing versions of the corresponding .dll files located in the project directory root (e.g. libopenblas.dll).
+ - Make the KoboldCPP project using the instructions above.
+
+ ## Android (Termux) Alternative method
+ - See https://github.com/ggerganov/llama.cpp/pull/1828/files
+
+ ## Using CuBLAS
+ - If you're on Windows with an Nvidia GPU you can get CUDA support out of the box using the `--usecublas` flag, make sure you select the correct .exe with CUDA support.
+ - You can attempt a CuBLAS build with `LLAMA_CUBLAS=1` or using the provided CMake file (best for visual studio users). If you use the CMake file to build, copy the `koboldcpp_cublas.dll` generated into the same directory as the `koboldcpp.py` file. If you are bundling executables, you may need to include CUDA dynamic libraries (such as `cublasLt64_11.dll` and `cublas64_11.dll`) in order for the executable to work correctly on a different PC.
+
+ ## AMD
+ - Please check out https://github.com/YellowRoseCx/koboldcpp-rocm
+
+ ## Questions and Help
+ - **First, please check out [The KoboldCpp FAQ and Knowledgebase](https://github.com/LostRuins/koboldcpp/wiki) which may already have answers to your questions! Also please search through past issues and discussions.**
+ - If you cannot find an answer, open an issue on this github, or find us on the [KoboldAI Discord](https://koboldai.org/discord).
+
+ ## Considerations
+ - For Windows: No installation, single file executable, (It Just Works)
+ - Since v1.0.6, requires libopenblas, the prebuilt windows binaries are included in this repo. If not found, it will fall back to a mode without BLAS.
+ - Since v1.15, requires CLBlast if enabled, the prebuilt windows binaries are included in this repo. If not found, it will fall back to a mode without CLBlast.
+ - Since v1.33, you can set the context size to be above what the model supports officially. It does increases perplexity but should still work well below 4096 even on untuned models. (For GPT-NeoX, GPT-J, and LLAMA models) Customize this with `--ropeconfig`.
+ - **I plan to keep backwards compatibility with ALL past llama.cpp AND alpaca.cpp models**. But you are also encouraged to reconvert/update your models if possible for best results.
+
+ ## License
+ - The original GGML library and llama.cpp by ggerganov are licensed under the MIT License
+ - However, Kobold Lite is licensed under the AGPL v3.0 License
+ - The other files are also under the AGPL v3.0 License unless otherwise stated
+
+ ## Notes
+ - Generation delay scales linearly with original prompt length. If OpenBLAS is enabled then prompt ingestion becomes about 2-3x faster. This is automatic on windows, but will require linking on OSX and Linux. CLBlast speeds this up even further, and `--gpulayers` + `--useclblast` or `--usecublas` more so.
+ - I have heard of someone claiming a false AV positive report. The exe is a simple pyinstaller bundle that includes the necessary python scripts and dlls to run. If this still concerns you, you might wish to rebuild everything from source code using the makefile, and you can rebuild the exe yourself with pyinstaller by using `make_pyinstaller.bat`
+ - Supported GGML models (Includes backward compatibility for older versions/legacy GGML models, though some newer features might be unavailable):
+ - LLAMA and LLAMA2 (LLaMA / Alpaca / GPT4All / Vicuna / Koala / Pygmalion 7B / Metharme 7B / WizardLM and many more)
+ - GPT-2 / Cerebras
+ - GPT-J
+ - RWKV
+ - GPT-NeoX / Pythia / StableLM / Dolly / RedPajama
+ - MPT models
+ - Falcon (GGUF only)
+
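The Usage section above comes down to: start the program, then talk to it on port 5001 through the Kobold API endpoint. A minimal client sketch using only the Python standard library; the `/api/v1/generate` route and payload fields follow the usual KoboldAI API convention and are not shown in this diff, so treat them as assumptions to verify against a running instance:

```python
# Minimal sketch of talking to a running KoboldCpp instance on the default port.
# Assumes the KoboldAI-style "/api/v1/generate" route and response shape.
import json
import urllib.request

def generate(prompt: str, max_length: int = 80, port: int = 5001) -> str:
    payload = json.dumps({"prompt": prompt, "max_length": max_length}).encode("utf-8")
    req = urllib.request.Request(
        f"http://localhost:{port}/api/v1/generate",
        data=payload,
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(req) as resp:
        data = json.load(resp)
    # KoboldAI-style responses return a list of result objects.
    return data["results"][0]["text"]

if __name__ == "__main__":
    print(generate("Once upon a time,"))
```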
Remote-Link.cmd CHANGED
@@ -1,2 +1,18 @@
- curl -L https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-windows-amd64.exe -o cloudflared.exe
- cloudflared.exe tunnel --url localhost:5001
+ : # This script will help setup a cloudflared tunnel for accessing KoboldCpp over the internet
+ : # It should work out of the box on both linux and windows
+ : # ======
+ : # WINDOWS PORTION
+ :<<BATCH
+ @echo off
+ echo Starting Cloudflare Tunnel for Windows
+ curl -L https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-windows-amd64.exe -o cloudflared.exe
+ cloudflared.exe tunnel --url localhost:5001
+ GOTO ENDING
+ BATCH
+ : # LINUX PORTION
+ echo 'Starting Cloudflare Tunnel for Linux'
+ curl -L https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64 -o 'cloudflared-linux-amd64' #
+ chmod +x 'cloudflared-linux-amd64' #
+ ./cloudflared-linux-amd64 tunnel --url http://localhost:5001 #
+ exit #
+ :ENDING
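The new Remote-Link.cmd is a cmd/sh polyglot: under a POSIX shell the `:<<BATCH ... BATCH` heredoc swallows the Windows block, while under cmd.exe the `GOTO ENDING` jump skips the Linux block. The same download-and-tunnel logic written out in one cross-platform Python sketch (only the URLs and port are taken from the script; everything else is illustrative):

```python
# Cross-platform sketch of what Remote-Link.cmd does: fetch the right cloudflared
# binary and open a tunnel to the local KoboldCpp port.
import platform
import subprocess
import urllib.request

BASE = "https://github.com/cloudflare/cloudflared/releases/latest/download/"

def start_tunnel(port: int = 5001) -> None:
    if platform.system() == "Windows":
        name = "cloudflared.exe"
        urllib.request.urlretrieve(BASE + "cloudflared-windows-amd64.exe", name)
    else:
        name = "./cloudflared-linux-amd64"
        urllib.request.urlretrieve(BASE + "cloudflared-linux-amd64", name)
        subprocess.run(["chmod", "+x", name], check=True)
    # Blocks until interrupted; cloudflared prints the public URL on startup.
    subprocess.run([name, "tunnel", "--url", f"http://localhost:{port}"], check=True)

if __name__ == "__main__":
    start_tunnel()
```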
class.py CHANGED
@@ -188,6 +188,18 @@ class model_backend(InferenceModel):
"extra_classes": "",
'children': [{'text': 'False', 'value': False}, {'text': 'True', 'value': True}],
})
+ requested_parameters.append({
+ "uitype": "text",
+ "unit": "text",
+ "label": "GPU ID",
+ "id": "kcpp_tensor_split_str",
+ "default": "1",
+ "check": {"value": "", 'check': "!="},
+ "tooltip": "Which GPU's do we use? For example:1 2",
+ "menu_path": "",
+ "refresh_model_inputs": False,
+ "extra_classes": ""
+ })
requested_parameters.append({
"uitype": "dropdown",
"unit": "int",
@@ -202,18 +214,6 @@ class model_backend(InferenceModel):
"extra_classes": "",
'children': [{'text': 'False', 'value': 0}, {'text': 'True', 'value': 1}],
})
- requested_parameters.append({
- "uitype": "text",
- "unit": "text",
- "label": "Tensor Split",
- "id": "kcpp_tensor_split_str",
- "default": self.kcpp_tensor_split_str,
- "check": {"value": "", 'check': "!="},
- "tooltip": "Tensor Split, values are space separated",
- "menu_path": "",
- "refresh_model_inputs": False,
- "extra_classes": ""
- })
return requested_parameters

def set_input_parameters(self, parameters):
@@ -232,6 +232,7 @@
self.kcpp_tensor_split = []
for s in splits:
self.kcpp_tensor_split.append(int(s))
+ print(self.kcpp_tensor_split)

accel = parameters["kcpp_accelerator"]
if accel==0:
@@ -271,7 +272,8 @@
blasbatchsize=self.kcpp_blasbatchsize, ropeconfig=[self.kcpp_ropescale, self.kcpp_ropebase], stream=False, smartcontext=self.kcpp_smartcontext,
unbantokens=False, bantokens=None, usemirostat=None, forceversion=0, nommap=self.kcpp_nommap,
usemlock=False, noavx2=self.kcpp_noavx2, debugmode=self.kcpp_debugmode, skiplauncher=True, hordeconfig=None, noblas=self.kcpp_noblas,
- useclblast=self.kcpp_useclblast, usecublas=self.kcpp_usecublas, gpulayers=self.kcpp_gpulayers, tensor_split=self.kcpp_tensor_split, config=None, onready='', multiuser=False)
+ useclblast=self.kcpp_useclblast, usecublas=self.kcpp_usecublas, gpulayers=self.kcpp_gpulayers, tensor_split=self.kcpp_tensor_split, config=None,
+ onready='', multiuser=False, foreground=False)

koboldcpp.main(kcppargs,False) #initialize library without enabling Lite http server
kcpp_backend_loaded = True
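The backend change above replaces the free-form "Tensor Split" field with a "GPU ID" text field whose value (for example `1 2`) is split on whitespace, converted to integers, and passed to koboldcpp as `tensor_split`; the launch call also gains a `foreground=False` argument. A small sketch of that parsing step (the helper name is hypothetical; the real logic lives in `set_input_parameters`):

```python
# Sketch of the "GPU ID" handling shown above: a space-separated string of GPU
# indices becomes the tensor_split list handed to koboldcpp.
from typing import List, Optional

def parse_tensor_split(field_value: str) -> Optional[List[int]]:
    field_value = field_value.strip()
    if not field_value:
        return None  # let koboldcpp keep its default single-GPU behaviour
    return [int(part) for part in field_value.split()]

print(parse_tensor_split("1 2"))  # [1, 2]
print(parse_tensor_split("1"))    # [1]
print(parse_tensor_split(""))     # None
```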
colab.ipynb CHANGED
@@ -6,7 +6,7 @@
"private_outputs": true,
"provenance": [],
"gpuType": "T4",
- "authorship_tag": "ABX9TyOv14c2MWENhO6RJ3uy6vD7",
+ "authorship_tag": "",
"include_colab_link": true
},
"kernelspec": {
@@ -25,9 +25,7 @@
"id": "view-in-github",
"colab_type": "text"
},
- "source": [
- "<a href=\"https://colab.research.google.com/github/henk717/koboldcpp/blob/concedo/colab.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
- ]
+ "source": []
},
{
"cell_type": "code",
@@ -40,7 +38,7 @@
"source": [
"#@title <b>v-- Enter your model below and then click this to start Koboldcpp</b>\n",
"\n",
- "Model = \"https://huggingface.co/TheBloke/airoboros-l2-13B-gpt4-1.4.1-GGML/resolve/main/airoboros-l2-13b-gpt4-1.4.1.ggmlv3.q4_0.bin\" #@param [\"\"]{allow-input: true}\n",
+ "Model = \"https://huggingface.co/TheBloke/Airoboros-L2-13B-2.2-GGUF/resolve/main/airoboros-l2-13b-2.2.Q4_K_M.gguf\" #@param [\"\"]{allow-input: true}\n",
"Layers = 43 #@param [43]{allow-input: true}\n",
"\n",
"%cd /content\n",
@@ -54,7 +52,7 @@
"!nohup ./cloudflared-linux-amd64 tunnel --url http://localhost:5001 &\n",
"!sleep 10\n",
"!cat nohup.out\n",
- "!python koboldcpp.py model.ggml --stream --usecublas 0 --gpulayers $Layers --hordeconfig concedo\n"
+ "!python koboldcpp.py model.ggml --stream --usecublas 0 mmq --gpulayers $Layers --hordeconfig concedo\n"
]
}
]
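The updated notebook cell downloads a GGUF build of Airoboros L2 13B and starts KoboldCpp with CuBLAS, the `mmq` kernels and 43 offloaded layers. Roughly the same steps as a plain local script, for reference (the local paths and the non-notebook form are assumptions; the notebook itself uses shell magics):

```python
# Local (non-Colab) sketch of the notebook cell above: fetch the GGUF model and
# launch koboldcpp.py with the same flags shown in the diff.
import subprocess
import urllib.request

MODEL_URL = ("https://huggingface.co/TheBloke/Airoboros-L2-13B-2.2-GGUF/"
             "resolve/main/airoboros-l2-13b-2.2.Q4_K_M.gguf")
LAYERS = 43

def main() -> None:
    urllib.request.urlretrieve(MODEL_URL, "model.ggml")  # the notebook keeps this filename
    subprocess.run(
        ["python", "koboldcpp.py", "model.ggml", "--stream",
         "--usecublas", "0", "mmq", "--gpulayers", str(LAYERS),
         "--hordeconfig", "concedo"],
        check=True,
    )

if __name__ == "__main__":
    main()
```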
common/CMakeLists.txt CHANGED
@@ -9,6 +9,8 @@ add_library(${TARGET} OBJECT
console.cpp
grammar-parser.h
grammar-parser.cpp
+ train.h
+ train.cpp
)

if (BUILD_SHARED_LIBS)
common/common.cpp CHANGED
@@ -78,7 +78,7 @@ int32_t get_num_physical_cores() {
return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
}

- static void process_escapes(std::string& input) {
+ void process_escapes(std::string& input) {
std::size_t input_len = input.length();
std::size_t output_idx = 0;

@@ -129,6 +129,15 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
if (params.n_threads <= 0) {
params.n_threads = std::thread::hardware_concurrency();
}
+ } else if (arg == "-tb" || arg == "--threads-batch") {
+ if (++i >= argc) {
+ invalid_param = true;
+ break;
+ }
+ params.n_threads_batch = std::stoi(argv[i]);
+ if (params.n_threads_batch <= 0) {
+ params.n_threads_batch = std::thread::hardware_concurrency();
+ }
} else if (arg == "-p" || arg == "--prompt") {
if (++i >= argc) {
invalid_param = true;
@@ -317,6 +326,18 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
break;
}
params.n_chunks = std::stoi(argv[i]);
+ } else if (arg == "-np" || arg == "--parallel") {
+ if (++i >= argc) {
+ invalid_param = true;
+ break;
+ }
+ params.n_parallel = std::stoi(argv[i]);
+ } else if (arg == "-ns" || arg == "--sequences") {
+ if (++i >= argc) {
+ invalid_param = true;
+ break;
+ }
+ params.n_sequences = std::stoi(argv[i]);
} else if (arg == "-m" || arg == "--model") {
if (++i >= argc) {
invalid_param = true;
@@ -340,7 +361,19 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
invalid_param = true;
break;
}
- params.lora_adapter = argv[i];
+ params.lora_adapter.push_back({argv[i], 1.0f});
+ params.use_mmap = false;
+ } else if (arg == "--lora-scaled") {
+ if (++i >= argc) {
+ invalid_param = true;
+ break;
+ }
+ const char * lora_adapter = argv[i];
+ if (++i >= argc) {
+ invalid_param = true;
+ break;
+ }
+ params.lora_adapter.push_back({lora_adapter, std::stof(argv[i])});
params.use_mmap = false;
} else if (arg == "--lora-base") {
if (++i >= argc) {
@@ -360,6 +393,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
params.multiline_input = true;
} else if (arg == "--simple-io") {
params.simple_io = true;
+ } else if (arg == "-cb" || arg == "--cont-batching") {
+ params.cont_batching = true;
} else if (arg == "--color") {
params.use_color = true;
} else if (arg == "--mlock") {
@@ -425,19 +460,11 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
params.mul_mat_q = false;
#else
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Disabling mul_mat_q kernels has no effect.\n");
- #endif // GGML_USE_CUBLAS
- } else if (arg == "--low-vram" || arg == "-lv") {
- #ifdef GGML_USE_CUBLAS
- params.low_vram = true;
- #else
- fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n");
#endif // GGML_USE_CUBLAS
} else if (arg == "--no-mmap") {
params.use_mmap = false;
} else if (arg == "--numa") {
params.numa = true;
- } else if (arg == "--export") {
- params.export_cgraph = true;
} else if (arg == "--verbose-prompt") {
params.verbose_prompt = true;
} else if (arg == "-r" || arg == "--reverse-prompt") {
@@ -456,8 +483,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
if (params.logdir.back() != DIRECTORY_SEPARATOR) {
params.logdir += DIRECTORY_SEPARATOR;
}
- } else if (arg == "--perplexity") {
- params.perplexity = true;
+ } else if (arg == "--perplexity" || arg == "--all-logits") {
+ params.logits_all = true;
} else if (arg == "--ppl-stride") {
if (++i >= argc) {
invalid_param = true;
@@ -606,7 +633,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" (can be specified more than once for multiple prompts).\n");
printf(" --color colorise output to distinguish prompt and user input from generations\n");
printf(" -s SEED, --seed SEED RNG seed (default: -1, use random seed for < 0)\n");
- printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
+ printf(" -t N, --threads N number of threads to use during generation (default: %d)\n", params.n_threads);
+ printf(" -tb N, --threads-batch N\n");
+ printf(" number of threads to use during batch and prompt processing (default: same as --threads)\n");
printf(" -p PROMPT, --prompt PROMPT\n");
printf(" prompt to start generation with (default: empty)\n");
printf(" -e, --escape process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n");
@@ -621,7 +650,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" -f FNAME, --file FNAME\n");
printf(" prompt file to start generation.\n");
printf(" -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
- printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
+ printf(" -c N, --ctx-size N size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx);
printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
printf(" --top-k N top-k sampling (default: %d, 0 = disabled)\n", params.top_k);
printf(" --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p);
@@ -655,12 +684,15 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
printf(" --temp N temperature (default: %.1f)\n", (double)params.temp);
- printf(" --perplexity compute perplexity over each ctx window of the prompt\n");
+ printf(" --logits-all return logits for all tokens in the batch (default: disabled)\n");
printf(" --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f\n");
printf(" --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
printf(" --keep N number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
printf(" --draft N number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft);
printf(" --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
+ printf(" -np N, --parallel N number of parallel sequences to decode (default: %d)\n", params.n_parallel);
+ printf(" -ns N, --sequences N number of sequences to decode (default: %d)\n", params.n_sequences);
+ printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
if (llama_mlock_supported()) {
printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n");
}
@@ -678,17 +710,16 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" -ts SPLIT --tensor-split SPLIT\n");
printf(" how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
printf(" -mg i, --main-gpu i the GPU to use for scratch and small tensors\n");
- printf(" -lv, --low-vram don't allocate VRAM scratch buffer\n");
#ifdef GGML_USE_CUBLAS
printf(" -nommq, --no-mul-mat-q\n");
printf(" use " GGML_CUBLAS_NAME " instead of custom mul_mat_q " GGML_CUDA_NAME " kernels.\n");
printf(" Not recommended since this is both slower and uses more VRAM.\n");
#endif // GGML_USE_CUBLAS
#endif
- printf(" --export export the computation graph to 'llama.ggml'\n");
printf(" --verbose-prompt print prompt before generation\n");
fprintf(stderr, " --simple-io use basic IO for better compatibility in subprocesses and limited consoles\n");
printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
+ printf(" --lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap)\n");
printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
printf(" -m FNAME, --model FNAME\n");
printf(" model path (default: %s)\n", params.model.c_str());
@@ -699,6 +730,18 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf("\n");
}

+ std::string get_system_info(const gpt_params & params) {
+ std::ostringstream os;
+
+ os << "system_info: n_threads = " << params.n_threads;
+ if (params.n_threads_batch != -1) {
+ os << " (n_threads_batch = " << params.n_threads_batch << ")";
+ }
+ os << " / " << std::thread::hardware_concurrency() << " | " << llama_print_system_info();
+
+ return os.str();
+ }
+
std::string gpt_random_prompt(std::mt19937 & rng) {
const int r = rng() % 10;
switch (r) {
@@ -712,60 +755,74 @@ std::string gpt_random_prompt(std::mt19937 & rng) {
case 7: return "He";
case 8: return "She";
case 9: return "They";
- default: return "To";
}

- return "The";
+ GGML_UNREACHABLE();
}

//
// Model utils
//

- struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
- auto lparams = llama_context_default_params();
+ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & params) {
+ auto mparams = llama_model_default_params();

- lparams.n_ctx = params.n_ctx;
- lparams.n_batch = params.n_batch;
if (params.n_gpu_layers != -1) {
- lparams.n_gpu_layers = params.n_gpu_layers;
+ mparams.n_gpu_layers = params.n_gpu_layers;
}
- lparams.main_gpu = params.main_gpu;
- lparams.tensor_split = params.tensor_split;
- lparams.low_vram = params.low_vram;
- lparams.mul_mat_q = params.mul_mat_q;
- lparams.seed = params.seed;
- lparams.f16_kv = params.memory_f16;
- lparams.use_mmap = params.use_mmap;
- lparams.use_mlock = params.use_mlock;
- lparams.logits_all = params.perplexity;
- lparams.embedding = params.embedding;
- lparams.rope_freq_base = params.rope_freq_base;
- lparams.rope_freq_scale = params.rope_freq_scale;
-
- return lparams;
+ mparams.main_gpu = params.main_gpu;
+ mparams.tensor_split = params.tensor_split;
+ mparams.use_mmap = params.use_mmap;
+ mparams.use_mlock = params.use_mlock;
+
+ return mparams;
+ }
+
+ struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
+ auto cparams = llama_context_default_params();
+
+ cparams.n_ctx = params.n_ctx;
+ cparams.n_batch = params.n_batch;
+ cparams.n_threads = params.n_threads;
+ cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
+ cparams.mul_mat_q = params.mul_mat_q;
+ cparams.seed = params.seed;
+ cparams.f16_kv = params.memory_f16;
+ cparams.logits_all = params.logits_all;
+ cparams.embedding = params.embedding;
+ cparams.rope_freq_base = params.rope_freq_base;
+ cparams.rope_freq_scale = params.rope_freq_scale;
+
+ return cparams;
}

std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params) {
- auto lparams = llama_context_params_from_gpt_params(params);
+ auto mparams = llama_model_params_from_gpt_params(params);

- llama_model * model = llama_load_model_from_file(params.model.c_str(), lparams);
+ llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams);
if (model == NULL) {
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
return std::make_tuple(nullptr, nullptr);
}

- llama_context * lctx = llama_new_context_with_model(model, lparams);
+ auto cparams = llama_context_params_from_gpt_params(params);
+
+ llama_context * lctx = llama_new_context_with_model(model, cparams);
if (lctx == NULL) {
fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
llama_free_model(model);
return std::make_tuple(nullptr, nullptr);
}

- if (!params.lora_adapter.empty()) {
+ for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
int err = llama_model_apply_lora_from_file(model,
- params.lora_adapter.c_str(),
- params.lora_base.empty() ? NULL : params.lora_base.c_str(),
params.n_threads);
if (err != 0) {
fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
@@ -782,8 +839,9 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
{
LOG("warming up the model with an empty run\n");

- const std::vector<llama_token> tmp = { llama_token_bos(lctx), llama_token_eos(lctx), };
- llama_eval(lctx, tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, params.n_threads);
llama_reset_timings(lctx);
}

@@ -795,16 +853,23 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
//

std::vector<llama_token> llama_tokenize(
- struct llama_context * ctx,
const std::string & text,
bool add_bos) {
// upper limit for the number of tokens
int n_tokens = text.length() + add_bos;
std::vector<llama_token> result(n_tokens);
- n_tokens = llama_tokenize(ctx, text.data(), text.length(), result.data(), result.size(), add_bos);
if (n_tokens < 0) {
result.resize(-n_tokens);
- int check = llama_tokenize(ctx, text.data(), text.length(), result.data(), result.size(), add_bos);
GGML_ASSERT(check == -n_tokens);
} else {
result.resize(n_tokens);
@@ -814,10 +879,10 @@ std::vector<llama_token> llama_tokenize(

std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
std::vector<char> result(8, 0);
- const int n_tokens = llama_token_to_piece(ctx, token, result.data(), result.size());
if (n_tokens < 0) {
result.resize(-n_tokens);
- int check = llama_token_to_piece(ctx, token, result.data(), result.size());
GGML_ASSERT(check == -n_tokens);
} else {
result.resize(n_tokens);
@@ -872,7 +937,7 @@ llama_token llama_sample_token(
std::vector<llama_token_data> & candidates,
int idx) {
const int n_ctx = llama_n_ctx(ctx);
- const int n_vocab = llama_n_vocab(ctx);

const float temp = params.temp;
const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k;
@@ -890,7 +955,7 @@

llama_token id = 0;

- float * logits = llama_get_logits(ctx) + idx * n_vocab;

// Apply params.logit_bias map
for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
@@ -941,11 +1006,11 @@
if (mirostat == 1) {
static float mirostat_mu = 2.0f * mirostat_tau;
const int mirostat_m = 100;
- llama_sample_temperature(ctx, &cur_p, temp);
id = llama_sample_token_mirostat(ctx, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
} else if (mirostat == 2) {
static float mirostat_mu = 2.0f * mirostat_tau;
- llama_sample_temperature(ctx, &cur_p, temp);
id = llama_sample_token_mirostat_v2(ctx, &cur_p, mirostat_tau, mirostat_eta, &mirostat_mu);
} else {
// Temperature sampling
@@ -953,7 +1018,7 @@
llama_sample_tail_free (ctx, &cur_p, tfs_z, 1);
llama_sample_typical (ctx, &cur_p, typical_p, 1);
llama_sample_top_p (ctx, &cur_p, top_p, 1);
- llama_sample_temperature(ctx, &cur_p, temp);

{
const int n_top = 10;
@@ -1158,7 +1223,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
#endif // NDEBUG

fprintf(stream, "model_desc: %s\n", model_desc);
- fprintf(stream, "n_vocab: %d # output size of the final layer, 32001 for some models\n", llama_n_vocab(lctx));

#ifdef __OPTIMIZE__
fprintf(stream, "optimize: true\n");
@@ -1182,7 +1247,6 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false");
fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);
fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false");
- fprintf(stream, "export: %s # default: false\n", params.export_cgraph ? "true" : "false");
fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n");
fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", params.frequency_penalty);
dump_string_yaml_multiline(stream, "grammar", params.grammar.c_str());
@@ -1211,9 +1275,21 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
fprintf(stream, " %d: %f", lb.first, lb.second);
}

- fprintf(stream, "lora: %s\n", params.lora_adapter.c_str());
fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
- fprintf(stream, "low_vram: %s # default: false\n", params.low_vram ? "true" : "false");
fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
fprintf(stream, "memory_f32: %s # default: false\n", !params.memory_f16 ? "true" : "false");
fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", params.mirostat);
@@ -1256,6 +1332,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
fprintf(stream, "rope_freq_scale: %f # default: 1.0\n", params.rope_freq_scale);
fprintf(stream, "seed: %d # default: -1 (random seed)\n", params.seed);
fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false");
fprintf(stream, "temp: %f # default: 0.8\n", params.temp);

const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + LLAMA_MAX_DEVICES);
818
+ const std::string& lora_adapter = std::get<0>(params.lora_adapter[i]);
819
+ float lora_scale = std::get<1>(params.lora_adapter[i]);
820
  int err = llama_model_apply_lora_from_file(model,
821
+ lora_adapter.c_str(),
822
+ lora_scale,
823
+ ((i > 0) || params.lora_base.empty())
824
+ ? NULL
825
+ : params.lora_base.c_str(),
826
  params.n_threads);
827
  if (err != 0) {
828
  fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
 
839
  {
840
  LOG("warming up the model with an empty run\n");
841
 
842
+ std::vector<llama_token> tmp = { llama_token_bos(lctx), llama_token_eos(lctx), };
843
+ llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
844
+ llama_kv_cache_tokens_rm(lctx, -1, -1);
845
  llama_reset_timings(lctx);
846
  }
847
 
 
853
  //
854
 
855
  std::vector<llama_token> llama_tokenize(
856
+ const struct llama_context * ctx,
857
+ const std::string & text,
858
+ bool add_bos) {
859
+ return llama_tokenize(llama_get_model(ctx), text, add_bos);
860
+ }
861
+
862
+ std::vector<llama_token> llama_tokenize(
863
+ const struct llama_model * model,
864
  const std::string & text,
865
  bool add_bos) {
866
  // upper limit for the number of tokens
867
  int n_tokens = text.length() + add_bos;
868
  std::vector<llama_token> result(n_tokens);
869
+ n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos);
870
  if (n_tokens < 0) {
871
  result.resize(-n_tokens);
872
+ int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos);
873
  GGML_ASSERT(check == -n_tokens);
874
  } else {
875
  result.resize(n_tokens);
 
879
 
880
  std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
881
  std::vector<char> result(8, 0);
882
+ const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
883
  if (n_tokens < 0) {
884
  result.resize(-n_tokens);
885
+ int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
886
  GGML_ASSERT(check == -n_tokens);
887
  } else {
888
  result.resize(n_tokens);
 
937
  std::vector<llama_token_data> & candidates,
938
  int idx) {
939
  const int n_ctx = llama_n_ctx(ctx);
940
+ const int n_vocab = llama_n_vocab(llama_get_model(ctx));
941
 
942
  const float temp = params.temp;
943
  const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k;
 
955
 
956
  llama_token id = 0;
957
 
958
+ float * logits = llama_get_logits_ith(ctx, idx);
959
 
960
  // Apply params.logit_bias map
961
  for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
 
1006
  if (mirostat == 1) {
1007
  static float mirostat_mu = 2.0f * mirostat_tau;
1008
  const int mirostat_m = 100;
1009
+ llama_sample_temp(ctx, &cur_p, temp);
1010
  id = llama_sample_token_mirostat(ctx, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
1011
  } else if (mirostat == 2) {
1012
  static float mirostat_mu = 2.0f * mirostat_tau;
1013
+ llama_sample_temp(ctx, &cur_p, temp);
1014
  id = llama_sample_token_mirostat_v2(ctx, &cur_p, mirostat_tau, mirostat_eta, &mirostat_mu);
1015
  } else {
1016
  // Temperature sampling
 
1018
  llama_sample_tail_free (ctx, &cur_p, tfs_z, 1);
1019
  llama_sample_typical (ctx, &cur_p, typical_p, 1);
1020
  llama_sample_top_p (ctx, &cur_p, top_p, 1);
1021
+ llama_sample_temp(ctx, &cur_p, temp);
1022
 
1023
  {
1024
  const int n_top = 10;
 
1223
  #endif // NDEBUG
1224
 
1225
  fprintf(stream, "model_desc: %s\n", model_desc);
1226
+ fprintf(stream, "n_vocab: %d # output size of the final layer, 32001 for some models\n", llama_n_vocab(llama_get_model(lctx)));
1227
 
1228
  #ifdef __OPTIMIZE__
1229
  fprintf(stream, "optimize: true\n");
 
1247
  fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false");
1248
  fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);
1249
  fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false");
 
1250
  fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n");
1251
  fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", params.frequency_penalty);
1252
  dump_string_yaml_multiline(stream, "grammar", params.grammar.c_str());
 
1275
  fprintf(stream, " %d: %f", lb.first, lb.second);
1276
  }
1277
 
1278
+ fprintf(stream, "lora:\n");
1279
+ for (std::tuple<std::string, float> la : params.lora_adapter) {
1280
+ if (std::get<1>(la) != 1.0f) {
1281
+ continue;
1282
+ }
1283
+ fprintf(stream, " - %s\n", std::get<0>(la).c_str());
1284
+ }
1285
+ fprintf(stream, "lora_scaled:\n");
1286
+ for (std::tuple<std::string, float> la : params.lora_adapter) {
1287
+ if (std::get<1>(la) == 1.0f) {
1288
+ continue;
1289
+ }
1290
+ fprintf(stream, " - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la));
1291
+ }
1292
  fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
 
1293
  fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
1294
  fprintf(stream, "memory_f32: %s # default: false\n", !params.memory_f16 ? "true" : "false");
1295
  fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", params.mirostat);
 
1332
  fprintf(stream, "rope_freq_scale: %f # default: 1.0\n", params.rope_freq_scale);
1333
  fprintf(stream, "seed: %d # default: -1 (random seed)\n", params.seed);
1334
  fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false");
1335
+ fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false");
1336
  fprintf(stream, "temp: %f # default: 0.8\n", params.temp);
1337
 
1338
  const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + LLAMA_MAX_DEVICES);
common/common.h CHANGED
@@ -36,20 +36,23 @@ int32_t get_num_physical_cores();
36
  struct gpt_params {
37
  uint32_t seed = -1; // RNG seed
38
  int32_t n_threads = get_num_physical_cores();
 
39
  int32_t n_predict = -1; // new tokens to predict
40
  int32_t n_ctx = 512; // context size
41
  int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
42
  int32_t n_keep = 0; // number of tokens to keep from initial prompt
43
  int32_t n_draft = 16; // number of tokens to draft during speculative decoding
44
  int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
 
 
45
  int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
46
  int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
47
  int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
48
  float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
49
  int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
50
  int32_t n_beams = 0; // if non-zero then use beam search of given width.
51
- float rope_freq_base = 10000.0f; // RoPE base frequency
52
- float rope_freq_scale = 1.0f; // RoPE frequency scaling factor
53
 
54
  // sampling parameters
55
  int32_t top_k = 40; // <= 0 to use vocab size
@@ -83,8 +86,8 @@ struct gpt_params {
83
  std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
84
  std::string logdir = ""; // directory in which to save YAML log files
85
 
86
- std::string lora_adapter = ""; // lora adapter path
87
- std::string lora_base = ""; // base model path for the lora adapter
88
 
89
  int ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
90
  int ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
@@ -93,7 +96,6 @@ struct gpt_params {
93
  bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
94
  size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score
95
 
96
- bool low_vram = false; // if true, reduce VRAM usage at the cost of performance
97
  bool mul_mat_q = true; // if true, use mul_mat_q kernels instead of cuBLAS
98
  bool memory_f16 = true; // use f16 instead of f32 for memory kv
99
  bool random_prompt = false; // do not randomize prompt if none provided
@@ -107,16 +109,16 @@ struct gpt_params {
107
  bool interactive_first = false; // wait for user input immediately
108
  bool multiline_input = false; // reverse the usage of `\`
109
  bool simple_io = false; // improves compatibility with subprocesses and limited consoles
 
110
 
111
  bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
112
  bool ignore_eos = false; // ignore generated EOS tokens
113
  bool instruct = false; // instruction mode (used for Alpaca models)
114
  bool penalize_nl = true; // consider newlines as a repeatable token
115
- bool perplexity = false; // compute perplexity over the prompt
116
  bool use_mmap = true; // use mmap for faster loads
117
  bool use_mlock = false; // use mlock to keep model in memory
118
  bool numa = false; // attempt optimizations that help on some NUMA systems
119
- bool export_cgraph = false; // export the computation graph
120
  bool verbose_prompt = false; // print prompt tokens before generation
121
  };
122
 
@@ -124,13 +126,18 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
124
 
125
  void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
126
 
 
 
127
  std::string gpt_random_prompt(std::mt19937 & rng);
128
 
 
 
129
  //
130
  // Model utils
131
  //
132
 
133
  std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params);
 
134
  struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
135
 
136
  //
@@ -140,7 +147,12 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
140
  // tokenizes a string into a vector of tokens
141
  // should work similar to Python's `tokenizer.encode`
142
  std::vector<llama_token> llama_tokenize(
143
- struct llama_context * ctx,
 
 
 
 
 
144
  const std::string & text,
145
  bool add_bos);
146
 
@@ -181,7 +193,7 @@ std::string llama_detokenize_bpe(
181
  // - ctx_guidance: context to use for classifier-free guidance, ignore if NULL
182
  // - grammar: grammar to use for sampling, ignore if NULL
183
  // - last_tokens: needed for repetition penalty, ignore if empty
184
- // - idx: sample from llama_get_logits(ctx) + idx * n_vocab
185
  //
186
  // returns:
187
  // - token: sampled token
 
36
  struct gpt_params {
37
  uint32_t seed = -1; // RNG seed
38
  int32_t n_threads = get_num_physical_cores();
39
+ int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
40
  int32_t n_predict = -1; // new tokens to predict
41
  int32_t n_ctx = 512; // context size
42
  int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
43
  int32_t n_keep = 0; // number of tokens to keep from initial prompt
44
  int32_t n_draft = 16; // number of tokens to draft during speculative decoding
45
  int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
46
+ int32_t n_parallel = 1; // number of parallel sequences to decode
47
+ int32_t n_sequences = 1; // number of sequences to decode
48
  int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
49
  int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
50
  int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
51
  float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
52
  int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
53
  int32_t n_beams = 0; // if non-zero then use beam search of given width.
54
+ float rope_freq_base = 0.0f; // RoPE base frequency
55
+ float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
56
 
57
  // sampling parameters
58
  int32_t top_k = 40; // <= 0 to use vocab size
 
86
  std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
87
  std::string logdir = ""; // directory in which to save YAML log files
88
 
89
+ std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
90
+ std::string lora_base = ""; // base model path for the lora adapter
91
 
92
  int ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
93
  int ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
 
96
  bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
97
  size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score
98
 
 
99
  bool mul_mat_q = true; // if true, use mul_mat_q kernels instead of cuBLAS
100
  bool memory_f16 = true; // use f16 instead of f32 for memory kv
101
  bool random_prompt = false; // do not randomize prompt if none provided
 
109
  bool interactive_first = false; // wait for user input immediately
110
  bool multiline_input = false; // reverse the usage of `\`
111
  bool simple_io = false; // improves compatibility with subprocesses and limited consoles
112
+ bool cont_batching = false; // insert new sequences for decoding on-the-fly
113
 
114
  bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
115
  bool ignore_eos = false; // ignore generated EOS tokens
116
  bool instruct = false; // instruction mode (used for Alpaca models)
117
  bool penalize_nl = true; // consider newlines as a repeatable token
118
+ bool logits_all = false; // return logits for all tokens in the batch
119
  bool use_mmap = true; // use mmap for faster loads
120
  bool use_mlock = false; // use mlock to keep model in memory
121
  bool numa = false; // attempt optimizations that help on some NUMA systems
 
122
  bool verbose_prompt = false; // print prompt tokens before generation
123
  };
124
 
 
126
 
127
  void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
128
 
129
+ std::string get_system_info(const gpt_params & params);
130
+
131
  std::string gpt_random_prompt(std::mt19937 & rng);
132
 
133
+ void process_escapes(std::string& input);
134
+
135
  //
136
  // Model utils
137
  //
138
 
139
  std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params);
140
+ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & params);
141
  struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
142
 
143
  //
 
147
  // tokenizes a string into a vector of tokens
148
  // should work similar to Python's `tokenizer.encode`
149
  std::vector<llama_token> llama_tokenize(
150
+ const struct llama_context * ctx,
151
+ const std::string & text,
152
+ bool add_bos);
153
+
154
+ std::vector<llama_token> llama_tokenize(
155
+ const struct llama_model * model,
156
  const std::string & text,
157
  bool add_bos);
158
 
 
193
  // - ctx_guidance: context to use for classifier-free guidance, ignore if NULL
194
  // - grammar: grammar to use for sampling, ignore if NULL
195
  // - last_tokens: needed for repetition penalty, ignore if empty
196
+ // - idx: sample from llama_get_logits_ith(ctx, idx)
197
  //
198
  // returns:
199
  // - token: sampled token
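
Aside: a minimal, illustrative sketch of how the helpers declared above fit together — parse gpt_params, build a model/context pair, tokenize the prompt, and print each piece. The main() scaffolding and llama_backend_init/llama_backend_free are assumptions taken from llama.h usage elsewhere, not from this diff; error handling is trimmed.

// sketch only: tokenize params.prompt and print the resulting pieces
#include "common.h"
#include "llama.h"
#include <cstdio>
#include <tuple>
#include <vector>

int main(int argc, char ** argv) {
    gpt_params params;
    if (!gpt_params_parse(argc, argv, params)) {
        return 1;
    }

    llama_backend_init(params.numa);  // assumed available from llama.h

    llama_model   * model;
    llama_context * ctx;
    std::tie(model, ctx) = llama_init_from_gpt_params(params);
    if (model == NULL || ctx == NULL) {
        return 1;
    }

    // add_bos = true: prepend the BOS token, as most completion prompts expect
    const std::vector<llama_token> tokens = llama_tokenize(ctx, params.prompt, true);
    for (const llama_token id : tokens) {
        printf("%6d -> '%s'\n", id, llama_token_to_piece(ctx, id).c_str());
    }

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}
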
common/log.h CHANGED
@@ -225,31 +225,31 @@ enum LogTriState
225
  // USE LOG() INSTEAD
226
  //
227
  #ifndef _MSC_VER
228
- #define LOG_IMPL(str, ...) \
229
- { \
230
  if (LOG_TARGET != nullptr) \
231
  { \
232
  fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL, __VA_ARGS__); \
233
  fflush(LOG_TARGET); \
234
  } \
235
- }
236
  #else
237
- #define LOG_IMPL(str, ...) \
238
- { \
239
  if (LOG_TARGET != nullptr) \
240
  { \
241
  fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL "", ##__VA_ARGS__); \
242
  fflush(LOG_TARGET); \
243
  } \
244
- }
245
  #endif
246
 
247
  // INTERNAL, DO NOT USE
248
  // USE LOG_TEE() INSTEAD
249
  //
250
  #ifndef _MSC_VER
251
- #define LOG_TEE_IMPL(str, ...) \
252
- { \
253
  if (LOG_TARGET != nullptr) \
254
  { \
255
  fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL, __VA_ARGS__); \
@@ -260,10 +260,10 @@ enum LogTriState
260
  fprintf(LOG_TEE_TARGET, LOG_TEE_TIMESTAMP_FMT LOG_TEE_FLF_FMT str "%s" LOG_TEE_TIMESTAMP_VAL LOG_TEE_FLF_VAL, __VA_ARGS__); \
261
  fflush(LOG_TEE_TARGET); \
262
  } \
263
- }
264
  #else
265
- #define LOG_TEE_IMPL(str, ...) \
266
- { \
267
  if (LOG_TARGET != nullptr) \
268
  { \
269
  fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL "", ##__VA_ARGS__); \
@@ -274,7 +274,7 @@ enum LogTriState
274
  fprintf(LOG_TEE_TARGET, LOG_TEE_TIMESTAMP_FMT LOG_TEE_FLF_FMT str "%s" LOG_TEE_TIMESTAMP_VAL LOG_TEE_FLF_VAL "", ##__VA_ARGS__); \
275
  fflush(LOG_TEE_TARGET); \
276
  } \
277
- }
278
  #endif
279
 
280
  // The '\0' as a last argument, is a trick to bypass the silly
@@ -435,41 +435,41 @@ inline FILE *log_handler() { return log_handler1_impl(); }
435
  inline void log_test()
436
  {
437
  log_disable();
438
- LOG("01 Hello World to nobody, because logs are disabled!\n")
439
  log_enable();
440
- LOG("02 Hello World to default output, which is \"%s\" ( Yaaay, arguments! )!\n", LOG_STRINGIZE(LOG_TARGET))
441
- LOG_TEE("03 Hello World to **both** default output and " LOG_TEE_TARGET_STRING "!\n")
442
  log_set_target(stderr);
443
- LOG("04 Hello World to stderr!\n")
444
- LOG_TEE("05 Hello World TEE with double printing to stderr prevented!\n")
445
  log_set_target(LOG_DEFAULT_FILE_NAME);
446
- LOG("06 Hello World to default log file!\n")
447
  log_set_target(stdout);
448
- LOG("07 Hello World to stdout!\n")
449
  log_set_target(LOG_DEFAULT_FILE_NAME);
450
- LOG("08 Hello World to default log file again!\n")
451
  log_disable();
452
- LOG("09 Hello World _1_ into the void!\n")
453
  log_enable();
454
- LOG("10 Hello World back from the void ( you should not see _1_ in the log or the output )!\n")
455
  log_disable();
456
  log_set_target("llama.anotherlog.log");
457
- LOG("11 Hello World _2_ to nobody, new target was selected but logs are still disabled!\n")
458
  log_enable();
459
- LOG("12 Hello World this time in a new file ( you should not see _2_ in the log or the output )?\n")
460
  log_set_target("llama.yetanotherlog.log");
461
- LOG("13 Hello World this time in yet new file?\n")
462
  log_set_target(log_filename_generator("llama_autonamed", "log"));
463
- LOG("14 Hello World in log with generated filename!\n")
464
  #ifdef _MSC_VER
465
- LOG_TEE("15 Hello msvc TEE without arguments\n")
466
- LOG_TEE("16 Hello msvc TEE with (%d)(%s) arguments\n", 1, "test")
467
- LOG_TEELN("17 Hello msvc TEELN without arguments\n")
468
- LOG_TEELN("18 Hello msvc TEELN with (%d)(%s) arguments\n", 1, "test")
469
- LOG("19 Hello msvc LOG without arguments\n")
470
- LOG("20 Hello msvc LOG with (%d)(%s) arguments\n", 1, "test")
471
- LOGLN("21 Hello msvc LOGLN without arguments\n")
472
- LOGLN("22 Hello msvc LOGLN with (%d)(%s) arguments\n", 1, "test")
473
  #endif
474
  }
475
 
@@ -542,7 +542,7 @@ inline void log_dump_cmdline_impl(int argc, char **argv)
542
  buf << " " << argv[i];
543
  }
544
  }
545
- LOGLN("Cmd:%s", buf.str().c_str())
546
  }
547
 
548
  #define log_tostr(var) log_var_to_string_impl(var).c_str()
@@ -620,10 +620,10 @@ inline std::string log_var_to_string_impl(const std::vector<int> & var)
620
  #define LOGLN(...) // dummy stub
621
 
622
  #undef LOG_TEE
623
- #define LOG_TEE(...) fprintf(stderr, __VA_ARGS__); // convert to normal fprintf
624
 
625
  #undef LOG_TEELN
626
- #define LOG_TEELN(...) fprintf(stderr, __VA_ARGS__); // convert to normal fprintf
627
 
628
  #undef LOG_DISABLE
629
  #define LOG_DISABLE() // dummy stub
 
225
  // USE LOG() INSTEAD
226
  //
227
  #ifndef _MSC_VER
228
+ #define LOG_IMPL(str, ...) \
229
+ do { \
230
  if (LOG_TARGET != nullptr) \
231
  { \
232
  fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL, __VA_ARGS__); \
233
  fflush(LOG_TARGET); \
234
  } \
235
+ } while (0)
236
  #else
237
+ #define LOG_IMPL(str, ...) \
238
+ do { \
239
  if (LOG_TARGET != nullptr) \
240
  { \
241
  fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL "", ##__VA_ARGS__); \
242
  fflush(LOG_TARGET); \
243
  } \
244
+ } while (0)
245
  #endif
246
 
247
  // INTERNAL, DO NOT USE
248
  // USE LOG_TEE() INSTEAD
249
  //
250
  #ifndef _MSC_VER
251
+ #define LOG_TEE_IMPL(str, ...) \
252
+ do { \
253
  if (LOG_TARGET != nullptr) \
254
  { \
255
  fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL, __VA_ARGS__); \
 
260
  fprintf(LOG_TEE_TARGET, LOG_TEE_TIMESTAMP_FMT LOG_TEE_FLF_FMT str "%s" LOG_TEE_TIMESTAMP_VAL LOG_TEE_FLF_VAL, __VA_ARGS__); \
261
  fflush(LOG_TEE_TARGET); \
262
  } \
263
+ } while (0)
264
  #else
265
+ #define LOG_TEE_IMPL(str, ...) \
266
+ do { \
267
  if (LOG_TARGET != nullptr) \
268
  { \
269
  fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL "", ##__VA_ARGS__); \
 
274
  fprintf(LOG_TEE_TARGET, LOG_TEE_TIMESTAMP_FMT LOG_TEE_FLF_FMT str "%s" LOG_TEE_TIMESTAMP_VAL LOG_TEE_FLF_VAL "", ##__VA_ARGS__); \
275
  fflush(LOG_TEE_TARGET); \
276
  } \
277
+ } while (0)
278
  #endif
279
 
280
  // The '\0' as a last argument, is a trick to bypass the silly
 
435
  inline void log_test()
436
  {
437
  log_disable();
438
+ LOG("01 Hello World to nobody, because logs are disabled!\n");
439
  log_enable();
440
+ LOG("02 Hello World to default output, which is \"%s\" ( Yaaay, arguments! )!\n", LOG_STRINGIZE(LOG_TARGET));
441
+ LOG_TEE("03 Hello World to **both** default output and " LOG_TEE_TARGET_STRING "!\n");
442
  log_set_target(stderr);
443
+ LOG("04 Hello World to stderr!\n");
444
+ LOG_TEE("05 Hello World TEE with double printing to stderr prevented!\n");
445
  log_set_target(LOG_DEFAULT_FILE_NAME);
446
+ LOG("06 Hello World to default log file!\n");
447
  log_set_target(stdout);
448
+ LOG("07 Hello World to stdout!\n");
449
  log_set_target(LOG_DEFAULT_FILE_NAME);
450
+ LOG("08 Hello World to default log file again!\n");
451
  log_disable();
452
+ LOG("09 Hello World _1_ into the void!\n");
453
  log_enable();
454
+ LOG("10 Hello World back from the void ( you should not see _1_ in the log or the output )!\n");
455
  log_disable();
456
  log_set_target("llama.anotherlog.log");
457
+ LOG("11 Hello World _2_ to nobody, new target was selected but logs are still disabled!\n");
458
  log_enable();
459
+ LOG("12 Hello World this time in a new file ( you should not see _2_ in the log or the output )?\n");
460
  log_set_target("llama.yetanotherlog.log");
461
+ LOG("13 Hello World this time in yet new file?\n");
462
  log_set_target(log_filename_generator("llama_autonamed", "log"));
463
+ LOG("14 Hello World in log with generated filename!\n");
464
  #ifdef _MSC_VER
465
+ LOG_TEE("15 Hello msvc TEE without arguments\n");
466
+ LOG_TEE("16 Hello msvc TEE with (%d)(%s) arguments\n", 1, "test");
467
+ LOG_TEELN("17 Hello msvc TEELN without arguments\n");
468
+ LOG_TEELN("18 Hello msvc TEELN with (%d)(%s) arguments\n", 1, "test");
469
+ LOG("19 Hello msvc LOG without arguments\n");
470
+ LOG("20 Hello msvc LOG with (%d)(%s) arguments\n", 1, "test");
471
+ LOGLN("21 Hello msvc LOGLN without arguments\n");
472
+ LOGLN("22 Hello msvc LOGLN with (%d)(%s) arguments\n", 1, "test");
473
  #endif
474
  }
475
 
 
542
  buf << " " << argv[i];
543
  }
544
  }
545
+ LOGLN("Cmd:%s", buf.str().c_str());
546
  }
547
 
548
  #define log_tostr(var) log_var_to_string_impl(var).c_str()
 
620
  #define LOGLN(...) // dummy stub
621
 
622
  #undef LOG_TEE
623
+ #define LOG_TEE(...) fprintf(stderr, __VA_ARGS__) // convert to normal fprintf
624
 
625
  #undef LOG_TEELN
626
+ #define LOG_TEELN(...) fprintf(stderr, __VA_ARGS__) // convert to normal fprintf
627
 
628
  #undef LOG_DISABLE
629
  #define LOG_DISABLE() // dummy stub
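
Aside: the switch from bare { ... } blocks to do { ... } while (0) in LOG_IMPL/LOG_TEE_IMPL above is the standard way to make a multi-statement macro expand to a single statement, so the trailing semicolon now written at the LOG(...) call sites parses correctly next to if/else. A minimal illustration with hypothetical macro names:

#include <cstdio>

// brace-block form: the ';' the caller writes after the macro orphans a following 'else'
#define BAD_LOG(msg)  { fprintf(stderr, "%s", msg); fflush(stderr); }
// do/while(0) form: the whole expansion is one statement that requires the trailing ';'
#define GOOD_LOG(msg) do { fprintf(stderr, "%s", msg); fflush(stderr); } while (0)

void report(bool ok) {
    // if (ok) BAD_LOG("ok\n"); else BAD_LOG("fail\n");   // does not compile: 'else' without a matching 'if'
    if (ok) GOOD_LOG("ok\n"); else GOOD_LOG("fail\n");    // fine: expands to a single statement
}
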
common/train.cpp ADDED
@@ -0,0 +1,1496 @@
1
+ #include "train.h"
2
+ #include "common.h"
3
+
4
+ #include <random>
5
+ #include <sstream>
6
+ #include <functional>
7
+
8
+ struct random_normal_distribution {
9
+ std::mt19937 gen;
10
+ std::normal_distribution<float> rd;
11
+ float min;
12
+ float max;
13
+ };
14
+
15
+ struct random_uniform_distribution {
16
+ std::mt19937 gen;
17
+ std::uniform_real_distribution<float> rd;
18
+ };
19
+
20
+ struct train_state * init_train_state() {
21
+ struct train_state * state = new struct train_state;
22
+ state->train_its = 0;
23
+ state->train_samples = 0;
24
+ state->train_tokens = 0;
25
+ state->train_epochs = 0;
26
+ state->shuffle_samples_hash = 0;
27
+ state->shuffle_sample_count = 0;
28
+ state->shuffle_next_sample = 0;
29
+ state->shuffle_rng_state_current = "";
30
+ state->shuffle_rng_state_next = "";
31
+
32
+ state->opt = new struct ggml_opt_context;
33
+ state->opt->ctx = NULL;
34
+ state->opt->params = ggml_opt_default_params(GGML_OPT_ADAM);
35
+ state->opt->loss_after = 0.0f;
36
+
37
+ return state;
38
+ }
39
+
40
+ void free_train_state(struct train_state * state) {
41
+ delete state->opt;
42
+ delete state;
43
+ }
44
+
45
+ struct random_normal_distribution * init_random_normal_distribution(
46
+ int seed, float mean, float std, float min, float max
47
+ ) {
48
+ struct random_normal_distribution * rnd = (struct random_normal_distribution *) malloc(sizeof(struct random_normal_distribution));
49
+ rnd->gen = std::mt19937(seed);
50
+ rnd->rd = std::normal_distribution<float>{mean, std};
51
+ rnd->min = min;
52
+ rnd->max = max;
53
+ return rnd;
54
+ }
55
+
56
+ struct random_uniform_distribution * init_random_uniform_distribution(int seed, float min, float max) {
57
+ struct random_uniform_distribution * rnd = (struct random_uniform_distribution *) malloc(sizeof(struct random_uniform_distribution));
58
+ rnd->gen = std::mt19937(seed);
59
+ rnd->rd = std::uniform_real_distribution<float>{min, max};
60
+ return rnd;
61
+ }
62
+
63
+ void free_random_normal_distribution (struct random_normal_distribution * rnd) {
64
+ free(rnd);
65
+ }
66
+
67
+ void free_random_uniform_distribution(struct random_uniform_distribution * rnd) {
68
+ free(rnd);
69
+ }
70
+
71
+ struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct random_normal_distribution * rnd) {
72
+ float scale = 1.0f; // xavier
73
+ switch (tensor->n_dims) {
74
+ case 1:
75
+ scale /= sqrtf((float) tensor->ne[0]);
76
+ for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
77
+ float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0]);
78
+ *dst = scale * frand_normal(rnd);
79
+ }
80
+ break;
81
+ case 2:
82
+ scale /= sqrtf((float) tensor->ne[0]+tensor->ne[1]);
83
+ for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
84
+ for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
85
+ float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
86
+ *dst = scale * frand_normal(rnd);
87
+ }
88
+ }
89
+ break;
90
+ case 3:
91
+ scale /= sqrtf((float) tensor->ne[0]+tensor->ne[1]);
92
+ for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
93
+ for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
94
+ for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
95
+ float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2]);
96
+ *dst = scale * frand_normal(rnd);
97
+ }
98
+ }
99
+ }
100
+ break;
101
+ case 4:
102
+ scale /= sqrtf((float) tensor->ne[0]+tensor->ne[1]);
103
+ for (int i3 = 0; i3 < tensor->ne[3]; i3++) {
104
+ for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
105
+ for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
106
+ for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
107
+ float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]);
108
+ *dst = scale * frand_normal(rnd);
109
+ }
110
+ }
111
+ }
112
+ }
113
+ break;
114
+ default:
115
+ die("Unsupported tensor->n_dims");
116
+ };
117
+ return tensor;
118
+ }
119
+
120
+ struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struct random_uniform_distribution * rnd) {
121
+ switch (tensor->n_dims) {
122
+ case 1:
123
+ for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
124
+ float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0]);
125
+ *dst = frand_uniform(rnd);
126
+ }
127
+ break;
128
+ case 2:
129
+ for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
130
+ for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
131
+ float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
132
+ *dst = frand_uniform(rnd);
133
+ }
134
+ }
135
+ break;
136
+ case 3:
137
+ for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
138
+ for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
139
+ for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
140
+ float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2]);
141
+ *dst = frand_uniform(rnd);
142
+ }
143
+ }
144
+ }
145
+ break;
146
+ case 4:
147
+ for (int i3 = 0; i3 < tensor->ne[3]; i3++) {
148
+ for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
149
+ for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
150
+ for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
151
+ float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]);
152
+ *dst = frand_uniform(rnd);
153
+ }
154
+ }
155
+ }
156
+ }
157
+ break;
158
+ default:
159
+ die("Unsupported tensor->n_dims");
160
+ };
161
+ return tensor;
162
+ }
163
+
164
+ float frand() {
165
+ return (float)rand()/((float)(RAND_MAX) + 1.0f);
166
+ }
167
+
168
+ float frand_normal(struct random_normal_distribution * rnd) {
169
+ return fclamp(rnd->rd(rnd->gen), rnd->min, rnd->max);
170
+ }
171
+
172
+ float frand_uniform(struct random_uniform_distribution * rnd) {
173
+ return rnd->rd(rnd->gen);
174
+ }
175
+
176
+ int clamp(const int v, const int min, const int max) {
177
+ return ((v < min) ? (min) : (v > max) ? (max) : v);
178
+ }
179
+
180
+ float fclamp(const float v, const float min, const float max) {
181
+ return ((v < min) ? (min) : (v > max) ? (max) : v);
182
+ }
183
+
184
+ void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) {
185
+ GGML_ASSERT(tensor->n_dims == 1);
186
+ GGML_ASSERT(tensor->ne[0] == ne0);
187
+ }
188
+
189
+ void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) {
190
+ GGML_ASSERT(tensor->n_dims == 2);
191
+ GGML_ASSERT(tensor->ne[0] == ne0);
192
+ GGML_ASSERT(tensor->ne[1] == ne1);
193
+ }
194
+
195
+ void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) {
196
+ GGML_ASSERT(tensor->n_dims == 3);
197
+ GGML_ASSERT(tensor->ne[0] == ne0);
198
+ GGML_ASSERT(tensor->ne[1] == ne1);
199
+ GGML_ASSERT(tensor->ne[2] == ne2);
200
+ }
201
+
202
+ void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) {
203
+ GGML_ASSERT(tensor->n_dims == 4);
204
+ GGML_ASSERT(tensor->ne[0] == ne0);
205
+ GGML_ASSERT(tensor->ne[1] == ne1);
206
+ GGML_ASSERT(tensor->ne[2] == ne2);
207
+ GGML_ASSERT(tensor->ne[3] == ne3);
208
+ }
209
+
210
+ int64_t get_example_targets_batch(
211
+ struct llama_context * lctx,
212
+ struct ggml_tensor * tokens_input,
213
+ struct ggml_tensor * target_probs,
214
+ int64_t example_id,
215
+ const size_t * samples_offs,
216
+ const size_t * samples_begin,
217
+ const size_t * samples_size,
218
+ size_t samples_count,
219
+ const llama_token * train_data,
220
+ size_t n_train_data,
221
+ bool separate_with_eos,
222
+ bool separate_with_bos,
223
+ bool fill_with_next_samples,
224
+ bool sample_random_offsets
225
+ ) {
226
+ GGML_ASSERT(samples_count > 0);
227
+ GGML_ASSERT(tokens_input->n_dims == 2);
228
+ GGML_ASSERT(target_probs->n_dims == 3);
229
+ int64_t n_vocab = target_probs->ne[0];
230
+ int64_t n_tokens = tokens_input->ne[0];
231
+ int64_t n_batch = tokens_input->ne[1];
232
+ GGML_ASSERT(n_vocab == target_probs->ne[0]);
233
+ GGML_ASSERT(n_tokens == target_probs->ne[1]);
234
+ GGML_ASSERT(n_batch == target_probs->ne[2]);
235
+
236
+ int64_t used_samples = 0;
237
+
238
+ ggml_set_f32(target_probs, 0.0f);
239
+ llama_token bos = llama_token_bos(lctx);
240
+ llama_token eos = llama_token_eos(lctx);
241
+ // printf("%s: example_id=%d n_batch=%d n_train_samples=%zu\n", __func__, example_id, n_batch, n_train_samples);
242
+ for (int k=0; k<n_batch; ++k) {
243
+ // printf("%s: batch %d\n", __func__, k);
244
+ size_t sample_idx = (example_id + used_samples) % samples_count;
245
+ size_t sample_offs = sample_random_offsets ? samples_offs[sample_idx] : 0;
246
+ size_t sample_begin = samples_begin[sample_idx];
247
+ size_t sample_size = samples_size[sample_idx];
248
+ ++used_samples;
249
+
250
+ // printf("%s: sample_idx=%zu sample=%zu\n", __func__, sample_idx, sample);
251
+ GGML_ASSERT(sample_begin+sample_size-1 < n_train_data);
252
+
253
+ ggml_set_i32_nd(tokens_input, 0, k, 0, 0, bos);
254
+ bool sample_separation_eos = !separate_with_eos;
255
+ bool sample_separation_bos = !separate_with_bos;
256
+ for (int64_t i=0; i<n_tokens; ++i) {
257
+ llama_token token = eos;
258
+ if (sample_offs >= sample_size && fill_with_next_samples) {
259
+ if (!sample_separation_eos) {
260
+ // insert eos token to separate samples
261
+ sample_separation_eos = true;
262
+ } else if (!sample_separation_bos) {
263
+ // insert bos token to separate samples
264
+ sample_separation_bos = true;
265
+ token = bos;
266
+ } else {
267
+ // sample separation is done, continue with next sample
268
+ sample_separation_eos = !separate_with_eos;
269
+ sample_separation_bos = !separate_with_bos;
270
+ sample_offs = 0;
271
+ sample_idx = (example_id + used_samples) % samples_count;
272
+ sample_begin = samples_begin[sample_idx];
273
+ sample_size = samples_size[sample_idx];
274
+ ++used_samples;
275
+ }
276
+ }
277
+ // note: no else-if here
278
+ if (sample_offs < sample_size) {
279
+ token = clamp(train_data[sample_begin+sample_offs], 0, (llama_token) (n_vocab - 1));
280
+ ++sample_offs;
281
+ }
282
+ ggml_set_f32_nd(target_probs, token, (int) i, (int) k, 0, +1.0f);
283
+ if (i+1<n_tokens) {
284
+ ggml_set_i32_nd(tokens_input, (int) (i + 1), (int) k, 0, 0, token);
285
+ }
286
+ }
287
+ }
288
+
289
+ return used_samples;
290
+ }
291
+
292
+ void mt19937_set_state(std::mt19937& rng, const std::string& rng_state) {
293
+ std::stringstream s_rng_state;
294
+ s_rng_state.imbue(std::locale::classic());
295
+ s_rng_state.exceptions(std::stringstream::failbit);
296
+ s_rng_state.str(rng_state);
297
+ s_rng_state >> rng;
298
+ }
299
+
300
+ std::string mt19937_get_state(const std::mt19937& rng) {
301
+ std::stringstream s_rng_state;
302
+ s_rng_state.imbue(std::locale::classic());
303
+ s_rng_state << rng;
304
+ return s_rng_state.str();
305
+ }
306
+
307
+ std::string mt19937_seed_to_state(unsigned seed) {
308
+ std::mt19937 rng(seed);
309
+ return mt19937_get_state(rng);
310
+ }
311
+
312
+ std::string shuffle_samples(
313
+ const std::string & rng_state,
314
+ size_t * shuffled_offs,
315
+ size_t * shuffled_begins,
316
+ size_t * shuffled_sizes,
317
+ const size_t * begins,
318
+ const size_t * sizes,
319
+ size_t count) {
320
+ if (count == 0) return rng_state;
321
+
322
+ std::mt19937 rng;
323
+ mt19937_set_state(rng, rng_state);
324
+
325
+ // sort indices by random value for each index
326
+ std::vector<size_t> idcs;
327
+ {
328
+ std::vector<unsigned> rnd;
329
+ idcs.resize(count);
330
+ rnd.resize(count);
331
+ for (unsigned i=0; i<count; ++i) {
332
+ idcs[i] = i;
333
+ rnd[i] = rng();
334
+ }
335
+
336
+ std::sort(idcs.begin(), idcs.end(), [&rnd](size_t a, size_t b){
337
+ // stable sort for reproducibility
338
+ return (rnd[a] == rnd[b]) ? (a < b) : (rnd[a] < rnd[b]);
339
+ });
340
+ }
341
+
342
+ // create random offsets
343
+ for (unsigned i=0; i<count; ++i) {
344
+ shuffled_offs[i] = (size_t) ((sizes[idcs[i]] - 1) * ((double) rng() / (double) (rng.max()-1)));
345
+ }
346
+
347
+ // reorder begins and sizes by sorted indices
348
+ for (unsigned i=0; i<count; ++i) {
349
+ shuffled_begins[i] = begins[idcs[i]];
350
+ }
351
+
352
+ for (unsigned i=0; i<count; ++i) {
353
+ shuffled_sizes[i] = sizes[idcs[i]];
354
+ }
355
+
356
+ return mt19937_get_state(rng);
357
+ }
358
+
359
+ size_t hash_combine(size_t h1, size_t h2) {
360
+ return h1 ^ (h2 << 1);
361
+ }
362
+
363
+ size_t compute_samples_hash(const char* fn, const size_t* samples_begin, const size_t* samples_size, size_t sample_count) {
364
+ std::hash<std::string> h_string;
365
+ std::hash<unsigned long long> h_ull;
366
+ size_t h = h_string(std::string(fn));
367
+ h = hash_combine(h, h_ull((unsigned long long) sample_count));
368
+ for (size_t i=0; i< sample_count; ++i) {
369
+ h = hash_combine(h, h_ull((unsigned long long) samples_begin[i]));
370
+ h = hash_combine(h, h_ull((unsigned long long) samples_size[i]));
371
+ }
372
+ return h;
373
+ }
374
+
375
+ std::string replace_str(const char * s, const char * needle, const char * replacement) {
376
+ std::string str = s;
377
+ size_t pos = str.find(needle);
378
+ if (pos != std::string::npos) {
379
+ str.replace(pos, strlen(needle), replacement);
380
+ }
381
+ return str;
382
+ }
383
+
384
+ void print_duration(double fmillis) {
385
+ if (fmillis < 1000.0f) {
386
+ printf("%.1fms", (float) fmillis);
387
+ return;
388
+ }
389
+ const int64_t one_sec = 1000;
390
+ const int64_t one_min = one_sec * 60;
391
+ const int64_t one_hour = one_min * 60;
392
+ const int64_t one_day = one_hour * 24;
393
+
394
+ int64_t millis = (int64_t) fmillis;
395
+ int64_t days = millis/one_day;
396
+ int64_t hours = (millis - days*one_day)/one_hour;
397
+ int64_t minutes = (millis - days*one_day - hours*one_hour)/one_min;
398
+ int64_t seconds = (millis - days*one_day - hours*one_hour - minutes*one_min)/one_sec;
399
+
400
+ // to print int64_t either cast to (long long int) or use macro PRId64 from <inttypes.h>
401
+ if (days > 0) {
402
+ printf("%lldd ", (long long int) days);
403
+ }
404
+ printf("%02lld:%02lld:%02lld", (long long int) hours, (long long int) minutes, (long long int) seconds);
405
+ }
406
+
407
+ float cosine_decay(int64_t step, int64_t decay_steps, float minimum) {
408
+ if (step > decay_steps) {
409
+ step = decay_steps;
410
+ }
411
+ const float cosine_decay = 0.50f*(1.0f + cosf(3.14159265359f*step/decay_steps));
412
+ const float decay = (1 - minimum)*cosine_decay + minimum;
413
+ return decay;
414
+ }
415
+
416
+ float cosine_decay_restart(int64_t step, int64_t decay_steps, float minimum, float restart_step_mult) {
417
+ while (step > decay_steps) {
418
+ step -= decay_steps;
419
+ decay_steps = (int64_t) (restart_step_mult * decay_steps);
420
+ }
421
+ return cosine_decay(step, decay_steps, minimum);
422
+ }
423
+
424
+ float learning_schedule(
425
+ int64_t step,
426
+ int64_t warmup_steps,
427
+ int64_t cos_decay_steps,
428
+ float learning_rate,
429
+ float overall_minimum,
430
+ float cos_decay_minimum,
431
+ float cos_decay_restart_step_mult,
432
+ bool enable_restart) {
433
+
434
+ float result =
435
+ (step < warmup_steps)
436
+ ? (float) step / (float) warmup_steps
437
+ : enable_restart
438
+ ? cosine_decay_restart(
439
+ step - warmup_steps,
440
+ cos_decay_steps,
441
+ cos_decay_minimum,
442
+ cos_decay_restart_step_mult)
443
+ : cosine_decay(
444
+ step,
445
+ cos_decay_steps,
446
+ cos_decay_minimum);
447
+
448
+ float min = overall_minimum / learning_rate;
449
+ result = min + result * (1.0f - min);
450
+ return result;
451
+ }
452
+
453
+ static bool are_same_layout(struct ggml_tensor * a, struct ggml_tensor * b) {
454
+ GGML_ASSERT(a != NULL);
455
+ GGML_ASSERT(b != NULL);
456
+ GGML_ASSERT(a->type == b->type);
457
+ GGML_ASSERT(ggml_are_same_shape(a, b));
458
+ GGML_ASSERT(ggml_is_contiguous(a) && ggml_is_contiguous(b));
459
+
460
+ return true;
461
+ }
462
+
463
+ void copy_tensor_by_name(struct ggml_tensor * dst, struct ggml_context * ctx, const char * name) {
464
+ if (dst == NULL) {
465
+ return;
466
+ }
467
+ struct ggml_tensor * t = ggml_get_tensor(ctx, name);
468
+ GGML_ASSERT(are_same_layout(dst, t));
469
+ memcpy(dst->data, t->data, ggml_nbytes(t));
470
+
471
+ if (strlen(ggml_get_name(dst)) == 0) {
472
+ ggml_set_name(dst, name);
473
+ }
474
+ }
475
+
476
+ // gguf constants
477
+ static const char * LLM_KV_OPTIMIZER_TYPE = "optimizer.type";
478
+ static const char * LLM_KV_OPTIMIZER_TYPE_ADAM = "adam";
479
+ static const char * LLM_KV_OPTIMIZER_TYPE_LBFGS = "lbfgs";
480
+ static const char * LLM_KV_OPTIMIZER_FILE_VERSION = "optimizer.file_version";
481
+ static const char * LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT = "optimizer.convergence_past_count";
482
+ static const char * LLM_KV_OPTIMIZER_PARAMETER_COUNT = "optimizer.parameter_count";
483
+ static const char * LLM_KV_OPTIMIZER_ITERATION_COUNT = "optimizer.iteration_count";
484
+ static const char * LLM_KV_OPTIMIZER_JUST_INITIALIZED = "optimizer.just_initialized";
485
+ static const char * LLM_KV_OPTIMIZER_ADAM_BEST_LOSS = "optimizer.adam.best_loss";
486
+ static const char * LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS = "optimizer.adam.previous_loss";
487
+ static const char * LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT = "optimizer.adam.no_improvement_count";
488
+ static const char * LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT = "optimizer.lbfgs.approx_hessian_count";
489
+ static const char * LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS = "optimizer.lbfgs.best_loss";
490
+ static const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP = "optimizer.lbfgs.line_search_step";
491
+ static const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J = "optimizer.lbfgs.line_search_j";
492
+ static const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K = "optimizer.lbfgs.line_search_k";
493
+ static const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END = "optimizer.lbfgs.line_search_end";
494
+ static const char * LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT = "optimizer.lbfgs.no_improvement_count";
495
+
496
+ static const char * LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS = "optimizer.adam.first_moments";
497
+ static const char * LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS = "optimizer.adam.second_moments";
498
+ static const char * LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES = "optimizer.adam.past_loss_values";
499
+
500
+ static const char * LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS = "optimizer.lbfgs.current_parameters";
501
+ static const char * LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS = "optimizer.lbfgs.previous_parameters";
502
+ static const char * LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS = "optimizer.lbfgs.current_gradients";
503
+ static const char * LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS = "optimizer.lbfgs.previous_gradients";
504
+ static const char * LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION = "optimizer.lbfgs.search_direction";
505
+ static const char * LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES = "optimizer.lbfgs.past_loss_values";
506
+ static const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA = "optimizer.lbfgs.memory_alpha";
507
+ static const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS = "optimizer.lbfgs.memory_ys";
508
+ static const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S = "optimizer.lbfgs.memory_s";
509
+ static const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y = "optimizer.lbfgs.memory_y";
510
+
511
+ static const char * LLM_KV_TRAINING_FILE_VERSION = "training.file_version";
512
+ static const char * LLM_KV_TRAINING_ITERATION_COUNT = "training.iteration_count";
513
+ static const char * LLM_KV_TRAINING_SAMPLE_COUNT = "training.sample_count";
514
+ static const char * LLM_KV_TRAINING_TOKEN_COUNT = "training.token_count";
515
+ static const char * LLM_KV_TRAINING_EPOCH_COUNT = "training.epoch_count";
516
+ static const char * LLM_KV_TRAINING_SHUFFLE_SAMPLES_HASH = "training.shuffle.samples_hash";
517
+ static const char * LLM_KV_TRAINING_SHUFFLE_RNG_STATE = "training.shuffle.rng_state";
518
+ static const char * LLM_KV_TRAINING_SHUFFLE_SAMPLE_COUNT = "training.shuffle.sample_count";
519
+ static const char * LLM_KV_TRAINING_SHUFFLE_NEXT_SAMPLE = "training.shuffle.next_sample";
520
+
521
+ #define GGUF_GET_KEY(ctx, dst, func, type, req, key) \
522
+ { \
523
+ const std::string skey(key); \
524
+ const int kid = gguf_find_key(ctx, skey.c_str()); \
525
+ if (kid >= 0) { \
526
+ enum gguf_type ktype = gguf_get_kv_type(ctx, kid); \
527
+ if (ktype != (type)) { \
528
+ die_fmt("key %s has wrong type: %s", skey.c_str(), gguf_type_name(ktype)); \
529
+ } \
530
+ (dst) = func(ctx, kid); \
531
+ } else if (req) { \
532
+ die_fmt("key not found in model: %s", skey.c_str()); \
533
+ } \
534
+ }
535
+
536
+ void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct ggml_opt_context * opt) {
537
+ // NOTE: gguf_context must be initialized with f_ggml_ctx and no_alloc=false, otherwise tensor data can not be read
538
+
539
+ uint32_t file_version;
540
+ GGUF_GET_KEY(fctx, file_version, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_FILE_VERSION);
541
+ GGML_ASSERT(file_version == 0);
542
+
543
+ GGUF_GET_KEY(fctx, opt->params.past, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT);
544
+ GGUF_GET_KEY(fctx, opt->iter, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_ITERATION_COUNT);
545
+ GGUF_GET_KEY(fctx, opt->just_initialized, gguf_get_val_bool, GGUF_TYPE_BOOL, true, LLM_KV_OPTIMIZER_JUST_INITIALIZED);
546
+
547
+ uint64_t nx;
548
+ GGUF_GET_KEY(fctx, nx, gguf_get_val_u64, GGUF_TYPE_UINT64, true, LLM_KV_OPTIMIZER_PARAMETER_COUNT);
549
+ opt->nx = (size_t) nx;
550
+
551
+ // don't call ggml_opt_init until optimizer type and optimizer specific parameters are known
552
+
553
+ std::string opt_type;
554
+ GGUF_GET_KEY(fctx, opt_type, gguf_get_val_str, GGUF_TYPE_STRING, true, LLM_KV_OPTIMIZER_TYPE);
555
+ if (opt_type == LLM_KV_OPTIMIZER_TYPE_ADAM) {
556
+ opt->params.type = GGML_OPT_ADAM;
557
+
558
+ GGUF_GET_KEY(fctx, opt->adam.fx_best, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_ADAM_BEST_LOSS);
559
+ GGUF_GET_KEY(fctx, opt->adam.fx_prev, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS);
560
+ GGUF_GET_KEY(fctx, opt->adam.n_no_improvement, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT);
561
+
562
+ ggml_opt_init(opt->ctx, opt, opt->params, opt->nx);
563
+
564
+ copy_tensor_by_name(opt->adam.m, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS);
565
+ copy_tensor_by_name(opt->adam.v, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS);
566
+ copy_tensor_by_name(opt->adam.pf, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES);
567
+ } else if (opt_type == LLM_KV_OPTIMIZER_TYPE_LBFGS) {
568
+ opt->params.type = GGML_OPT_LBFGS;
569
+
570
+ GGUF_GET_KEY(fctx, opt->params.lbfgs.m, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT);
571
+ GGUF_GET_KEY(fctx, opt->lbfgs.fx_best, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS);
572
+ GGUF_GET_KEY(fctx, opt->lbfgs.step, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP);
573
+ GGUF_GET_KEY(fctx, opt->lbfgs.j, gguf_get_val_i32, GGUF_TYPE_INT32, true, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J);
574
+ GGUF_GET_KEY(fctx, opt->lbfgs.k, gguf_get_val_i32, GGUF_TYPE_INT32, true, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K);
575
+ GGUF_GET_KEY(fctx, opt->lbfgs.end, gguf_get_val_i32, GGUF_TYPE_INT32, true, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END);
576
+ GGUF_GET_KEY(fctx, opt->lbfgs.n_no_improvement, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT);
577
+
578
+ ggml_opt_init(opt->ctx, opt, opt->params, opt->nx);
579
+
580
+ copy_tensor_by_name(opt->lbfgs.x, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS);
581
+ copy_tensor_by_name(opt->lbfgs.xp, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS);
582
+ copy_tensor_by_name(opt->lbfgs.g, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS);
583
+ copy_tensor_by_name(opt->lbfgs.gp, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS);
584
+ copy_tensor_by_name(opt->lbfgs.d, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION);
585
+ copy_tensor_by_name(opt->lbfgs.pf, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES);
586
+ copy_tensor_by_name(opt->lbfgs.lmal, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA);
587
+ copy_tensor_by_name(opt->lbfgs.lmys, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS);
588
+ copy_tensor_by_name(opt->lbfgs.lms, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S);
589
+ copy_tensor_by_name(opt->lbfgs.lmy, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y);
590
+ } else {
591
+ die("unknown optimizer type\n");
592
+ }
593
+ }
594
+
595
+ void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context * opt) {
596
+ gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_FILE_VERSION, 0);
597
+ gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT, opt->params.past);
598
+ gguf_set_val_u64(fctx, LLM_KV_OPTIMIZER_PARAMETER_COUNT, (uint64_t) opt->nx);
599
+ gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_ITERATION_COUNT, opt->iter);
600
+ gguf_set_val_bool(fctx, LLM_KV_OPTIMIZER_JUST_INITIALIZED, opt->just_initialized);
601
+
602
+ switch (opt->params.type) {
603
+ case GGML_OPT_ADAM:
604
+ {
605
+ gguf_set_val_str(fctx, LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_ADAM);
606
+ gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_ADAM_BEST_LOSS, opt->adam.fx_best);
607
+ gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS, opt->adam.fx_prev);
608
+ gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT, opt->adam.n_no_improvement);
609
+
610
+ ggml_set_name(opt->adam.m, LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS);
611
+ ggml_set_name(opt->adam.v, LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS);
612
+ if (opt->adam.pf) {
613
+ ggml_set_name(opt->adam.pf, LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES);
614
+ }
615
+
616
+ gguf_add_tensor(fctx, opt->adam.m);
617
+ gguf_add_tensor(fctx, opt->adam.v);
618
+ if (opt->adam.pf) {
619
+ gguf_add_tensor(fctx, opt->adam.pf);
620
+ }
621
+ } break;
622
+ case GGML_OPT_LBFGS:
623
+ {
624
+ gguf_set_val_str(fctx, LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_LBFGS);
625
+ gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT, opt->params.lbfgs.m);
626
+ gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS, opt->lbfgs.fx_best);
627
+ gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP, opt->lbfgs.step);
628
+ gguf_set_val_i32(fctx, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J, opt->lbfgs.j);
629
+ gguf_set_val_i32(fctx, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K, opt->lbfgs.k);
630
+ gguf_set_val_i32(fctx, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END, opt->lbfgs.end);
631
+ gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT, opt->lbfgs.n_no_improvement);
632
+
633
+ ggml_set_name(opt->lbfgs.x, LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS);
634
+ ggml_set_name(opt->lbfgs.xp, LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS);
635
+ ggml_set_name(opt->lbfgs.g, LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS);
636
+ ggml_set_name(opt->lbfgs.gp, LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS);
637
+ ggml_set_name(opt->lbfgs.d, LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION);
638
+ if (opt->lbfgs.pf) {
639
+ ggml_set_name(opt->lbfgs.pf, LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES);
640
+ }
641
+ ggml_set_name(opt->lbfgs.lmal, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA);
642
+ ggml_set_name(opt->lbfgs.lmys, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS);
643
+ ggml_set_name(opt->lbfgs.lms, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S);
644
+ ggml_set_name(opt->lbfgs.lmy, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y);
645
+
646
+ gguf_add_tensor(fctx, opt->lbfgs.x);
647
+ gguf_add_tensor(fctx, opt->lbfgs.xp);
648
+ gguf_add_tensor(fctx, opt->lbfgs.g);
649
+ gguf_add_tensor(fctx, opt->lbfgs.gp);
650
+ gguf_add_tensor(fctx, opt->lbfgs.d);
651
+ if (opt->lbfgs.pf) {
652
+ gguf_add_tensor(fctx, opt->lbfgs.pf);
653
+ }
654
+ gguf_add_tensor(fctx, opt->lbfgs.lmal);
655
+ gguf_add_tensor(fctx, opt->lbfgs.lmys);
656
+ gguf_add_tensor(fctx, opt->lbfgs.lms);
657
+ gguf_add_tensor(fctx, opt->lbfgs.lmy);
658
+ } break;
659
+ }
660
+ }
661
+
662
+ bool load_train_state_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct train_state * train) {
663
+ if (gguf_find_key(fctx, LLM_KV_TRAINING_FILE_VERSION) < 0) {
664
+ return false;
665
+ }
666
+
667
+ uint32_t file_version;
668
+ GGUF_GET_KEY(fctx, file_version, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_FILE_VERSION);
669
+ GGML_ASSERT(file_version <= 1);
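+ // note on versions (inferred from the fields read below): version 0 stored the iteration/sample/token
+ // counters as 32-bit values; version 1 widens them to 64-bit and adds the epoch counter and
+ // the sample-shuffling state.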
670
+
671
+ if (file_version == 0) {
672
+
673
+ GGUF_GET_KEY(fctx, train->train_its, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_ITERATION_COUNT);
674
+ GGUF_GET_KEY(fctx, train->train_samples, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_SAMPLE_COUNT);
675
+ GGUF_GET_KEY(fctx, train->train_tokens, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_TOKEN_COUNT);
676
+
677
+ } else if (file_version == 1) {
678
+
679
+ GGUF_GET_KEY(fctx, train->train_its, gguf_get_val_u64, GGUF_TYPE_UINT64, true, LLM_KV_TRAINING_ITERATION_COUNT);
680
+ GGUF_GET_KEY(fctx, train->train_samples, gguf_get_val_u64, GGUF_TYPE_UINT64, true, LLM_KV_TRAINING_SAMPLE_COUNT);
681
+ GGUF_GET_KEY(fctx, train->train_tokens, gguf_get_val_u64, GGUF_TYPE_UINT64, true, LLM_KV_TRAINING_TOKEN_COUNT);
682
+ GGUF_GET_KEY(fctx, train->train_epochs, gguf_get_val_u64, GGUF_TYPE_UINT64, true, LLM_KV_TRAINING_EPOCH_COUNT);
683
+
684
+ GGUF_GET_KEY(fctx, train->shuffle_samples_hash, gguf_get_val_u64, GGUF_TYPE_UINT64, false, LLM_KV_TRAINING_SHUFFLE_SAMPLES_HASH);
685
+ GGUF_GET_KEY(fctx, train->shuffle_rng_state_current, gguf_get_val_str, GGUF_TYPE_STRING, false, LLM_KV_TRAINING_SHUFFLE_RNG_STATE);
686
+ GGUF_GET_KEY(fctx, train->shuffle_sample_count, gguf_get_val_u64, GGUF_TYPE_UINT64, false, LLM_KV_TRAINING_SHUFFLE_SAMPLE_COUNT);
687
+ GGUF_GET_KEY(fctx, train->shuffle_next_sample, gguf_get_val_u64, GGUF_TYPE_UINT64, false, LLM_KV_TRAINING_SHUFFLE_NEXT_SAMPLE);
688
+ }
689
+
690
+ load_opt_context_gguf(fctx, f_ggml_ctx, train->opt);
691
+ return true;
692
+ }
693
+
694
+ void save_train_state_gguf(struct gguf_context * fctx, struct train_state * train) {
695
+ gguf_set_val_u32(fctx, LLM_KV_TRAINING_FILE_VERSION, 1);
696
+ gguf_set_val_u64(fctx, LLM_KV_TRAINING_ITERATION_COUNT, train->train_its);
697
+ gguf_set_val_u64(fctx, LLM_KV_TRAINING_SAMPLE_COUNT, train->train_samples);
698
+ gguf_set_val_u64(fctx, LLM_KV_TRAINING_TOKEN_COUNT, train->train_tokens);
699
+ gguf_set_val_u64(fctx, LLM_KV_TRAINING_EPOCH_COUNT, train->train_epochs);
700
+
701
+ gguf_set_val_u64(fctx, LLM_KV_TRAINING_SHUFFLE_SAMPLES_HASH, (uint64_t) train->shuffle_samples_hash);
702
+ gguf_set_val_str(fctx, LLM_KV_TRAINING_SHUFFLE_RNG_STATE, train->shuffle_rng_state_current.c_str());
703
+ gguf_set_val_u64(fctx, LLM_KV_TRAINING_SHUFFLE_SAMPLE_COUNT, (uint64_t) train->shuffle_sample_count);
704
+ gguf_set_val_u64(fctx, LLM_KV_TRAINING_SHUFFLE_NEXT_SAMPLE, (uint64_t) train->shuffle_next_sample);
705
+
706
+ save_opt_context_gguf(fctx, train->opt);
707
+ }
708
+
709
+
710
+ struct llama_file {
711
+ // use FILE * so we don't have to re-open the file to mmap
712
+ FILE * fp;
713
+ size_t size;
714
+
715
+ llama_file(const char * fname, const char * mode) {
716
+ fp = std::fopen(fname, mode);
717
+ if (fp == NULL) {
718
+ size = 0;
719
+ } else {
720
+ seek(0, SEEK_END);
721
+ size = tell();
722
+ seek(0, SEEK_SET);
723
+ }
724
+ }
725
+
726
+ size_t tell() const {
727
+ #ifdef _WIN32
728
+ __int64 ret = _ftelli64(fp);
729
+ #else
730
+ long ret = std::ftell(fp);
731
+ #endif
732
+ GGML_ASSERT(ret != -1); // this really shouldn't fail
733
+ return (size_t) ret;
734
+ }
735
+
736
+ void seek(size_t offset, int whence) {
737
+ #ifdef _WIN32
738
+ int ret = _fseeki64(fp, (__int64) offset, whence);
739
+ #else
740
+ int ret = std::fseek(fp, (long) offset, whence);
741
+ #endif
742
+ GGML_ASSERT(ret == 0); // same
743
+ }
744
+
745
+ void read_raw(void * ptr, size_t size) {
746
+ if (size == 0) {
747
+ return;
748
+ }
749
+ errno = 0;
750
+ std::size_t ret = std::fread(ptr, size, 1, fp);
751
+ if (ferror(fp)) {
752
+ die_fmt("read error: %s", strerror(errno));
753
+ }
754
+ if (ret != 1) {
755
+ die("unexpectedly reached end of file");
756
+ }
757
+ }
758
+
759
+ std::uint32_t read_u32() {
760
+ std::uint32_t ret;
761
+ read_raw(&ret, sizeof(ret));
762
+ return ret;
763
+ }
764
+
765
+ std::string read_string(std::uint32_t len) {
766
+ std::vector<char> chars(len);
767
+ read_raw(chars.data(), len);
768
+ return std::string(chars.data(), len);
769
+ }
770
+
771
+ void write_raw(const void * ptr, size_t size) {
772
+ if (size == 0) {
773
+ return;
774
+ }
775
+ errno = 0;
776
+ size_t ret = std::fwrite(ptr, size, 1, fp);
777
+ if (ret != 1) {
778
+ die_fmt("write error: %s", strerror(errno));
779
+ }
780
+ }
781
+
782
+ void write_u32(std::uint32_t val) {
783
+ write_raw(&val, sizeof(val));
784
+ }
785
+
786
+ ~llama_file() {
787
+ if (fp) {
788
+ std::fclose(fp);
789
+ }
790
+ }
791
+ };
792
+
793
+ static size_t utf8_len(char src) {
794
+ const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
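+ // indexed by the high nibble of the byte: 0xxx -> 1 byte, 110x -> 2, 1110 -> 3, 1111 -> 4;
+ // continuation bytes (10xx) also map to 1, so scanning always makes forward progress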
795
+ uint8_t highbits = static_cast<uint8_t>(src) >> 4;
796
+ return lookup[highbits];
797
+ }
798
+
799
+ // mark each byte with its utf8 unit number.
800
+ // returns the number of utf8 characters.
801
+ // e.g. when bytes == '\x61\xD0\xB0\x62',
802
+ // then utf8_units will become [0,0,1,0]
803
+ // utf8_nunits will become [1,2,2,1] and 3 is returned.
804
+ // bytes where utf8_units is zero mark the beginning of a utf8 character.
805
+ static size_t mark_utf8_units(const char* bytes, int * utf8_units, int * utf8_nunits, size_t count) {
806
+ size_t offs = 0;
807
+ size_t count_utf8 = 0;
808
+ while(offs < count) {
809
+ int len = (int) utf8_len(bytes[offs]);
810
+ for (int i=0; i<len; ++i) {
811
+ utf8_units[offs+i] = i;
812
+ utf8_nunits[offs+i] = len;
813
+ }
814
+ offs += len;
815
+ ++count_utf8;
816
+ }
817
+ return count_utf8;
818
+ }
819
+
820
+ size_t tokenize_file(
821
+ struct llama_context * lctx,
822
+ const char * filename,
823
+ const std::string & sample_start,
824
+ bool include_sample_start,
825
+ bool overlapping_samples,
826
+ unsigned context_length,
827
+ std::vector<llama_token> & out_tokens,
828
+ std::vector<size_t> & out_samples_begin,
829
+ std::vector<size_t> & out_samples_size) {
830
+ struct llama_file f(filename, "rb");
831
+
832
+ if (f.size == 0) {
833
+ out_tokens.clear();
834
+ out_samples_begin.clear();
835
+ out_samples_size.clear();
836
+ printf("%s: warning: empty or non-existent training data file '%s'\n",
837
+ __func__, filename);
838
+ return out_tokens.size();
839
+ }
840
+
841
+ // account for possible leading whitespace that will be added by tokenizer
842
+ // e.g. '\t' will be tokenized by llama spm tokenizer to [29871, 12]
843
+ const int n_max_tokens_overhead = 1;
844
+
845
+ std::vector<char> buf;
846
+ buf.resize(f.size);
847
+
848
+ f.read_raw(buf.data(), f.size);
849
+
850
+ std::vector<int> utf8_units;
851
+ std::vector<int> utf8_nunits;
852
+ utf8_units.resize(buf.size());
853
+ utf8_nunits.resize(buf.size());
854
+ mark_utf8_units(buf.data(), utf8_units.data(), utf8_nunits.data(), buf.size());
855
+
856
+ if (sample_start.size() == 0) {
857
+ // tokenize all data at once
858
+ out_tokens.resize(buf.size() + n_max_tokens_overhead);
859
+
860
+ int n_tokens = llama_tokenize(
861
+ llama_get_model(lctx),
862
+ buf.data(),
863
+ (int) buf.size(),
864
+ out_tokens.data(),
865
+ (int) out_tokens.size(),
866
+ false);
867
+ if (n_tokens < 0) {
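+ // llama_tokenize returns the negated number of tokens required when the output
+ // buffer is too small; grow the buffer to that size and tokenize again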
868
+ out_tokens.resize(-n_tokens);
869
+ n_tokens = llama_tokenize(
870
+ llama_get_model(lctx),
871
+ buf.data(),
872
+ (int) buf.size(),
873
+ out_tokens.data(),
874
+ (int) out_tokens.size(),
875
+ false);
876
+ }
877
+ if (n_tokens >= 0) {
878
+ out_tokens.resize(n_tokens);
879
+ }
880
+
881
+ // generate sample starts at all token positions
882
+ out_samples_begin.clear();
883
+ out_samples_begin.push_back(0);
884
+ out_samples_size.push_back(std::min((size_t) context_length, out_tokens.size()));
885
+ size_t end = (out_tokens.size() >= context_length) ? (out_tokens.size() - context_length) : 0;
886
+ for (size_t sample_begin = 1; sample_begin < end; ++sample_begin) {
887
+ out_samples_begin.push_back(sample_begin);
888
+ out_samples_size.push_back(context_length);
889
+ }
890
+ } else {
891
+ // split data into samples and tokenize each sample
892
+ std::string data_str(buf.data(), buf.size());
893
+ out_samples_begin.clear();
894
+ out_samples_size.clear();
895
+ out_tokens.clear();
896
+
897
+ // find all positions of pattern sample_start
898
+ size_t sample_begin = data_str.find(sample_start, 0);
899
+ while (sample_begin != std::string::npos) {
900
+ out_samples_begin.push_back(sample_begin);
901
+ const size_t search_start = sample_begin + sample_start.size();
902
+ sample_begin = data_str.find(sample_start, search_start);
903
+ }
904
+ if (out_samples_begin.size() == 0) {
905
+ printf("%s: warning: sample start pattern '%s' not found. inserting a single sample at the beginning of the data\n",
906
+ __func__, sample_start.c_str());
907
+ out_samples_begin.push_back(0);
908
+ }
909
+
910
+ out_samples_size.resize(out_samples_begin.size(), 0);
911
+
912
+ std::vector<char> buf_sample;
913
+ std::vector<llama_token> tok_sample;
914
+
915
+ const size_t sample_begin_offset = (include_sample_start ? 0 : sample_start.size());
916
+ size_t found_too_big_sample = 0;
917
+ size_t found_too_small_sample = 0;
918
+ size_t found_empty_sample = 0;
919
+ size_t found_min_sample_size = SIZE_MAX;
920
+ size_t found_max_sample_size = 0;
921
+
922
+ size_t max_token_text_size = 0;
923
+ int n_vocab = llama_n_vocab(llama_get_model(lctx));
924
+ for (llama_token token=0; token < n_vocab; ++token) {
925
+ max_token_text_size = std::max(
926
+ max_token_text_size,
927
+ strlen(llama_token_get_text(lctx, token)));
928
+ }
929
+
930
+ // upper bound of context byte length.
931
+ // strings with this byte length should always tokenize to at least context_length tokens.
932
+ size_t context_byte_len = max_token_text_size*context_length;
933
+
934
+ for (unsigned i=0; i<out_samples_begin.size(); ++i) {
935
+ // determine sample begin and end from pattern positions
936
+ size_t sample_begin = out_samples_begin[i] + sample_begin_offset;
937
+ size_t sample_end = overlapping_samples
938
+ ? std::min(
939
+ data_str.size(),
940
+ sample_begin + context_byte_len)
941
+ : (i+1 < out_samples_begin.size()
942
+ ? out_samples_begin[i+1]
943
+ : data_str.size());
944
+ if (sample_end < utf8_units.size() && utf8_units[sample_end] > 0) {
945
+ // sample end is in the middle of an utf8 character.
946
+ // advance sample_end to the beginning of the next utf8 character.
947
+ sample_end += utf8_nunits[sample_end] - utf8_units[sample_end];
948
+ }
949
+ size_t sample_size = sample_end - sample_begin;
950
+ if (sample_size == 0) {
951
+ ++found_empty_sample;
952
+ }
953
+
954
+ if (sample_size > 0) {
955
+ // llama_tokenize expects zero terminated string,
956
+ // copy sample into buffer and zero terminate it.
957
+ buf_sample.resize(sample_size);
958
+ memcpy(buf_sample.data(), data_str.data() + sample_begin, sample_size);
959
+
960
+ // printf("sample: '%s'\n", buf_sample.data());
961
+
962
+ // tokenize the sample
963
+ tok_sample.resize(buf_sample.size() + n_max_tokens_overhead);
964
+ int n_tokens = llama_tokenize(llama_get_model(lctx),
965
+ buf_sample.data(),
966
+ (int) buf_sample.size(),
967
+ tok_sample.data(),
968
+ (int) tok_sample.size(),
969
+ false);
970
+ if (n_tokens < 0) {
971
+ tok_sample.resize(-n_tokens);
972
+ n_tokens = llama_tokenize(llama_get_model(lctx),
973
+ buf_sample.data(),
974
+ (int) buf_sample.size(),
975
+ tok_sample.data(),
976
+ (int) tok_sample.size(),
977
+ false);
978
+ GGML_ASSERT(n_tokens >= 0);
979
+ }
980
+ GGML_ASSERT(n_tokens <= (int) tok_sample.size());
981
+
982
+ if ((size_t) n_tokens > context_length) {
983
+ ++found_too_big_sample;
984
+ } else if ((size_t) n_tokens < context_length) {
985
+ ++found_too_small_sample;
986
+ }
987
+ found_max_sample_size = std::max(found_max_sample_size, (size_t) n_tokens);
988
+ found_min_sample_size = std::min(found_min_sample_size, (size_t) n_tokens);
989
+
990
+ // write out tokens, start and size of sample
991
+ // overwrite the string start position with the token start position
992
+ out_samples_begin[i] = out_tokens.size();
993
+ out_samples_size[i] = (size_t) n_tokens;
994
+ out_tokens.insert(out_tokens.end(), tok_sample.begin(), tok_sample.begin() + n_tokens);
995
+ } else {
996
+ out_samples_begin[i] = out_tokens.size();
997
+ out_samples_size[i] = 0;
998
+ }
999
+
1000
+ }
1001
+ if (found_too_big_sample > 0) {
1002
+ printf("%s: warning: found %zu samples (max length %zu) that exceed context length of %u. samples will be cut off.\n",
1003
+ __func__, found_too_big_sample, found_max_sample_size, context_length);
1004
+ }
1005
+
1006
+ if (found_too_small_sample > 0) {
1007
+ printf("%s: warning: found %zu samples (min length %zu) that are shorter than context length of %u.\n",
1008
+ __func__, found_too_small_sample, found_min_sample_size, context_length);
1009
+ }
1010
+
1011
+ if (found_empty_sample) {
1012
+ printf("%s: warning: found %zu empty samples.\n",
1013
+ __func__, found_empty_sample);
1014
+ }
1015
+ }
1016
+ printf("%s: total number of samples: %zu\n",
1017
+ __func__, out_samples_begin.size());
1018
+
1019
+ GGML_ASSERT(out_samples_begin.size() == out_samples_size.size());
1020
+
1021
+ return out_tokens.size();
1022
+ }
1023
+
1024
+ std::string get_train_filename(const char * filename, const char * pattern_it, const char * latest, int64_t iteration) {
1025
+ std::string sit = (iteration >= 0) ? std::to_string(iteration) : std::string(latest);
1026
+ return replace_str(filename, pattern_it, sit.c_str());
1027
+ }
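+ // usage sketch (hypothetical values), assuming replace_str substitutes the pattern in the filename:
+ // get_train_filename("checkpoint-ITERATION.gguf", "ITERATION", "LATEST", 42) -> "checkpoint-42.gguf"
+ // get_train_filename("checkpoint-ITERATION.gguf", "ITERATION", "LATEST", -1) -> "checkpoint-LATEST.gguf"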
1028
+
1029
+ struct train_params_common get_default_train_params_common() {
1030
+ struct train_params_common params;
1031
+ params.fn_train_data = "shakespeare.txt";
1032
+ params.fn_checkpoint_in = "checkpoint.gguf";
1033
+ params.fn_checkpoint_out = "checkpoint-ITERATION.gguf";
1034
+ params.pattern_fn_it = "ITERATION";
1035
+ params.fn_latest = "LATEST";
1036
+
1037
+ params.print_usage = false;
1038
+
1039
+ params.save_every = 10;
1040
+
1041
+ params.seed = -1;
1042
+
1043
+ params.n_ctx = 128;
1044
+ params.n_threads = 6;
1045
+ params.n_batch = 8;
1046
+ params.n_gradient_accumulation = 1;
1047
+ params.n_epochs = -1;
1048
+
1049
+ params.custom_n_ctx = false;
1050
+
1051
+ params.use_flash = true;
1052
+ params.use_checkpointing = true;
1053
+
1054
+ params.sample_start = "";
1055
+ params.include_sample_start = false;
1056
+ params.escape = false;
1057
+ params.overlapping_samples = false;
1058
+ params.fill_with_next_samples = false;
1059
+ params.separate_with_eos = false;
1060
+ params.separate_with_bos = true;
1061
+ params.sample_random_offsets = false;
1062
+ params.force_reshuffle = false;
1063
+
1064
+ params.opt_past = 0;
1065
+ params.opt_delta = 1e-5f;
1066
+ params.opt_max_no_improvement = 0;
1067
+
1068
+ params.warmup = 100;
1069
+ params.cos_decay_steps = 1000;
1070
+ params.cos_decay_restart = 1.1f;
1071
+ params.cos_decay_min = 0.1f;
1072
+ params.enable_restart = false;
1073
+
1074
+ params.adam_n_iter = 256;
1075
+ params.adam_alpha = 1e-3f;
1076
+ params.adam_min_alpha = 0;
1077
+ params.adam_decay = 1e-1f;
1078
+ params.adam_decay_min_ndim = 2;
1079
+ params.adam_beta1 = 0.9f;
1080
+ params.adam_beta2 = 0.999f;
1081
+ params.adam_gclip = 1.0f;
1082
+ params.adam_eps_f = 0.0f;
1083
+ return params;
1084
+ }
1085
+
1086
+ void print_common_train_usage(int /*argc*/, char ** /*argv*/, const struct train_params_common * params) {
1087
+ // fprintf(stderr, "usage: %s [options]\n", argv[0]);
1088
+ // fprintf(stderr, "\n");
1089
+ // fprintf(stderr, "options:\n");
1090
+ // fprintf(stderr, " -h, --help show this help message and exit\n");
1091
+ fprintf(stderr, " --train-data FNAME path from which to load training data (default '%s')\n", params->fn_train_data);
1092
+ fprintf(stderr, " --checkpoint-in FNAME path from which to load training checkpoint (default '%s')\n", params->fn_checkpoint_in);
1093
+ fprintf(stderr, " --checkpoint-out FNAME path to save training checkpoint (default '%s')\n", params->fn_checkpoint_out);
1094
+ fprintf(stderr, " --pattern-fn-it STR pattern in output filenames to be replaced by iteration number (default '%s')\n", params->pattern_fn_it);
1095
+ fprintf(stderr, " --fn-latest STR string to use instead of iteration number for saving latest output (default '%s')\n", params->fn_latest);
1096
+ fprintf(stderr, " --save-every N save checkpoint and lora every N iterations. Disabled when N <= 0. (default '%d')\n", params->save_every);
1097
+ fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1, use random seed for -1)\n");
1098
+ fprintf(stderr, " -c N, --ctx N Context size used during training (default %d)\n", params->n_ctx);
1099
+ fprintf(stderr, " -t N, --threads N Number of threads (default %d)\n", params->n_threads);
1100
+ fprintf(stderr, " -b N, --batch N Parallel batch size (default %d)\n", params->n_batch);
1101
+ fprintf(stderr, " --grad-acc N Number of gradient accumulation steps (simulates larger batch size of batch*gradacc) (default %d)\n", params->n_gradient_accumulation);
1102
+ fprintf(stderr, " --sample-start STR Sets the starting point for samples after the specified pattern. If empty use every token position as sample start. (default '%s')\n", params->sample_start.c_str());
1103
+ fprintf(stderr, " --include-sample-start Include the sample start in the samples. (default off)\n");
1104
+ fprintf(stderr, " --escape process sample start escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n");
1105
+ fprintf(stderr, " --overlapping-samples Samples may overlap, will include sample-start of second and following samples. When off, samples will end at the beginning of the next sample. (default off)\n");
1106
+ fprintf(stderr, " --fill-with-next-samples Samples shorter than context length will be followed by the next (shuffled) samples. (default off)\n");
1107
+ fprintf(stderr, " --separate-with-eos When fill-with-next-samples, insert end-of-sequence token between samples.%s\n", params->separate_with_eos ? " (default)" : "");
1108
+ fprintf(stderr, " --separate-with-bos When fill-with-next-samples, insert begin-of-sequence token between samples.%s\n", params->separate_with_bos ? " (default)" : "");
1109
+ fprintf(stderr, " --no-separate-with-eos When fill-with-next-samples, don't insert end-of-sequence token between samples.%s\n", !params->separate_with_eos ? " (default)" : "");
1110
+ fprintf(stderr, " --no-separate-with-bos When fill-with-next-samples, don't insert begin-of-sequence token between samples.%s\n", !params->separate_with_bos ? " (default)" : "");
1111
+ fprintf(stderr, " --sample-random-offsets Use samples beginning at random offsets. Together with fill-with-next-samples this may help for training endless text generation.%s\n", params->sample_random_offsets ? " (default)" : "");
1112
+ fprintf(stderr, " --force-reshuffle Force a reshuffling of data at program start, otherwise the shuffling of loaded checkpoint is resumed.\n");
1113
+ fprintf(stderr, " --no-flash Don't use flash attention \n");
1114
+ fprintf(stderr, " --use-flash Use flash attention (default)\n");
1115
+ fprintf(stderr, " --no-checkpointing Don't use gradient checkpointing\n");
1116
+ fprintf(stderr, " --use-checkpointing Use gradient checkpointing (default)\n");
1117
+ fprintf(stderr, " --warmup N Only for Adam optimizer. Number of warmup steps (default %d)\n", params->warmup);
1118
+ fprintf(stderr, " --cos-decay-steps N Only for Adam optimizer. Number of cosine decay steps (default %d)\n", params->cos_decay_steps);
1119
+ fprintf(stderr, " --cos-decay-restart N Only for Adam optimizer. Increase of cosine decay steps after restart (default %f)\n", params->cos_decay_restart);
1120
+ fprintf(stderr, " --cos-decay-min N Only for Adam optimizer. Cosine decay minimum (default %f)\n", params->cos_decay_min);
1121
+ fprintf(stderr, " --enable-restart N Only for Adam optimizer. Enable restarts of cos-decay %s\n", params->enable_restart ? "(default)" : "");
1122
+ fprintf(stderr, " --disable-restart N Only for Adam optimizer. Disable restarts of cos-decay %s\n", !params->enable_restart ? "(default)" : "");
1123
+ fprintf(stderr, " --opt-past N Number of optimization iterations to track for delta convergence test. Disabled when zero. (default %d)\n", params->opt_past);
1124
+ fprintf(stderr, " --opt-delta N Maximum delta for delta convergence test. Disabled when <= zero. (default %f)\n", params->opt_delta);
1125
+ fprintf(stderr, " --opt-max-no-improvement N Maximum number of optimization iterations with no improvement. Disabled when <= zero. (default %d)\n", params->opt_max_no_improvement);
1126
+ fprintf(stderr, " --epochs N Maximum number of epochs to process. (default %d)\n", params->n_epochs);
1127
+ fprintf(stderr, " --adam-iter N Maximum number of Adam optimization iterations for each batch (default %d)\n", params->adam_n_iter);
1128
+ fprintf(stderr, " --adam-alpha N Adam learning rate alpha (default %f)\n", params->adam_alpha);
1129
+ fprintf(stderr, " --adam-min-alpha N Adam minimum learning rate alpha - including warmup phase (default %f)\n", params->adam_min_alpha);
1130
+ fprintf(stderr, " --adam-decay N AdamW weight decay. Values greater than zero enable AdamW instead of regular Adam. (default %f)\n", params->adam_decay);
1131
+ fprintf(stderr, " --adam-decay-min-ndim N Minimum number of tensor dimensions to apply AdamW weight decay. Weight decay is not applied to tensors with less n_dims. (default %d)\n", params->adam_decay_min_ndim);
1132
+ fprintf(stderr, " --adam-beta1 N AdamW beta1 in interval [0,1). How much to smooth the first moment of gradients. (default %f)\n", params->adam_beta1);
1133
+ fprintf(stderr, " --adam-beta2 N AdamW beta2 in interval [0,1). How much to smooth the second moment of gradients. (default %f)\n", params->adam_beta2);
1134
+ fprintf(stderr, " --adam-gclip N AdamW gradient clipping. Disabled when zero. (default %f)\n", params->adam_gclip);
1135
+ fprintf(stderr, " --adam-epsf N AdamW epsilon for convergence test. Disabled when <= zero. (default %f)\n", params->adam_eps_f);
1136
+ fprintf(stderr, "\n");
1137
+ }
1138
+
1139
+ bool consume_common_train_arg(
1140
+ int argc, char ** argv, int * idx, struct train_params_common * params, bool * invalid_param
1141
+ ) {
1142
+ int& i = *idx;
1143
+ std::string arg = argv[i];
1144
+ const std::string arg_prefix = "--";
1145
+ if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
1146
+ std::replace(arg.begin(), arg.end(), '_', '-');
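+ // normalize "--some_arg" to "--some-arg" so both spellings are accepted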
1147
+ }
1148
+ if (arg == "--train-data") {
1149
+ if (++i >= argc) {
1150
+ *invalid_param = true;
1151
+ return true;
1152
+ }
1153
+ params->fn_train_data = argv[i];
1154
+ } else if (arg == "--checkpoint-in") {
1155
+ if (++i >= argc) {
1156
+ *invalid_param = true;
1157
+ return true;
1158
+ }
1159
+ params->fn_checkpoint_in = argv[i];
1160
+ } else if (arg == "--checkpoint-out") {
1161
+ if (++i >= argc) {
1162
+ *invalid_param = true;
1163
+ return true;
1164
+ }
1165
+ params->fn_checkpoint_out = argv[i];
1166
+ } else if (arg == "--pattern-fn-it") {
1167
+ if (++i >= argc) {
1168
+ *invalid_param = true;
1169
+ return true;
1170
+ }
1171
+ params->pattern_fn_it = argv[i];
1172
+ } else if (arg == "--fn-latest") {
1173
+ if (++i >= argc) {
1174
+ *invalid_param = true;
1175
+ return true;
1176
+ }
1177
+ params->fn_latest = argv[i];
1178
+ } else if (arg == "--save-every") {
1179
+ if (++i >= argc) {
1180
+ *invalid_param = true;
1181
+ return true;
1182
+ }
1183
+ params->save_every = std::stoi(argv[i]);
1184
+ } else if (arg == "-s" || arg == "--seed") {
1185
+ if (++i >= argc) {
1186
+ *invalid_param = true;
1187
+ return true;
1188
+ }
1189
+ params->seed = std::stoi(argv[i]);
1190
+ } else if (arg == "-c" || arg == "--ctx") {
1191
+ if (++i >= argc) {
1192
+ *invalid_param = true;
1193
+ return true;
1194
+ }
1195
+ params->n_ctx = std::stoi(argv[i]);
1196
+ params->custom_n_ctx = true;
1197
+ } else if (arg == "-t" || arg == "--threads") {
1198
+ if (++i >= argc) {
1199
+ *invalid_param = true;
1200
+ return true;
1201
+ }
1202
+ params->n_threads = std::stoi(argv[i]);
1203
+ } else if (arg == "-b" || arg == "--batch") {
1204
+ if (++i >= argc) {
1205
+ *invalid_param = true;
1206
+ return true;
1207
+ }
1208
+ params->n_batch = std::stoi(argv[i]);
1209
+ } else if (arg == "--grad-acc") {
1210
+ if (++i >= argc) {
1211
+ *invalid_param = true;
1212
+ return true;
1213
+ }
1214
+ params->n_gradient_accumulation = std::max(1, std::stoi(argv[i]));
1215
+ } else if (arg == "--sample-start") {
1216
+ if (++i >= argc) {
1217
+ *invalid_param = true;
1218
+ return true;
1219
+ }
1220
+ params->sample_start = std::string(argv[i]);
1221
+ } else if (arg == "--escape") {
1222
+ params->escape = true;
1223
+ } else if (arg == "--include-sample-start") {
1224
+ params->include_sample_start = true;
1225
+ } else if (arg == "--overlapping-samples") {
1226
+ params->overlapping_samples = true;
1227
+ } else if (arg == "--fill-with-next-samples") {
1228
+ params->fill_with_next_samples = true;
1229
+ } else if (arg == "--separate-with-eos") {
1230
+ params->separate_with_eos = true;
1231
+ } else if (arg == "--separate-with-bos") {
1232
+ params->separate_with_bos = true;
1233
+ } else if (arg == "--no-separate-with-eos") {
1234
+ params->separate_with_eos = false;
1235
+ } else if (arg == "--no-separate-with-bos") {
1236
+ params->separate_with_bos = false;
1237
+ } else if (arg == "--sample-random-offsets") {
1238
+ params->sample_random_offsets = true;
1239
+ } else if (arg == "--force-reshuffle") {
1240
+ params->force_reshuffle = true;
1241
+ } else if (arg == "--no-flash") {
1242
+ params->use_flash = false;
1243
+ } else if (arg == "--use-flash") {
1244
+ params->use_flash = true;
1245
+ } else if (arg == "--no-checkpointing") {
1246
+ params->use_checkpointing = false;
1247
+ } else if (arg == "--use-checkpointing") {
1248
+ params->use_checkpointing = true;
1249
+ } else if (arg == "--warmup") {
1250
+ if (++i >= argc) {
1251
+ *invalid_param = true;
1252
+ return true;
1253
+ }
1254
+ params->warmup = std::stoi(argv[i]);
1255
+ } else if (arg == "--cos-decay-steps") {
1256
+ if (++i >= argc) {
1257
+ *invalid_param = true;
1258
+ return true;
1259
+ }
1260
+ params->cos_decay_steps = std::stoi(argv[i]);
1261
+ } else if (arg == "--cos-decay-restart") {
1262
+ if (++i >= argc) {
1263
+ *invalid_param = true;
1264
+ return true;
1265
+ }
1266
+ params->cos_decay_restart = std::stof(argv[i]);
1267
+ } else if (arg == "--cos-decay-min") {
1268
+ if (++i >= argc) {
1269
+ *invalid_param = true;
1270
+ return true;
1271
+ }
1272
+ params->cos_decay_min = std::stof(argv[i]);
1273
+ } else if (arg == "--enable-restart") {
1274
+ params->enable_restart = true;
1275
+ } else if (arg == "--disable-restart") {
1276
+ params->enable_restart = false;
1277
+ } else if (arg == "--opt-past") {
1278
+ if (++i >= argc) {
1279
+ *invalid_param = true;
1280
+ return true;
1281
+ }
1282
+ params->opt_past = std::stoi(argv[i]);
1283
+ } else if (arg == "--opt-delta") {
1284
+ if (++i >= argc) {
1285
+ *invalid_param = true;
1286
+ return true;
1287
+ }
1288
+ params->opt_delta = std::stof(argv[i]);
1289
+ } else if (arg == "--opt-max-no-improvement") {
1290
+ if (++i >= argc) {
1291
+ *invalid_param = true;
1292
+ return true;
1293
+ }
1294
+ params->opt_max_no_improvement = std::stoi(argv[i]);
1295
+ } else if (arg == "--adam-epsf") {
1296
+ if (++i >= argc) {
1297
+ *invalid_param = true;
1298
+ return true;
1299
+ }
1300
+ params->adam_eps_f = std::stof(argv[i]);
1301
+ } else if (arg == "--epochs") {
1302
+ if (++i >= argc) {
1303
+ *invalid_param = true;
1304
+ return true;
1305
+ }
1306
+ params->n_epochs = std::stoi(argv[i]);
1307
+ } else if (arg == "--adam-iter") {
1308
+ if (++i >= argc) {
1309
+ *invalid_param = true;
1310
+ return true;
1311
+ }
1312
+ params->adam_n_iter = std::stoi(argv[i]);
1313
+ } else if (arg == "--adam-alpha") {
1314
+ if (++i >= argc) {
1315
+ *invalid_param = true;
1316
+ return true;
1317
+ }
1318
+ params->adam_alpha = std::stof(argv[i]);
1319
+ } else if (arg == "--adam-min-alpha") {
1320
+ if (++i >= argc) {
1321
+ *invalid_param = true;
1322
+ return true;
1323
+ }
1324
+ params->adam_min_alpha = std::stof(argv[i]);
1325
+ } else if (arg == "--adam-decay") {
1326
+ if (++i >= argc) {
1327
+ *invalid_param = true;
1328
+ return true;
1329
+ }
1330
+ params->adam_decay = std::stof(argv[i]);
1331
+ } else if (arg == "--adam-decay-min-ndim") {
1332
+ if (++i >= argc) {
1333
+ *invalid_param = true;
1334
+ return true;
1335
+ }
1336
+ params->adam_decay_min_ndim = std::stoi(argv[i]);
1337
+ } else if (arg == "--adam-beta1") {
1338
+ if (++i >= argc) {
1339
+ *invalid_param = true;
1340
+ return true;
1341
+ }
1342
+ params->adam_beta1 = std::stof(argv[i]);
1343
+ } else if (arg == "--adam-beta2") {
1344
+ if (++i >= argc) {
1345
+ *invalid_param = true;
1346
+ return true;
1347
+ }
1348
+ params->adam_beta2 = std::stof(argv[i]);
1349
+ } else if (arg == "--adam-gclip") {
1350
+ if (++i >= argc) {
1351
+ *invalid_param = true;
1352
+ return true;
1353
+ }
1354
+ params->adam_gclip = std::stof(argv[i]);
1355
+ } else if (arg == "-h" || arg == "--help") {
1356
+ params->print_usage = true;
1357
+ return true;
1358
+ } else {
1359
+ return false;
1360
+ }
1361
+ return true;
1362
+ }
1363
+
1364
+ void finish_processing_train_args(struct train_params_common * params) {
1365
+ if (params->escape) {
1366
+ process_escapes(params->sample_start);
1367
+ }
1368
+ }
1369
+
1370
+ void train_opt_callback(void * vdata, int accum_step, float * sched, bool * cancel) {
1371
+ struct train_opt_callback_data * data = (struct train_opt_callback_data *) vdata;
1372
+ struct train_params_common * params = data->params;
1373
+ struct train_state * train = data->train;
1374
+ struct ggml_opt_context * opt = train->opt;
1375
+ int n_batch = params->n_batch;
1376
+ int n_ctx = params->n_ctx;
1377
+
1378
+ if (accum_step == 0) {
1379
+ // time measurement
1380
+ int64_t now = ggml_time_ms();
1381
+ if (now > data->last_time && opt->iter > data->first_iter) {
1382
+ double dt = (double) (now - data->last_time);
1383
+ if (data->millis_per_iter == 0.0) {
1384
+ data->millis_per_iter = dt;
1385
+ } else {
1386
+ const double gain = 0.7;
1387
+ data->millis_per_iter = data->millis_per_iter*(1.0-gain) + dt*gain;
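+ // exponential moving average of the per-iteration time: with gain 0.7 the latest
+ // measurement dominates while older iterations still damp outliers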
1388
+ }
1389
+ }
1390
+
1391
+ double remaining_millis = 0.0;
1392
+ if (data->millis_per_iter > 0.0) {
1393
+ const int n_iter = params->adam_n_iter;
1394
+ const int done_iter = opt->iter - data->first_iter;
1395
+ const int remaining_iter = n_iter - done_iter;
1396
+ remaining_millis = remaining_iter * data->millis_per_iter;
1397
+ }
1398
+
1399
+ // file saving
1400
+ const bool save_now = (params->save_every > 0) && (opt->iter - data->last_save_iter >= params->save_every);
1401
+ if (save_now) {
1402
+ int new_iters = opt->iter - data->last_save_iter;
1403
+ train->train_its += new_iters;
1404
+ train->train_tokens += new_iters * opt->params.n_gradient_accumulation * n_batch * n_ctx;
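+ // e.g. (hypothetical settings) with --grad-acc 2 --batch 8 --ctx 128, each iteration
+ // accounts for 2*8*128 = 2048 trained tokens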
1405
+
1406
+ if (data->save_cb) {
1407
+ data->save_cb(data->save_data, train);
1408
+ }
1409
+
1410
+ data->last_save_iter = opt->iter;
1411
+ }
1412
+
1413
+ // exclude file saving from time measurement, by measuring last_time after saving
1414
+ data->last_time = ggml_time_ms();
1415
+
1416
+ *sched = learning_schedule(
1417
+ opt->iter,
1418
+ params->warmup,
1419
+ params->cos_decay_steps,
1420
+ params->adam_alpha,
1421
+ params->adam_min_alpha,
1422
+ params->cos_decay_min,
1423
+ params->cos_decay_restart,
1424
+ params->enable_restart);
1425
+
1426
+ int impr_plot = -(int)(1 + (opt->loss_before - opt->loss_after) * 10.0f + 0.5f);
1427
+ if (impr_plot > 0) impr_plot = 0;
1428
+ if (std::isnan(opt->loss_before) || std::isnan(opt->loss_after)) impr_plot = 0;
1429
+ printf("%s: iter=%6d sample=%zu/%zu sched=%f loss=%f",
1430
+ __func__, opt->iter, std::min(1+train->shuffle_next_sample, train->shuffle_sample_count), train->shuffle_sample_count,
1431
+ *sched, opt->loss_after);
1432
+
1433
+
1434
+ if (data->millis_per_iter > 0) {
1435
+ printf(" dt=");
1436
+ print_duration(data->millis_per_iter);
1437
+ printf(" eta=");
1438
+ print_duration(remaining_millis);
1439
+ }
1440
+
1441
+ float improvement = opt->loss_before - opt->loss_after;
1442
+ const float plot_scale = 10.0f;
1443
+ int bar_len = (int)(1 + improvement*plot_scale + 0.5);
1444
+ printf(" |");
1445
+ for (int i=0; i<bar_len; ++i) {
1446
+ printf("-");
1447
+ }
1448
+ printf(">");
1449
+ printf("\n");
1450
+ }
1451
+
1452
+ int64_t used_samples = get_example_targets_batch(
1453
+ data->lctx,
1454
+ data->tokens_input,
1455
+ data->target_probs,
1456
+ train->shuffle_next_sample,
1457
+ data->shuffled_samples_offs,
1458
+ data->shuffled_samples_begin,
1459
+ data->shuffled_samples_size,
1460
+ data->samples_count,
1461
+ data->tokens_data,
1462
+ data->tokens_size,
1463
+ params->separate_with_eos,
1464
+ params->separate_with_bos,
1465
+ params->fill_with_next_samples,
1466
+ params->sample_random_offsets);
1467
+
1468
+ train->train_samples += used_samples;
1469
+ train->shuffle_next_sample += used_samples;
1470
+
1471
+ if (train->shuffle_next_sample >= train->shuffle_sample_count) {
1472
+ ++train->train_epochs;
1473
+ printf("%s: reshuffle samples. completed epochs: %llu\n", __func__, (long long unsigned) train->train_epochs);
1474
+ // note: we may have used some samples from the current shuffling more than once
1475
+ train->shuffle_rng_state_current = train->shuffle_rng_state_next;
1476
+ train->shuffle_rng_state_next = shuffle_samples(
1477
+ train->shuffle_rng_state_current,
1478
+ data->shuffled_samples_offs,
1479
+ data->shuffled_samples_begin,
1480
+ data->shuffled_samples_size,
1481
+ data->samples_begin,
1482
+ data->samples_size,
1483
+ data->samples_count);
1484
+ train->shuffle_next_sample = 0;
1485
+ }
1486
+
1487
+ const bool last_epoch_reached = (params->n_epochs > 0 && (int64_t) train->train_epochs - data->first_epoch >= params->n_epochs);
1488
+ if (last_epoch_reached) {
1489
+ // allow optimization iteration at last epoch to be completed before canceling
1490
+ if (data->iter_at_last_epoch < 0) {
1491
+ data->iter_at_last_epoch = opt->iter;
1492
+ } else if (opt->iter > data->iter_at_last_epoch) {
1493
+ *cancel = true;
1494
+ }
1495
+ }
1496
+ }
common/train.h ADDED
@@ -0,0 +1,230 @@
1
+ // Various helper functions and utilities for training
2
+
3
+ #pragma once
4
+
5
+ #include <string>
6
+ #include <random>
7
+ #include <vector>
8
+
9
+ #include "ggml.h"
10
+ #include "llama.h"
11
+
12
+ typedef std::string mt19937_state;
13
+
14
+ struct train_state {
15
+ struct ggml_opt_context * opt;
16
+
17
+ uint64_t train_its;
18
+ uint64_t train_samples;
19
+ uint64_t train_tokens;
20
+ uint64_t train_epochs;
21
+
22
+ size_t shuffle_samples_hash; // fn, sample_count, *zip(sample_begins, sample_sizes)
23
+ mt19937_state shuffle_rng_state_current;
24
+ mt19937_state shuffle_rng_state_next;
25
+ size_t shuffle_sample_count;
26
+ size_t shuffle_next_sample;
27
+ };
28
+
29
+ struct train_params_common {
30
+ const char * fn_train_data;
31
+ const char * fn_checkpoint_in;
32
+ const char * fn_checkpoint_out;
33
+ const char * pattern_fn_it;
34
+ const char * fn_latest;
35
+
36
+ bool print_usage;
37
+
38
+ int save_every;
39
+
40
+ uint32_t seed;
41
+
42
+ int n_ctx;
43
+ int n_threads;
44
+ int n_batch;
45
+ int n_gradient_accumulation;
46
+ int n_epochs;
47
+
48
+ bool custom_n_ctx;
49
+
50
+ bool use_flash;
51
+ bool use_checkpointing;
52
+
53
+ std::string sample_start;
54
+ bool include_sample_start;
55
+ bool escape;
56
+ bool overlapping_samples;
57
+ bool fill_with_next_samples;
58
+ bool separate_with_eos;
59
+ bool separate_with_bos;
60
+ bool sample_random_offsets;
61
+
62
+ bool force_reshuffle;
63
+
64
+ int warmup;
65
+ int cos_decay_steps;
66
+ float cos_decay_restart;
67
+ float cos_decay_min;
68
+ bool enable_restart;
69
+
70
+ int opt_past;
71
+ float opt_delta;
72
+ int opt_max_no_improvement;
73
+
74
+ int adam_n_iter;
75
+ float adam_alpha;
76
+ float adam_min_alpha;
77
+ float adam_decay;
78
+ int adam_decay_min_ndim;
79
+ float adam_beta1;
80
+ float adam_beta2;
81
+ float adam_gclip;
82
+ float adam_eps_f;
83
+ };
84
+
85
+ typedef void (*save_train_files_callback)(void * data, struct train_state * train);
86
+
87
+ struct train_opt_callback_data {
88
+ struct train_params_common * params;
89
+ struct train_state * train;
90
+ save_train_files_callback save_cb;
91
+ void * save_data;
92
+ struct llama_context * lctx;
93
+ int last_save_iter;
94
+ llama_token * tokens_data;
95
+ size_t tokens_size;
96
+ size_t * samples_begin;
97
+ size_t * samples_size;
98
+ size_t * shuffled_samples_offs;
99
+ size_t * shuffled_samples_begin;
100
+ size_t * shuffled_samples_size;
101
+ size_t samples_count;
102
+ struct ggml_tensor * tokens_input;
103
+ struct ggml_tensor * target_probs;
104
+ int first_iter;
105
+ int first_epoch;
106
+ int iter_at_last_epoch;
107
+ int64_t last_time;
108
+ double millis_per_iter;
109
+ };
110
+
111
+ struct train_state * init_train_state();
112
+ void free_train_state(struct train_state * state);
113
+
114
+ struct train_params_common get_default_train_params_common();
115
+ void print_common_train_usage(int /*argc*/, char ** argv, const struct train_params_common * params);
116
+
117
+ bool consume_common_train_arg(int argc, char ** argv, int * idx, struct train_params_common * params, bool * invalid_param);
118
+ void finish_processing_train_args(struct train_params_common * params);
119
+
120
+ struct random_normal_distribution;
121
+ struct random_uniform_distribution;
122
+
123
+ struct random_normal_distribution * init_random_normal_distribution (int seed, float mean, float std, float min, float max);
124
+ struct random_uniform_distribution * init_random_uniform_distribution(int seed, float min, float max);
125
+
126
+ void free_random_normal_distribution (struct random_normal_distribution * rnd);
127
+ void free_random_uniform_distribution(struct random_uniform_distribution * rnd);
128
+
129
+ struct ggml_tensor * randomize_tensor_normal (struct ggml_tensor * tensor, struct random_normal_distribution * rnd);
130
+ struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struct random_uniform_distribution * rnd);
131
+
132
+ // generate random float in interval [0,1)
133
+ float frand();
134
+ float frand_normal (struct random_normal_distribution * rnd);
135
+ float frand_uniform(struct random_uniform_distribution * rnd);
136
+
137
+ int clamp (const int v, const int min, const int max);
138
+ float fclamp(const float v, const float min, const float max);
139
+
140
+ void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0);
141
+ void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1);
142
+ void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2);
143
+ void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3);
144
+
145
+ size_t tokenize_file(
146
+ struct llama_context * lctx,
147
+ const char * filename,
148
+ const std::string & sample_start,
149
+ bool include_sample_start,
150
+ bool overlapping_samples,
151
+ unsigned context_length,
152
+ std::vector<llama_token> & out_tokens,
153
+ std::vector<size_t> & out_samples_begin,
154
+ std::vector<size_t> & out_samples_size);
155
+
156
+ int64_t get_example_targets_batch(
157
+ struct llama_context * lctx,
158
+ struct ggml_tensor * tokens_input,
159
+ struct ggml_tensor * target_probs,
160
+ int64_t example_id,
161
+ const size_t * samples_offs,
162
+ const size_t * samples_begin,
163
+ const size_t * samples_size,
164
+ size_t samples_count,
165
+ const llama_token * train_data,
166
+ size_t n_train_data,
167
+ bool separate_with_eos,
168
+ bool separate_with_bos,
169
+ bool fill_with_next_samples,
170
+ bool sample_random_offsets);
171
+
172
+
173
+ void mt19937_set_state(std::mt19937& rng, const mt19937_state& rng_state);
174
+ mt19937_state mt19937_get_state(const std::mt19937& rng);
175
+ mt19937_state mt19937_seed_to_state(unsigned seed);
176
+
177
+ mt19937_state shuffle_samples(
178
+ const mt19937_state & rng_state,
179
+ size_t * shuffled_offs,
180
+ size_t * shuffled_begins,
181
+ size_t * shuffled_sizes,
182
+ const size_t * begins,
183
+ const size_t * sizes,
184
+ size_t count);
185
+
186
+ size_t hash_combine(size_t h1, size_t h2);
187
+
188
+ size_t compute_samples_hash(
189
+ const char* fn,
190
+ const size_t* samples_begin,
191
+ const size_t* samples_size,
192
+ size_t sample_count);
193
+
194
+
195
+ std::string replace_str(const char * s, const char * needle, const char * replacement);
196
+
197
+ void print_duration(double milliseconds);
198
+
199
+ float cosine_decay(
200
+ int64_t step,
201
+ int64_t decay_steps,
202
+ float minimum);
203
+
204
+ float cosine_decay_restart(
205
+ int64_t step,
206
+ int64_t decay_steps,
207
+ float minimum,
208
+ float restart_step_mult);
209
+
210
+ float learning_schedule(
211
+ int64_t step,
212
+ int64_t warmup_steps,
213
+ int64_t decay_steps,
214
+ float learning_rate,
215
+ float overall_minimum,
216
+ float cos_decay_minimum,
217
+ float cos_decay_restart_step_mult,
218
+ bool enable_restart);
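+ // rough shape of the schedule (a sketch of the usual warmup + cosine-decay formulation,
+ // not necessarily the exact implementation): linear warmup over warmup_steps, then cosine
+ // decay over decay_steps towards cos_decay_minimum, optionally restarting with decay_steps
+ // scaled by cos_decay_restart_step_mult; the returned factor is floored so that the
+ // effective rate never drops below overall_minimum.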
219
+
220
+ void copy_tensor_by_name(struct ggml_tensor * dst, struct ggml_context * ctx, const char * name);
221
+
222
+ void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct ggml_opt_context * opt);
223
+ void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context * opt);
224
+
225
+ bool load_train_state_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct train_state * train);
226
+ void save_train_state_gguf(struct gguf_context * fctx, struct train_state * train);
227
+
228
+ std::string get_train_filename(const char * filename, const char * pattern_it, const char * latest, int64_t iteration);
229
+
230
+ void train_opt_callback(void * vdata, int accum_step, float * sched, bool * cancel);
convert-falcon-hf-to-gguf.py CHANGED
@@ -133,8 +133,6 @@ gguf_writer.add_file_type(ftype)
  print("gguf: get tokenizer metadata")
 
  tokens: list[bytearray] = []
- scores: list[float] = []
- toktypes: list[int] = []
 
  tokenizer_json_file = dir_model / 'tokenizer.json'
  if not tokenizer_json_file.is_file():
@@ -177,12 +175,8 @@ for i in range(vocab_size):
  text = bytearray(pad_token)
 
  tokens.append(text)
- scores.append(0.0) # dymmy
- toktypes.append(gguf.TokenType.NORMAL) # dummy
 
  gguf_writer.add_token_list(tokens)
- gguf_writer.add_token_scores(scores)
- gguf_writer.add_token_types(toktypes)
 
  special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
  special_vocab.add_to_gguf(gguf_writer)
convert-starcoder-hf-to-gguf.py CHANGED
@@ -117,8 +117,6 @@ gguf_writer.add_file_type(ftype)
  print("gguf: get tokenizer metadata")
 
  tokens: list[bytearray] = []
- scores: list[float] = []
- toktypes: list[int] = []
 
  tokenizer_json_file = dir_model / 'tokenizer.json'
  if not tokenizer_json_file.is_file():
@@ -161,12 +159,8 @@ for i in range(vocab_size):
  text = bytearray(pad_token)
 
  tokens.append(text)
- scores.append(0.0) # dymmy
- toktypes.append(gguf.TokenType.NORMAL) # dummy
 
  gguf_writer.add_token_list(tokens)
- gguf_writer.add_token_scores(scores)
- gguf_writer.add_token_types(toktypes)
 
  special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
  special_vocab.add_to_gguf(gguf_writer)
convert.py CHANGED
@@ -439,7 +439,7 @@ Vocab: TypeAlias = 'BpeVocab | SentencePieceVocab'
  def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray:
  #print( "permute debug " + str(weights.shape[0]) + " x " + str(weights.shape[1]) + " nhead " + str(n_head) + " nheadkv " + str(n_kv_head) )
  if n_head_kv is not None and n_head != n_head_kv:
- n_head //= n_head_kv
+ n_head = n_head_kv
  return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
  .swapaxes(1, 2)
  .reshape(weights.shape))
examples/CMakeLists.txt CHANGED
@@ -21,9 +21,12 @@ else()
  add_subdirectory(benchmark)
  add_subdirectory(baby-llama)
  add_subdirectory(train-text-from-scratch)
+ add_subdirectory(finetune)
  add_subdirectory(convert-llama2c-to-ggml)
  add_subdirectory(simple)
+ add_subdirectory(batched)
  add_subdirectory(speculative)
+ add_subdirectory(parallel)
  add_subdirectory(embd-input)
  add_subdirectory(llama-bench)
  add_subdirectory(beam-search)
@@ -33,4 +36,5 @@ else()
  if (LLAMA_BUILD_SERVER)
  add_subdirectory(server)
  endif()
+ add_subdirectory(export-lora)
  endif()
examples/baby-llama/baby-llama.cpp CHANGED
@@ -1,8 +1,12 @@
1
  #include "ggml.h"
 
 
2
  #include <vector>
3
  #include <cassert>
4
- #include <random>
5
  #include <cstring>
 
 
6
 
7
  #if defined(_MSC_VER)
8
  #pragma warning(disable: 4244 4267) // possible loss of data
@@ -14,31 +18,6 @@ constexpr float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
14
  constexpr float rms_norm_eps = 5e-6f;
15
  #endif
16
 
17
- static float frand() {
18
- return (float)rand()/(float)RAND_MAX;
19
- }
20
-
21
- struct random_normal_distribution {
22
- std::mt19937 gen;
23
- std::normal_distribution<float> nd;
24
- float min;
25
- float max;
26
- };
27
-
28
- static void init_random_normal_distribution(
29
- struct random_normal_distribution * rnd, int seed, float mean, float std, float min, float max
30
- ) {
31
- rnd->gen = std::mt19937(seed);
32
- rnd->nd = std::normal_distribution<float>{mean, std};
33
- rnd->min = min;
34
- rnd->max = max;
35
- }
36
-
37
- static float frand_normal(struct random_normal_distribution * rnd) {
38
- const float r = rnd->nd(rnd->gen);
39
- return ((r < rnd->min) ? (rnd->min) : (r > rnd->max) ? (rnd->max) : r);
40
- }
41
-
42
  static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
43
  struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
44
 
@@ -88,55 +67,7 @@ static struct ggml_tensor * randomize_tensor(
88
  break;
89
  default:
90
  assert(false);
91
- };
92
-
93
- return tensor;
94
- }
95
-
96
- static struct ggml_tensor * randomize_tensor_normal(
97
- struct ggml_tensor * tensor, int ndims, const int64_t ne[], struct random_normal_distribution * rnd
98
- ) {
99
- float scale = 1.0; // xavier
100
- switch (ndims) {
101
- case 1:
102
- scale /= sqrtf(ne[0]);
103
- for (int i0 = 0; i0 < ne[0]; i0++) {
104
- ((float *)tensor->data)[i0] = scale * frand_normal(rnd);
105
- }
106
- break;
107
- case 2:
108
- scale /= sqrtf(ne[0]+ne[1]);
109
- for (int i1 = 0; i1 < ne[1]; i1++) {
110
- for (int i0 = 0; i0 < ne[0]; i0++) {
111
- ((float *)tensor->data)[i1*ne[0] + i0] = scale * frand_normal(rnd);
112
- }
113
- }
114
- break;
115
- case 3:
116
- scale /= sqrtf(ne[0]+ne[1]);
117
- for (int i2 = 0; i2 < ne[2]; i2++) {
118
- for (int i1 = 0; i1 < ne[1]; i1++) {
119
- for (int i0 = 0; i0 < ne[0]; i0++) {
120
- ((float *)tensor->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = scale * frand_normal(rnd);
121
- }
122
- }
123
- }
124
- break;
125
- case 4:
126
- scale /= sqrtf(ne[0]+ne[1]);
127
- for (int i3 = 0; i3 < ne[3]; i3++) {
128
- for (int i2 = 0; i2 < ne[2]; i2++) {
129
- for (int i1 = 0; i1 < ne[1]; i1++) {
130
- for (int i0 = 0; i0 < ne[0]; i0++) {
131
- ((float *)tensor->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = scale * frand_normal(rnd);
132
- }
133
- }
134
- }
135
- }
136
- break;
137
- default:
138
- assert(false);
139
- };
140
 
141
  return tensor;
142
  }
@@ -398,27 +329,29 @@ static void randomize_model(struct llama_model * model, int seed, float mean, fl
398
 
399
  const uint32_t n_layer = hparams.n_layer;
400
 
401
- struct random_normal_distribution rnd;
402
- init_random_normal_distribution(&rnd, seed, mean, std, min, max);
403
- randomize_tensor_normal(model->tok_embeddings, model->tok_embeddings->n_dims, model->tok_embeddings->ne, &rnd);
404
- randomize_tensor_normal(model->norm, model->norm->n_dims, model->norm->ne, &rnd);
405
- randomize_tensor_normal(model->output, model->output->n_dims, model->output->ne, &rnd);
406
 
407
  for (uint32_t i = 0; i < n_layer; ++i) {
408
  auto & layer = model->layers[i];
409
- randomize_tensor_normal(layer.attention_norm, layer.attention_norm->n_dims, layer.attention_norm->ne, &rnd);
410
 
411
- randomize_tensor_normal(layer.wq, layer.wq->n_dims, layer.wq->ne, &rnd);
412
- randomize_tensor_normal(layer.wk, layer.wk->n_dims, layer.wk->ne, &rnd);
413
- randomize_tensor_normal(layer.wv, layer.wv->n_dims, layer.wv->ne, &rnd);
414
- randomize_tensor_normal(layer.wo, layer.wo->n_dims, layer.wo->ne, &rnd);
415
 
416
- randomize_tensor_normal(layer.ffn_norm, layer.ffn_norm->n_dims, layer.ffn_norm->ne, &rnd);
417
 
418
- randomize_tensor_normal(layer.w1, layer.w1->n_dims, layer.w1->ne, &rnd);
419
- randomize_tensor_normal(layer.w2, layer.w2->n_dims, layer.w2->ne, &rnd);
420
- randomize_tensor_normal(layer.w3, layer.w3->n_dims, layer.w3->ne, &rnd);
421
  }
 
 
422
  }
423
 
424
 
@@ -429,35 +362,37 @@ static void randomize_model_lora(
429
 
430
  const uint32_t n_layer = hparams.n_layer;
431
 
432
- struct random_normal_distribution rnd;
433
- init_random_normal_distribution(&rnd, seed, mean, std, min, max);
434
- randomize_tensor_normal(model->tok_embeddings, model->tok_embeddings->n_dims, model->tok_embeddings->ne, &rnd);
435
- randomize_tensor_normal(model->norm, model->norm->n_dims, model->norm->ne, &rnd);
436
- randomize_tensor_normal(model->outputa, model->outputa->n_dims, model->outputa->ne, &rnd);
437
- randomize_tensor_normal(model->outputb, model->outputb->n_dims, model->outputb->ne, &rnd);
438
 
439
  for (uint32_t i = 0; i < n_layer; ++i) {
440
  auto & layer = model->layers[i];
441
- randomize_tensor_normal(layer.attention_norm, layer.attention_norm->n_dims, layer.attention_norm->ne, &rnd);
442
-
443
- randomize_tensor_normal(layer.wqa, layer.wqa->n_dims, layer.wqa->ne, &rnd);
444
- randomize_tensor_normal(layer.wqb, layer.wqb->n_dims, layer.wqb->ne, &rnd);
445
- randomize_tensor_normal(layer.wka, layer.wka->n_dims, layer.wka->ne, &rnd);
446
- randomize_tensor_normal(layer.wkb, layer.wkb->n_dims, layer.wkb->ne, &rnd);
447
- randomize_tensor_normal(layer.wva, layer.wva->n_dims, layer.wva->ne, &rnd);
448
- randomize_tensor_normal(layer.wvb, layer.wvb->n_dims, layer.wvb->ne, &rnd);
449
- randomize_tensor_normal(layer.woa, layer.woa->n_dims, layer.woa->ne, &rnd);
450
- randomize_tensor_normal(layer.wob, layer.wob->n_dims, layer.wob->ne, &rnd);
451
-
452
- randomize_tensor_normal(layer.ffn_norm, layer.ffn_norm->n_dims, layer.ffn_norm->ne, &rnd);
453
-
454
- randomize_tensor_normal(layer.w1, layer.w1->n_dims, layer.w1->ne, &rnd);
455
- randomize_tensor_normal(layer.w2, layer.w2->n_dims, layer.w2->ne, &rnd);
456
- randomize_tensor_normal(layer.w3, layer.w3->n_dims, layer.w3->ne, &rnd);
457
  }
 
 
458
  }
459
 
460
- static bool init_kv_cache(struct llama_kv_cache* cache, struct llama_model * model, int n_batch) {
461
  const auto & hparams = model->hparams;
462
 
463
  const uint32_t n_ctx = hparams.n_ctx;
@@ -483,14 +418,12 @@ static bool init_kv_cache(struct llama_kv_cache* cache, struct llama_model * mod
483
 
484
  if (!cache->ctx) {
485
  fprintf(stderr, "%s: failed to allocate memory for kv cache\n", __func__);
486
- return false;
487
  }
488
  }
489
 
490
  cache->k = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements);
491
  cache->v = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements);
492
-
493
- return true;
494
  }
495
 
496
  static bool init_kv_cache_lora(struct llama_kv_cache* cache, struct llama_model_lora * model, int n_batch) {
@@ -554,6 +487,14 @@ static struct ggml_tensor * forward(
554
  struct ggml_tensor * kc = kv_self.k;
555
  struct ggml_tensor * vc = kv_self.v;
556
 
 
 
 
 
 
 
 
 
557
  // inpL shape [n_embd,N,1,1]
558
  struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens);
559
  for (int il = 0; il < n_layer; ++il) {
@@ -581,8 +522,8 @@ static struct ggml_tensor * forward(
581
  // wk shape [n_embd, n_embd, 1, 1]
582
  // Qcur shape [n_embd/n_head, n_head, N, 1]
583
  // Kcur shape [n_embd/n_head, n_head, N, 1]
584
- struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
585
- struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
586
 
587
  // store key and value to memory
588
  {
@@ -754,32 +695,6 @@ static struct ggml_tensor * forward(
754
  return inpL;
755
  }
756
 
757
- static void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) {
758
- GGML_ASSERT(tensor->n_dims == 1);
759
- GGML_ASSERT(tensor->ne[0] == ne0);
760
- }
761
-
762
- static void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) {
763
- GGML_ASSERT(tensor->n_dims == 2);
764
- GGML_ASSERT(tensor->ne[0] == ne0);
765
- GGML_ASSERT(tensor->ne[1] == ne1);
766
- }
767
-
768
- static void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) {
769
- GGML_ASSERT(tensor->n_dims == 3);
770
- GGML_ASSERT(tensor->ne[0] == ne0);
771
- GGML_ASSERT(tensor->ne[1] == ne1);
772
- GGML_ASSERT(tensor->ne[2] == ne2);
773
- }
774
-
775
- static void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) {
776
- GGML_ASSERT(tensor->n_dims == 4);
777
- GGML_ASSERT(tensor->ne[0] == ne0);
778
- GGML_ASSERT(tensor->ne[1] == ne1);
779
- GGML_ASSERT(tensor->ne[2] == ne2);
780
- GGML_ASSERT(tensor->ne[3] == ne3);
781
- }
782
-
783
  static struct ggml_tensor * forward_batch(
784
  struct llama_model * model,
785
  struct llama_kv_cache * cache,
@@ -808,9 +723,18 @@ static struct ggml_tensor * forward_batch(
808
  struct ggml_tensor * kc = kv_self.k;
809
  struct ggml_tensor * vc = kv_self.v;
810
 
811
  // inpL shape [n_embd,N*n_batch,1]
812
  struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens);
813
  assert_shape_2d(inpL, n_embd, N*n_batch);
 
814
  for (int il = 0; il < n_layer; ++il) {
815
  struct ggml_tensor * inpSA = inpL;
816
 
@@ -838,8 +762,8 @@ static struct ggml_tensor * forward_batch(
838
  // wk shape [n_embd, n_embd, 1, 1]
839
  // Qcur shape [n_embd/n_head, n_head, N, n_batch]
840
  // Kcur shape [n_embd/n_head, n_head, N, n_batch]
841
- struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0);
842
- struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0);
843
  assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch);
844
  assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch);
845
 
@@ -1097,6 +1021,14 @@ static struct ggml_tensor * forward_lora(
1097
  struct ggml_tensor * kc = kv_self.k;
1098
  struct ggml_tensor * vc = kv_self.v;
1099
 
1100
  // inpL shape [n_embd,N,1,1]
1101
  struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens);
1102
  for (int il = 0; il < n_layer; ++il) {
@@ -1130,7 +1062,7 @@ static struct ggml_tensor * forward_lora(
1130
  model->layers[il].wqb,
1131
  cur)),
1132
  n_embd/n_head, n_head, N),
1133
- n_past, n_rot, 0, 0);
1134
  struct ggml_tensor * Kcur = ggml_rope(ctx0,
1135
  ggml_reshape_3d(ctx0,
1136
  ggml_mul_mat(ctx0,
@@ -1139,7 +1071,7 @@ static struct ggml_tensor * forward_lora(
1139
  model->layers[il].wkb,
1140
  cur)),
1141
  n_embd/n_head, n_head, N),
1142
- n_past, n_rot, 0, 0);
1143
 
1144
  // store key and value to memory
1145
  {
 
1
  #include "ggml.h"
2
+ #include "train.h"
3
+
4
  #include <vector>
5
  #include <cassert>
6
+ #include <cstdlib>
7
  #include <cstring>
8
+ #include <random>
9
+ #include <vector>
10
 
11
  #if defined(_MSC_VER)
12
  #pragma warning(disable: 4244 4267) // possible loss of data
 
18
  constexpr float rms_norm_eps = 5e-6f;
19
  #endif
20
 
21
  static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
22
  struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
23
 
 
67
  break;
68
  default:
69
  assert(false);
70
+ }
71
 
72
  return tensor;
73
  }
 
329
 
330
  const uint32_t n_layer = hparams.n_layer;
331
 
332
+ struct random_normal_distribution * rnd = init_random_normal_distribution(seed, mean, std, min, max);
333
+
334
+ randomize_tensor_normal(model->tok_embeddings , rnd);
335
+ randomize_tensor_normal(model->norm , rnd);
336
+ randomize_tensor_normal(model->output , rnd);
337
 
338
  for (uint32_t i = 0; i < n_layer; ++i) {
339
  auto & layer = model->layers[i];
340
+ randomize_tensor_normal(layer.attention_norm, rnd);
341
 
342
+ randomize_tensor_normal(layer.wq, rnd);
343
+ randomize_tensor_normal(layer.wk, rnd);
344
+ randomize_tensor_normal(layer.wv, rnd);
345
+ randomize_tensor_normal(layer.wo, rnd);
346
 
347
+ randomize_tensor_normal(layer.ffn_norm, rnd);
348
 
349
+ randomize_tensor_normal(layer.w1, rnd);
350
+ randomize_tensor_normal(layer.w2, rnd);
351
+ randomize_tensor_normal(layer.w3, rnd);
352
  }
353
+
354
+ free_random_normal_distribution(rnd);
355
  }
356
 
357
 
 
362
 
363
  const uint32_t n_layer = hparams.n_layer;
364
 
365
+ struct random_normal_distribution * rnd = init_random_normal_distribution(seed, mean, std, min, max);
366
+
367
+ randomize_tensor_normal(model->tok_embeddings, rnd);
368
+ randomize_tensor_normal(model->norm , rnd);
369
+ randomize_tensor_normal(model->outputa , rnd);
370
+ randomize_tensor_normal(model->outputb , rnd);
371
 
372
  for (uint32_t i = 0; i < n_layer; ++i) {
373
  auto & layer = model->layers[i];
374
+ randomize_tensor_normal(layer.attention_norm, rnd);
375
+
376
+ randomize_tensor_normal(layer.wqa, rnd);
377
+ randomize_tensor_normal(layer.wqb, rnd);
378
+ randomize_tensor_normal(layer.wka, rnd);
379
+ randomize_tensor_normal(layer.wkb, rnd);
380
+ randomize_tensor_normal(layer.wva, rnd);
381
+ randomize_tensor_normal(layer.wvb, rnd);
382
+ randomize_tensor_normal(layer.woa, rnd);
383
+ randomize_tensor_normal(layer.wob, rnd);
384
+
385
+ randomize_tensor_normal(layer.ffn_norm, rnd);
386
+
387
+ randomize_tensor_normal(layer.w1, rnd);
388
+ randomize_tensor_normal(layer.w2, rnd);
389
+ randomize_tensor_normal(layer.w3, rnd);
390
  }
391
+
392
+ free_random_normal_distribution(rnd);
393
  }
394
 
395
+ static void init_kv_cache(struct llama_kv_cache* cache, struct llama_model * model, int n_batch) {
396
  const auto & hparams = model->hparams;
397
 
398
  const uint32_t n_ctx = hparams.n_ctx;
 
418
 
419
  if (!cache->ctx) {
420
  fprintf(stderr, "%s: failed to allocate memory for kv cache\n", __func__);
421
+ exit(1);
422
  }
423
  }
424
 
425
  cache->k = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements);
426
  cache->v = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements);
 
 
427
  }
428
 
429
  static bool init_kv_cache_lora(struct llama_kv_cache* cache, struct llama_model_lora * model, int n_batch) {
 
487
  struct ggml_tensor * kc = kv_self.k;
488
  struct ggml_tensor * vc = kv_self.v;
489
 
490
+ struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
491
+ {
492
+ int * data = (int *) KQ_pos->data;
493
+ for (int i = 0; i < N; ++i) {
494
+ data[i] = n_past + i;
495
+ }
496
+ }
497
+
498
  // inpL shape [n_embd,N,1,1]
499
  struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens);
500
  for (int il = 0; il < n_layer; ++il) {
 
522
  // wk shape [n_embd, n_embd, 1, 1]
523
  // Qcur shape [n_embd/n_head, n_head, N, 1]
524
  // Kcur shape [n_embd/n_head, n_head, N, 1]
525
+ struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), KQ_pos, n_rot, 0, 0);
526
+ struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), KQ_pos, n_rot, 0, 0);
527
 
528
  // store key and value to memory
529
  {
 
695
  return inpL;
696
  }
697
 
698
  static struct ggml_tensor * forward_batch(
699
  struct llama_model * model,
700
  struct llama_kv_cache * cache,
 
723
  struct ggml_tensor * kc = kv_self.k;
724
  struct ggml_tensor * vc = kv_self.v;
725
 
726
+ struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
727
+ {
728
+ int * data = (int *) KQ_pos->data;
729
+ for (int i = 0; i < N; ++i) {
730
+ data[i] = n_past + i;
731
+ }
732
+ }
733
+
734
  // inpL shape [n_embd,N*n_batch,1]
735
  struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens);
736
  assert_shape_2d(inpL, n_embd, N*n_batch);
737
+
738
  for (int il = 0; il < n_layer; ++il) {
739
  struct ggml_tensor * inpSA = inpL;
740
 
 
762
  // wk shape [n_embd, n_embd, 1, 1]
763
  // Qcur shape [n_embd/n_head, n_head, N, n_batch]
764
  // Kcur shape [n_embd/n_head, n_head, N, n_batch]
765
+ struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), KQ_pos, n_rot, 0, 0);
766
+ struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), KQ_pos, n_rot, 0, 0);
767
  assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch);
768
  assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch);
769
 
 
1021
  struct ggml_tensor * kc = kv_self.k;
1022
  struct ggml_tensor * vc = kv_self.v;
1023
 
1024
+ struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
1025
+ {
1026
+ int * data = (int *) KQ_pos->data;
1027
+ for (int i = 0; i < N; ++i) {
1028
+ data[i] = n_past + i;
1029
+ }
1030
+ }
1031
+
1032
  // inpL shape [n_embd,N,1,1]
1033
  struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens);
1034
  for (int il = 0; il < n_layer; ++il) {
 
1062
  model->layers[il].wqb,
1063
  cur)),
1064
  n_embd/n_head, n_head, N),
1065
+ KQ_pos, n_rot, 0, 0);
1066
  struct ggml_tensor * Kcur = ggml_rope(ctx0,
1067
  ggml_reshape_3d(ctx0,
1068
  ggml_mul_mat(ctx0,
 
1071
  model->layers[il].wkb,
1072
  cur)),
1073
  n_embd/n_head, n_head, N),
1074
+ KQ_pos, n_rot, 0, 0);
1075
 
1076
  // store key and value to memory
1077
  {
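Taken together, the baby-llama hunks above replace the scalar `n_past` argument of `ggml_rope` with an explicit `KQ_pos` tensor that holds the absolute position of every token in the batch. A minimal sketch of the new pattern, assembled only from calls that appear in this diff (the surrounding variables `ctx0`, `cur`, `model`, `il`, `n_past`, `n_rot`, `n_embd`, `n_head` and `N` come from the enclosing forward function):

```cpp
// positions of the tokens in the KV cache: n_past, n_past+1, ..., n_past+N-1
struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
{
    int * data = (int *) KQ_pos->data;
    for (int i = 0; i < N; ++i) {
        data[i] = n_past + i;
    }
}

// RoPE now consumes the position tensor instead of a scalar n_past
struct ggml_tensor * Qcur = ggml_rope(ctx0,
        ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N),
        KQ_pos, n_rot, 0, 0);
```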
examples/batched/CMakeLists.txt ADDED
@@ -0,0 +1,5 @@
1
+ set(TARGET batched)
2
+ add_executable(${TARGET} batched.cpp)
3
+ install(TARGETS ${TARGET} RUNTIME)
4
+ target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
5
+ target_compile_features(${TARGET} PRIVATE cxx_std_11)
examples/batched/README.md ADDED
@@ -0,0 +1,44 @@
1
+ # llama.cpp/example/batched
2
+
3
+ The example demonstrates batched generation from a given prompt.
4
+
5
+ ```bash
6
+ ./batched ./models/llama-7b-v2/ggml-model-f16.gguf "Hello my name is" 4
7
+
8
+ ...
9
+
10
+ main: n_len = 32, n_ctx = 2048, n_parallel = 4, n_kv_req = 113
11
+
12
+ Hello my name is
13
+
14
+ main: generating 4 sequences ...
15
+
16
+ main: stream 0 finished
17
+ main: stream 1 finished
18
+ main: stream 2 finished
19
+ main: stream 3 finished
20
+
21
+ sequence 0:
22
+
23
+ Hello my name is Shirley. I am a 25-year-old female who has been working for over 5 years as a b
24
+
25
+ sequence 1:
26
+
27
+ Hello my name is Renee and I'm a 32 year old female from the United States. I'm looking for a man between
28
+
29
+ sequence 2:
30
+
31
+ Hello my name is Diana. I am looking for a housekeeping job. I have experience with children and have my own transportation. I am
32
+
33
+ sequence 3:
34
+
35
+ Hello my name is Cody. I am a 3 year old neutered male. I am a very friendly cat. I am very playful and
36
+
37
+ main: decoded 108 tokens in 3.57 s, speed: 30.26 t/s
38
+
39
+ llama_print_timings: load time = 587.00 ms
40
+ llama_print_timings: sample time = 2.56 ms / 112 runs ( 0.02 ms per token, 43664.72 tokens per second)
41
+ llama_print_timings: prompt eval time = 4089.11 ms / 118 tokens ( 34.65 ms per token, 28.86 tokens per second)
42
+ llama_print_timings: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
43
+ llama_print_timings: total time = 4156.04 ms
44
+ ```
examples/batched/batched.cpp ADDED
@@ -0,0 +1,255 @@
1
+ #include "common.h"
2
+ #include "llama.h"
3
+
4
+ #include <algorithm>
5
+ #include <cmath>
6
+ #include <cstdio>
7
+ #include <string>
8
+ #include <vector>
9
+
10
+ int main(int argc, char ** argv) {
11
+ gpt_params params;
12
+
13
+ if (argc == 1 || argv[1][0] == '-') {
14
+ printf("usage: %s MODEL_PATH [PROMPT] [PARALLEL]\n" , argv[0]);
15
+ return 1 ;
16
+ }
17
+
18
+ int n_parallel = 1;
19
+
20
+ if (argc >= 2) {
21
+ params.model = argv[1];
22
+ }
23
+
24
+ if (argc >= 3) {
25
+ params.prompt = argv[2];
26
+ }
27
+
28
+ if (argc >= 4) {
29
+ n_parallel = std::atoi(argv[3]);
30
+ }
31
+
32
+ if (params.prompt.empty()) {
33
+ params.prompt = "Hello my name is";
34
+ }
35
+
36
+ // total length of the sequences including the prompt
37
+ const int n_len = 32;
38
+
39
+ // init LLM
40
+
41
+ llama_backend_init(params.numa);
42
+
43
+ // initialize the model
44
+
45
+ llama_model_params model_params = llama_model_default_params();
46
+
47
+ // model_params.n_gpu_layers = 99; // offload all layers to the GPU
48
+
49
+ llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
50
+
51
+ if (model == NULL) {
52
+ fprintf(stderr , "%s: error: unable to load model\n" , __func__);
53
+ return 1;
54
+ }
55
+
56
+ // tokenize the prompt
57
+
58
+ std::vector<llama_token> tokens_list;
59
+ tokens_list = ::llama_tokenize(model, params.prompt, true);
60
+ const int n_kv_req = tokens_list.size() + (n_len - tokens_list.size())*n_parallel;
61
+
62
+ // initialize the context
63
+
64
+ llama_context_params ctx_params = llama_context_default_params();
65
+
66
+ ctx_params.seed = 1234;
67
+ ctx_params.n_ctx = n_kv_req;
68
+ ctx_params.n_batch = std::max(n_len, n_parallel);
69
+ ctx_params.n_threads = params.n_threads;
70
+ ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
71
+
72
+ llama_context * ctx = llama_new_context_with_model(model, ctx_params);
73
+
74
+ if (ctx == NULL) {
75
+ fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
76
+ return 1;
77
+ }
78
+
79
+ const int n_ctx = llama_n_ctx(ctx);
80
+
81
+ LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_batch = %d, n_parallel = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);
82
+
83
+ // make sure the KV cache is big enough to hold all the prompt and generated tokens
84
+ if (n_kv_req > n_ctx) {
85
+ LOG_TEE("%s: error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", __func__, n_kv_req);
86
+ LOG_TEE("%s: either reduce n_parallel or increase n_ctx\n", __func__);
87
+ return 1;
88
+ }
89
+
90
+ // print the prompt token-by-token
91
+
92
+ fprintf(stderr, "\n");
93
+
94
+ for (auto id : tokens_list) {
95
+ fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
96
+ }
97
+
98
+ fflush(stderr);
99
+
100
+ // create a llama_batch large enough for the prompt tokens (or one slot per parallel sequence, whichever is larger)
101
+ // we use this object to submit token data for decoding
102
+
103
+ llama_batch batch = llama_batch_init(std::max(tokens_list.size(), (size_t)n_parallel), 0);
104
+
105
+ // evaluate the initial prompt
106
+ batch.n_tokens = tokens_list.size();
107
+
108
+ for (int32_t i = 0; i < batch.n_tokens; i++) {
109
+ batch.token[i] = tokens_list[i];
110
+ batch.pos[i] = i;
111
+ batch.seq_id[i] = 0;
112
+ batch.logits[i] = false;
113
+ }
114
+
115
+ // llama_decode will output logits only for the last token of the prompt
116
+ batch.logits[batch.n_tokens - 1] = true;
117
+
118
+ if (llama_decode(ctx, batch) != 0) {
119
+ LOG_TEE("%s: llama_decode() failed\n", __func__);
120
+ return 1;
121
+ }
122
+
123
+ // assign the system KV cache to all parallel sequences
124
+ // this way, the parallel sequences will "reuse" the prompt tokens without having to copy them
125
+ for (int32_t i = 1; i < n_parallel; ++i) {
126
+ llama_kv_cache_seq_cp(ctx, 0, i, 0, batch.n_tokens);
127
+ }
128
+
129
+ if (n_parallel > 1) {
130
+ LOG_TEE("\n\n%s: generating %d sequences ...\n", __func__, n_parallel);
131
+ }
132
+
133
+ // main loop
134
+
135
+ // we will store the parallel decoded sequences in this vector
136
+ std::vector<std::string> streams(n_parallel);
137
+
138
+ // remember the batch index of the last token for each parallel sequence
139
+ // we need this to determine which logits to sample from
140
+ std::vector<int32_t> i_batch(n_parallel, batch.n_tokens - 1);
141
+
142
+ int n_cur = batch.n_tokens;
143
+ int n_decode = 0;
144
+
145
+ const auto t_main_start = ggml_time_us();
146
+
147
+ while (n_cur <= n_len) {
148
+ // prepare the next batch
149
+ batch.n_tokens = 0;
150
+
151
+ // sample the next token for each parallel sequence / stream
152
+ for (int32_t i = 0; i < n_parallel; ++i) {
153
+ if (i_batch[i] < 0) {
154
+ // the stream has already finished
155
+ continue;
156
+ }
157
+
158
+ auto n_vocab = llama_n_vocab(model);
159
+ auto * logits = llama_get_logits_ith(ctx, i_batch[i]);
160
+
161
+ std::vector<llama_token_data> candidates;
162
+ candidates.reserve(n_vocab);
163
+
164
+ for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
165
+ candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
166
+ }
167
+
168
+ llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
169
+
170
+ const int top_k = 40;
171
+ const float top_p = 0.9f;
172
+ const float temp = 0.4f;
173
+
174
+ llama_sample_top_k(ctx, &candidates_p, top_k, 1);
175
+ llama_sample_top_p(ctx, &candidates_p, top_p, 1);
176
+ llama_sample_temp (ctx, &candidates_p, temp);
177
+
178
+ const llama_token new_token_id = llama_sample_token(ctx, &candidates_p);
179
+
180
+ //const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
181
+
182
+ // is it an end of stream? -> mark the stream as finished
183
+ if (new_token_id == llama_token_eos(ctx) || n_cur == n_len) {
184
+ i_batch[i] = -1;
185
+ LOG_TEE("\n");
186
+ if (n_parallel > 1) {
187
+ LOG_TEE("%s: stream %d finished at n_cur = %d", __func__, i, n_cur);
188
+ }
189
+
190
+ continue;
191
+ }
192
+
193
+ // if there is only one stream, we print immediately to stdout
194
+ if (n_parallel == 1) {
195
+ LOG_TEE("%s", llama_token_to_piece(ctx, new_token_id).c_str());
196
+ fflush(stdout);
197
+ }
198
+
199
+ streams[i] += llama_token_to_piece(ctx, new_token_id);
200
+
201
+ // push this new token for next evaluation
202
+ batch.token [batch.n_tokens] = new_token_id;
203
+ batch.pos [batch.n_tokens] = n_cur;
204
+ batch.seq_id[batch.n_tokens] = i;
205
+ batch.logits[batch.n_tokens] = true;
206
+
207
+ i_batch[i] = batch.n_tokens;
208
+
209
+ batch.n_tokens += 1;
210
+
211
+ n_decode += 1;
212
+ }
213
+
214
+ // all streams are finished
215
+ if (batch.n_tokens == 0) {
216
+ break;
217
+ }
218
+
219
+ n_cur += 1;
220
+
221
+ // evaluate the current batch with the transformer model
222
+ if (llama_decode(ctx, batch)) {
223
+ fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
224
+ return 1;
225
+ }
226
+ }
227
+
228
+ LOG_TEE("\n");
229
+
230
+ if (n_parallel > 1) {
231
+ LOG_TEE("\n");
232
+
233
+ for (int32_t i = 0; i < n_parallel; ++i) {
234
+ LOG_TEE("sequence %d:\n\n%s%s\n\n", i, params.prompt.c_str(), streams[i].c_str());
235
+ }
236
+ }
237
+
238
+ const auto t_main_end = ggml_time_us();
239
+
240
+ LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
241
+ __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
242
+
243
+ llama_print_timings(ctx);
244
+
245
+ fprintf(stderr, "\n");
246
+
247
+ llama_batch_free(batch);
248
+
249
+ llama_free(ctx);
250
+ llama_free_model(model);
251
+
252
+ llama_backend_free();
253
+
254
+ return 0;
255
+ }
examples/beam-search/beam-search.cpp CHANGED
@@ -158,8 +158,9 @@ int main(int argc, char ** argv)
158
  }
159
  std::cout << std::flush;
160
 
161
- int n_past = llama_get_kv_cache_token_count(ctx);
162
- if (llama_eval(ctx, tokens_list.data(), tokens_list.size(), n_past, params.n_threads))
 
163
  {
164
  fprintf(stderr, "%s : failed to eval prompt.\n" , __func__ );
165
  return 1;
@@ -169,7 +170,7 @@ int main(int argc, char ** argv)
169
  beam_search_callback_data callback_data{ctx, {}};
170
  size_t const beam_width = static_cast<size_t>(params.n_beams);
171
  int const n_predict = 256;
172
- llama_beam_search(ctx, beam_search_callback, &callback_data, beam_width, n_past, n_predict, params.n_threads);
173
 
174
  std::cout << "\n\n";
175
  for (llama_token const token_id : callback_data.response) {
 
158
  }
159
  std::cout << std::flush;
160
 
161
+ int n_past = 0;
162
+
163
+ if (llama_decode(ctx, llama_batch_get_one(tokens_list.data(), tokens_list.size(), n_past, 0)))
164
  {
165
  fprintf(stderr, "%s : failed to eval prompt.\n" , __func__ );
166
  return 1;
 
170
  beam_search_callback_data callback_data{ctx, {}};
171
  size_t const beam_width = static_cast<size_t>(params.n_beams);
172
  int const n_predict = 256;
173
+ llama_beam_search(ctx, beam_search_callback, &callback_data, beam_width, n_past, n_predict);
174
 
175
  std::cout << "\n\n";
176
  for (llama_token const token_id : callback_data.response) {
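The beam-search change above is one instance of the decode-API migration that runs through this commit: `llama_eval(..., n_threads)` becomes `llama_decode` with a `llama_batch`, and the thread count is configured on the context instead (see `ctx_params.n_threads` in the new batched example). A minimal helper in the style of the diffs that follow, assuming `ctx` was already created with the desired thread settings (the function name is illustrative):

```cpp
// feed prompt tokens in chunks of n_batch using the batch-based API
static bool eval_prompt(llama_context * ctx, std::vector<llama_token> & tokens, int & n_past, int n_batch) {
    for (size_t i = 0; i < tokens.size(); i += n_batch) {
        const int n_eval = std::min((int) (tokens.size() - i), n_batch);
        // all prompt tokens belong to sequence 0
        if (llama_decode(ctx, llama_batch_get_one(&tokens[i], n_eval, n_past, 0))) {
            return false; // decode failed
        }
        n_past += n_eval;
    }
    return true;
}
```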
examples/embd-input/embd-input-lib.cpp CHANGED
@@ -48,8 +48,7 @@ struct MyModel* create_mymodel(int argc, char ** argv) {
48
  // print system information
49
  {
50
  fprintf(stderr, "\n");
51
- fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
52
- params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
53
  }
54
  struct MyModel * ret = new MyModel();
55
  ret->ctx = ctx;
@@ -71,7 +70,7 @@ bool eval_float(void * model, float * input, int N){
71
  MyModel * mymodel = (MyModel*)model;
72
  llama_context * ctx = mymodel->ctx;
73
  gpt_params params = mymodel->params;
74
- int n_emb = llama_n_embd(ctx);
75
  int n_past = mymodel->n_past;
76
  int n_batch = N; // params.n_batch;
77
 
@@ -80,7 +79,8 @@ bool eval_float(void * model, float * input, int N){
80
  if (n_eval > n_batch) {
81
  n_eval = n_batch;
82
  }
83
- if (llama_eval_embd(ctx, (input+i*n_emb), n_eval, n_past, params.n_threads)) {
 
84
  fprintf(stderr, "%s : failed to eval\n", __func__);
85
  return false;
86
  }
@@ -101,7 +101,7 @@ bool eval_tokens(void * model, std::vector<llama_token> tokens) {
101
  if (n_eval > params.n_batch) {
102
  n_eval = params.n_batch;
103
  }
104
- if (llama_eval(ctx, &tokens[i], n_eval, n_past, params.n_threads)) {
105
  fprintf(stderr, "%s : failed to eval\n", __func__);
106
  return false;
107
  }
@@ -132,7 +132,7 @@ llama_token sampling_id(struct MyModel* mymodel) {
132
 
133
  // out of user input, sample next token
134
  const float temp = params.temp;
135
- const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k;
136
  const float top_p = params.top_p;
137
  const float tfs_z = params.tfs_z;
138
  const float typical_p = params.typical_p;
@@ -148,7 +148,7 @@ llama_token sampling_id(struct MyModel* mymodel) {
148
  llama_token id = 0;
149
  {
150
  auto logits = llama_get_logits(ctx);
151
- auto n_vocab = llama_n_vocab(ctx);
152
 
153
  // Apply params.logit_bias map
154
  for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
@@ -183,11 +183,11 @@ llama_token sampling_id(struct MyModel* mymodel) {
183
  if (mirostat == 1) {
184
  static float mirostat_mu = 2.0f * mirostat_tau;
185
  const int mirostat_m = 100;
186
- llama_sample_temperature(ctx, &candidates_p, temp);
187
  id = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
188
  } else if (mirostat == 2) {
189
  static float mirostat_mu = 2.0f * mirostat_tau;
190
- llama_sample_temperature(ctx, &candidates_p, temp);
191
  id = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
192
  } else {
193
  // Temperature sampling
@@ -195,7 +195,7 @@ llama_token sampling_id(struct MyModel* mymodel) {
195
  llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1);
196
  llama_sample_typical(ctx, &candidates_p, typical_p, 1);
197
  llama_sample_top_p(ctx, &candidates_p, top_p, 1);
198
- llama_sample_temperature(ctx, &candidates_p, temp);
199
  id = llama_sample_token(ctx, &candidates_p);
200
  }
201
  }
 
48
  // print system information
49
  {
50
  fprintf(stderr, "\n");
51
+ fprintf(stderr, "%s\n", get_system_info(params).c_str());
 
52
  }
53
  struct MyModel * ret = new MyModel();
54
  ret->ctx = ctx;
 
70
  MyModel * mymodel = (MyModel*)model;
71
  llama_context * ctx = mymodel->ctx;
72
  gpt_params params = mymodel->params;
73
+ int n_emb = llama_n_embd(llama_get_model(ctx));
74
  int n_past = mymodel->n_past;
75
  int n_batch = N; // params.n_batch;
76
 
 
79
  if (n_eval > n_batch) {
80
  n_eval = n_batch;
81
  }
82
+ llama_batch batch = { int32_t(n_eval), nullptr, (input+i*n_emb), nullptr, nullptr, nullptr, n_past, 1, 0, };
83
+ if (llama_decode(ctx, batch)) {
84
  fprintf(stderr, "%s : failed to eval\n", __func__);
85
  return false;
86
  }
 
101
  if (n_eval > params.n_batch) {
102
  n_eval = params.n_batch;
103
  }
104
+ if (llama_decode(ctx, llama_batch_get_one(&tokens[i], n_eval, n_past, 0))) {
105
  fprintf(stderr, "%s : failed to eval\n", __func__);
106
  return false;
107
  }
 
132
 
133
  // out of user input, sample next token
134
  const float temp = params.temp;
135
+ const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx)) : params.top_k;
136
  const float top_p = params.top_p;
137
  const float tfs_z = params.tfs_z;
138
  const float typical_p = params.typical_p;
 
148
  llama_token id = 0;
149
  {
150
  auto logits = llama_get_logits(ctx);
151
+ auto n_vocab = llama_n_vocab(llama_get_model(ctx));
152
 
153
  // Apply params.logit_bias map
154
  for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
 
183
  if (mirostat == 1) {
184
  static float mirostat_mu = 2.0f * mirostat_tau;
185
  const int mirostat_m = 100;
186
+ llama_sample_temp(ctx, &candidates_p, temp);
187
  id = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
188
  } else if (mirostat == 2) {
189
  static float mirostat_mu = 2.0f * mirostat_tau;
190
+ llama_sample_temp(ctx, &candidates_p, temp);
191
  id = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
192
  } else {
193
  // Temperature sampling
 
195
  llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1);
196
  llama_sample_typical(ctx, &candidates_p, typical_p, 1);
197
  llama_sample_top_p(ctx, &candidates_p, top_p, 1);
198
+ llama_sample_temp(ctx, &candidates_p, temp);
199
  id = llama_sample_token(ctx, &candidates_p);
200
  }
201
  }
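Besides the decode changes, embd-input also picks up the sampler rename from `llama_sample_temperature` to `llama_sample_temp`. Condensed from the hunks above, the temperature-sampling path now reads (constants as in the example):

```cpp
llama_sample_top_k    (ctx, &candidates_p, top_k, 1);
llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1);
llama_sample_typical  (ctx, &candidates_p, typical_p, 1);
llama_sample_top_p    (ctx, &candidates_p, top_p, 1);
llama_sample_temp     (ctx, &candidates_p, temp);
const llama_token id = llama_sample_token(ctx, &candidates_p);
```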
examples/embd-input/embd-input-test.cpp CHANGED
@@ -8,7 +8,7 @@ int main(int argc, char** argv) {
8
  auto mymodel = create_mymodel(argc, argv);
9
  int N = 10;
10
  int max_tgt_len = 500;
11
- int n_embd = llama_n_embd(mymodel->ctx);
12
 
13
  // add random float embd to test evaluation
14
  float * data = new float[N*n_embd];
 
8
  auto mymodel = create_mymodel(argc, argv);
9
  int N = 10;
10
  int max_tgt_len = 500;
11
+ int n_embd = llama_n_embd(llama_get_model(mymodel->ctx));
12
 
13
  // add random float embd to test evaluation
14
  float * data = new float[N*n_embd];
examples/embedding/embedding.cpp CHANGED
@@ -42,17 +42,18 @@ int main(int argc, char ** argv) {
42
  return 1;
43
  }
44
 
45
- const int n_ctx_train = llama_n_ctx_train(ctx);
46
- if (params.n_ctx > n_ctx_train) {
 
 
47
  fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
48
- __func__, n_ctx_train, params.n_ctx);
49
  }
50
 
51
  // print system information
52
  {
53
  fprintf(stderr, "\n");
54
- fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
55
- params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
56
  }
57
 
58
  int n_past = 0;
@@ -70,15 +71,15 @@ int main(int argc, char ** argv) {
70
  fprintf(stderr, "\n");
71
  }
72
 
73
- if (embd_inp.size() > (size_t)params.n_ctx) {
74
  fprintf(stderr, "%s: error: prompt is longer than the context window (%zu tokens, n_ctx = %d)\n",
75
- __func__, embd_inp.size(), params.n_ctx);
76
  return 1;
77
  }
78
 
79
  while (!embd_inp.empty()) {
80
  int n_tokens = std::min(params.n_batch, (int) embd_inp.size());
81
- if (llama_eval(ctx, embd_inp.data(), n_tokens, n_past, params.n_threads)) {
82
  fprintf(stderr, "%s : failed to eval\n", __func__);
83
  return 1;
84
  }
@@ -86,8 +87,8 @@ int main(int argc, char ** argv) {
86
  embd_inp.erase(embd_inp.begin(), embd_inp.begin() + n_tokens);
87
  }
88
 
89
- const int n_embd = llama_n_embd(ctx);
90
- const auto embeddings = llama_get_embeddings(ctx);
91
 
92
  for (int i = 0; i < n_embd; i++) {
93
  printf("%f ", embeddings[i]);
 
42
  return 1;
43
  }
44
 
45
+ const int n_ctx_train = llama_n_ctx_train(model);
46
+ const int n_ctx = llama_n_ctx(ctx);
47
+
48
+ if (n_ctx > n_ctx_train) {
49
  fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
50
+ __func__, n_ctx_train, n_ctx);
51
  }
52
 
53
  // print system information
54
  {
55
  fprintf(stderr, "\n");
56
+ fprintf(stderr, "%s\n", get_system_info(params).c_str());
 
57
  }
58
 
59
  int n_past = 0;
 
71
  fprintf(stderr, "\n");
72
  }
73
 
74
+ if (embd_inp.size() > (size_t)n_ctx) {
75
  fprintf(stderr, "%s: error: prompt is longer than the context window (%zu tokens, n_ctx = %d)\n",
76
+ __func__, embd_inp.size(), n_ctx);
77
  return 1;
78
  }
79
 
80
  while (!embd_inp.empty()) {
81
  int n_tokens = std::min(params.n_batch, (int) embd_inp.size());
82
+ if (llama_decode(ctx, llama_batch_get_one(embd_inp.data(), n_tokens, n_past, 0))) {
83
  fprintf(stderr, "%s : failed to eval\n", __func__);
84
  return 1;
85
  }
 
87
  embd_inp.erase(embd_inp.begin(), embd_inp.begin() + n_tokens);
88
  }
89
 
90
+ const int n_embd = llama_n_embd(model);
91
+ const auto * embeddings = llama_get_embeddings(ctx);
92
 
93
  for (int i = 0; i < n_embd; i++) {
94
  printf("%f ", embeddings[i]);
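A detail shared by the embd-input and embedding diffs: the metadata accessors now take the `llama_model` rather than the `llama_context`, so code that only holds a context fetches the model first. A minimal sketch of reading the embedding output under the new accessors (variable names are illustrative):

```cpp
const llama_model * mdl        = llama_get_model(ctx);       // accessors moved to the model
const int           n_embd     = llama_n_embd(mdl);
const float       * embeddings = llama_get_embeddings(ctx);  // output still lives on the context

for (int i = 0; i < n_embd; i++) {
    printf("%f ", embeddings[i]);
}
printf("\n");
```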
examples/export-lora/CMakeLists.txt ADDED
@@ -0,0 +1,5 @@
1
+ set(TARGET export-lora)
2
+ add_executable(${TARGET} export-lora.cpp)
3
+ install(TARGETS ${TARGET} RUNTIME)
4
+ target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
5
+ target_compile_features(${TARGET} PRIVATE cxx_std_11)
examples/export-lora/README.md ADDED
@@ -0,0 +1,26 @@
1
+ # export-lora
2
+
3
+ Apply LORA adapters to a base model and export the resulting model.
4
+
5
+ ```
6
+ usage: export-lora [options]
7
+
8
+ options:
9
+ -h, --help show this help message and exit
10
+ -m FNAME, --model-base FNAME model path from which to load base model (default '')
11
+ -o FNAME, --model-out FNAME path to save exported model (default '')
12
+ -l FNAME, --lora FNAME apply LoRA adapter
13
+ -s FNAME S, --lora-scaled FNAME S apply LoRA adapter with user defined scaling S
14
+ -t N, --threads N number of threads to use during computation (default: 4)
15
+ ```
16
+
17
+ For example:
18
+
19
+ ```bash
20
+ ./bin/export-lora \
21
+ -m open-llama-3b-v2-q8_0.gguf \
22
+ -o open-llama-3b-v2-q8_0-english2tokipona-chat.gguf \
23
+ -l lora-open-llama-3b-v2-q8_0-english2tokipona-chat-LATEST.bin
24
+ ```
25
+
26
+ Multiple LORA adapters can be applied by passing multiple `-l FN` or `-s FN S` command line parameters.
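For reference, the merge that `export-lora.cpp` below performs on each adapted tensor is the standard LoRA update (the symbols here are the conventional LoRA names, not identifiers from the code):

W' = W + s * (alpha / r) * (B A)

where `r` and `alpha` are read from the adapter file header and `s` is the user scale (`1.0` for `-l FN`, the given `S` for `-s FN S`). In the implementation this shows up as `scaling = lora->info.scale * (float)lora->lora_alpha / (float)lora->lora_r`, applied to the `ggml_mul_mat(ctx, lora_a, lora_b)` product before it is added in place to the base tensor.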
examples/export-lora/export-lora.cpp ADDED
@@ -0,0 +1,474 @@
1
+
2
+ #include "common.h"
3
+ #include "ggml.h"
4
+ #include "ggml-alloc.h"
5
+
6
+ #include <vector>
7
+ #include <string>
8
+ #include <thread>
9
+
10
+ static const size_t tensor_alignment = 32;
11
+
12
+ struct lora_info {
13
+ std::string filename;
14
+ float scale;
15
+ };
16
+
17
+ struct export_lora_params {
18
+ std::string fn_model_base;
19
+ std::string fn_model_out;
20
+ std::vector<struct lora_info> lora;
21
+ int n_threads;
22
+ };
23
+
24
+ struct lora_data {
25
+ struct lora_info info;
26
+ std::vector<uint8_t> data;
27
+ struct ggml_context * ctx;
28
+
29
+ uint32_t lora_r;
30
+ uint32_t lora_alpha;
31
+ };
32
+
33
+ struct llama_file {
34
+ // use FILE * so we don't have to re-open the file to mmap
35
+ FILE * fp;
36
+ size_t size;
37
+
38
+ llama_file(const char * fname, const char * mode) {
39
+ fp = std::fopen(fname, mode);
40
+ if (fp == NULL) {
41
+ size = 0;
42
+ } else {
43
+ seek(0, SEEK_END);
44
+ size = tell();
45
+ seek(0, SEEK_SET);
46
+ }
47
+ }
48
+
49
+ size_t tell() const {
50
+ #ifdef _WIN32
51
+ __int64 ret = _ftelli64(fp);
52
+ #else
53
+ long ret = std::ftell(fp);
54
+ #endif
55
+ GGML_ASSERT(ret != -1); // this really shouldn't fail
56
+ return (size_t) ret;
57
+ }
58
+
59
+ void seek(size_t offset, int whence) {
60
+ #ifdef _WIN32
61
+ int ret = _fseeki64(fp, (__int64) offset, whence);
62
+ #else
63
+ int ret = std::fseek(fp, (long) offset, whence);
64
+ #endif
65
+ GGML_ASSERT(ret == 0); // same
66
+ }
67
+
68
+ void read_raw(void * ptr, size_t size) {
69
+ if (size == 0) {
70
+ return;
71
+ }
72
+ errno = 0;
73
+ std::size_t ret = std::fread(ptr, size, 1, fp);
74
+ if (ferror(fp)) {
75
+ die_fmt("read error: %s", strerror(errno));
76
+ }
77
+ if (ret != 1) {
78
+ die("unexpectedly reached end of file");
79
+ }
80
+ }
81
+
82
+ std::uint32_t read_u32() {
83
+ std::uint32_t ret;
84
+ read_raw(&ret, sizeof(ret));
85
+ return ret;
86
+ }
87
+
88
+ std::string read_string(std::uint32_t len) {
89
+ std::vector<char> chars(len);
90
+ read_raw(chars.data(), len);
91
+ return std::string(chars.data(), len);
92
+ }
93
+
94
+ void write_raw(const void * ptr, size_t size) {
95
+ if (size == 0) {
96
+ return;
97
+ }
98
+ errno = 0;
99
+ size_t ret = std::fwrite(ptr, size, 1, fp);
100
+ if (ret != 1) {
101
+ die_fmt("write error: %s", strerror(errno));
102
+ }
103
+ }
104
+
105
+ void write_u32(std::uint32_t val) {
106
+ write_raw(&val, sizeof(val));
107
+ }
108
+
109
+ bool eof() {
110
+ return tell() >= size;
111
+ }
112
+
113
+ ~llama_file() {
114
+ if (fp) {
115
+ std::fclose(fp);
116
+ }
117
+ }
118
+ };
119
+
120
+ static struct export_lora_params get_default_export_lora_params() {
121
+ struct export_lora_params result;
122
+ result.fn_model_base = "";
123
+ result.fn_model_out = "";
124
+ result.n_threads = GGML_DEFAULT_N_THREADS;
125
+ return result;
126
+ }
127
+
128
+ static void export_lora_print_usage(int /*argc*/, char ** argv, const struct export_lora_params * params) {
129
+ fprintf(stderr, "usage: %s [options]\n", argv[0]);
130
+ fprintf(stderr, "\n");
131
+ fprintf(stderr, "options:\n");
132
+ fprintf(stderr, " -h, --help show this help message and exit\n");
133
+ fprintf(stderr, " -m FNAME, --model-base FNAME model path from which to load base model (default '%s')\n", params->fn_model_base.c_str());
134
+ fprintf(stderr, " -o FNAME, --model-out FNAME path to save exported model (default '%s')\n", params->fn_model_out.c_str());
135
+ fprintf(stderr, " -l FNAME, --lora FNAME apply LoRA adapter\n");
136
+ fprintf(stderr, " -s FNAME S, --lora-scaled FNAME S apply LoRA adapter with user defined scaling S\n");
137
+ fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params->n_threads);
138
+ }
139
+
140
+ static bool export_lora_params_parse(int argc, char ** argv, struct export_lora_params * params) {
141
+ bool invalid_param = false;
142
+ std::string arg;
143
+ struct export_lora_params default_params = get_default_export_lora_params();
144
+ const std::string arg_prefix = "--";
145
+
146
+ for (int i = 1; i < argc; i++) {
147
+ arg = argv[i];
148
+ if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
149
+ std::replace(arg.begin(), arg.end(), '_', '-');
150
+ }
151
+
152
+ if (arg == "-m" || arg == "--model-base") {
153
+ if (++i >= argc) {
154
+ invalid_param = true;
155
+ break;
156
+ }
157
+ params->fn_model_base = argv[i];
158
+ } else if (arg == "-o" || arg == "--model-out") {
159
+ if (++i >= argc) {
160
+ invalid_param = true;
161
+ break;
162
+ }
163
+ params->fn_model_out = argv[i];
164
+ } else if (arg == "-l" || arg == "--lora") {
165
+ if (++i >= argc) {
166
+ invalid_param = true;
167
+ break;
168
+ }
169
+ struct lora_info lora;
170
+ lora.filename = argv[i];
171
+ lora.scale = 1.0f;
172
+ params->lora.push_back(lora);
173
+ } else if (arg == "-s" || arg == "--lora-scaled") {
174
+ if (++i >= argc) {
175
+ invalid_param = true;
176
+ break;
177
+ }
178
+ struct lora_info lora;
179
+ lora.filename = argv[i];
180
+ if (++i >= argc) {
181
+ invalid_param = true;
182
+ break;
183
+ }
184
+ lora.scale = std::stof(argv[i]);
185
+ params->lora.push_back(lora);
186
+ } else if (arg == "-t" || arg == "--threads") {
187
+ if (++i >= argc) {
188
+ invalid_param = true;
189
+ break;
190
+ }
191
+ params->n_threads = std::stoi(argv[i]);
192
+ if (params->n_threads <= 0) {
193
+ params->n_threads = std::thread::hardware_concurrency();
194
+ }
195
+ } else {
196
+ fprintf(stderr, "error: unknown argument: '%s'\n", arg.c_str());
197
+ export_lora_print_usage(argc, argv, &default_params);
198
+ exit(1);
199
+ }
200
+ }
201
+
202
+ if (params->fn_model_base == default_params.fn_model_base) {
203
+ fprintf(stderr, "error: please specify a filename for model-base.\n");
204
+ export_lora_print_usage(argc, argv, &default_params);
205
+ exit(1);
206
+ }
207
+ if (params->fn_model_out == default_params.fn_model_out) {
208
+ fprintf(stderr, "error: please specify a filename for model-out.\n");
209
+ export_lora_print_usage(argc, argv, &default_params);
210
+ exit(1);
211
+ }
212
+ if (invalid_param) {
213
+ fprintf(stderr, "error: invalid parameter for argument: '%s'\n", arg.c_str());
214
+ export_lora_print_usage(argc, argv, &default_params);
215
+ exit(1);
216
+ }
217
+ return true;
218
+ }
219
+
220
+ static void free_lora(struct lora_data * lora) {
221
+ if (lora->ctx != NULL) {
222
+ ggml_free(lora->ctx);
223
+ }
224
+ delete lora;
225
+ }
226
+
227
+ static struct lora_data * load_lora(struct lora_info * info) {
228
+ struct lora_data * result = new struct lora_data;
229
+ result->info = *info;
230
+ result->ctx = NULL;
231
+ result->lora_r = 1;
232
+ result->lora_alpha = 1;
233
+
234
+ struct llama_file file(info->filename.c_str(), "rb");
235
+ if (file.fp == NULL) {
236
+ fprintf(stderr, "warning: Could not open lora adapter '%s'. Ignoring this adapter.\n",
237
+ info->filename.c_str());
238
+ free_lora(result);
239
+ return NULL;
240
+ }
241
+
242
+ struct ggml_init_params params_ggml;
243
+ params_ggml.mem_size = ggml_tensor_overhead() * GGML_MAX_NODES;
244
+ params_ggml.mem_buffer = NULL;
245
+ params_ggml.no_alloc = true;
246
+ result->ctx = ggml_init(params_ggml);
247
+
248
+ uint32_t LLAMA_FILE_MAGIC_LORA = 0x67676C61; // 'ggla'
249
+ uint32_t magic = file.read_u32();
250
+ if (magic != LLAMA_FILE_MAGIC_LORA) {
251
+ die_fmt("unexpected lora header file magic in '%s'", info->filename.c_str());
252
+ }
253
+ uint32_t version = file.read_u32();
254
+ if (version != 1) {
255
+ die_fmt("unexpected lora file version '%u' in '%s'", (unsigned) version, info->filename.c_str());
256
+ }
257
+ result->lora_r = file.read_u32();
258
+ result->lora_alpha = file.read_u32();
259
+ // read tensor infos from file
260
+ std::vector<char> name_buf;
261
+ std::vector<struct ggml_tensor *> tensors;
262
+ std::vector<size_t> tensors_offset;
263
+ size_t total_nbytes_pad = 0;
264
+ while(!file.eof()) {
265
+ int64_t ne[4] = {1,1,1,1};
266
+ uint32_t n_dims = file.read_u32();
267
+ uint32_t namelen = file.read_u32();
268
+ uint32_t type = file.read_u32();
269
+ for (uint32_t k = 0; k < n_dims; ++k) {
270
+ ne[k] = (int64_t)file.read_u32();
271
+ }
272
+ name_buf.clear();
273
+ name_buf.resize(namelen + 1, '\0');
274
+ file.read_raw(name_buf.data(), namelen);
275
+ file.seek((0-file.tell()) & 31, SEEK_CUR);
276
+ size_t offset = file.tell();
277
+ struct ggml_tensor * tensor = ggml_new_tensor(result->ctx, (enum ggml_type) type, n_dims, ne);
278
+ ggml_set_name(tensor, name_buf.data());
279
+ size_t nbytes = ggml_nbytes(tensor);
280
+ size_t nbytes_pad = ggml_nbytes_pad(tensor);
281
+ total_nbytes_pad += nbytes_pad;
282
+ tensors.push_back(tensor);
283
+ tensors_offset.push_back(offset);
284
+ file.seek(nbytes, SEEK_CUR);
285
+ }
286
+ // read tensor data
287
+ result->data.resize(total_nbytes_pad);
288
+ size_t data_offset = 0;
289
+ for (size_t i = 0; i < tensors.size(); ++i) {
290
+ struct ggml_tensor * tensor = tensors[i];
291
+ size_t offset = tensors_offset[i];
292
+ size_t nbytes = ggml_nbytes(tensor);
293
+ size_t nbytes_pad = ggml_nbytes_pad(tensor);
294
+ file.seek(offset, SEEK_SET);
295
+ tensor->data = result->data.data() + data_offset;
296
+ file.read_raw(tensor->data, nbytes);
297
+ data_offset += nbytes_pad;
298
+ }
299
+ return result;
300
+ }
301
+
302
+
303
+ static struct ggml_cgraph * build_graph_lora(
304
+ struct ggml_context * ctx,
305
+ struct ggml_tensor * tensor,
306
+ struct ggml_tensor * lora_a,
307
+ struct ggml_tensor * lora_b,
308
+ float scaling
309
+ ) {
310
+ struct ggml_tensor * ab = ggml_mul_mat(ctx, lora_a, lora_b);
311
+ if (scaling != 1.0f) {
312
+ ab = ggml_scale(ctx, ab, ggml_new_f32(ctx, scaling));
313
+ }
314
+ struct ggml_tensor * res = ggml_add_inplace(ctx, tensor, ab);
315
+
316
+ struct ggml_cgraph * gf = ggml_new_graph(ctx);
317
+ ggml_build_forward_expand (gf, res);
318
+ return gf;
319
+ }
320
+
321
+ static bool apply_lora(struct ggml_tensor * tensor, struct lora_data * lora, int n_threads) {
322
+ if (lora->ctx == NULL) {
323
+ return false;
324
+ }
325
+ std::string name = ggml_get_name(tensor);
326
+ std::string name_a = name + std::string(".loraA");
327
+ std::string name_b = name + std::string(".loraB");
328
+ struct ggml_tensor * lora_a = ggml_get_tensor(lora->ctx, name_a.c_str());
329
+ struct ggml_tensor * lora_b = ggml_get_tensor(lora->ctx, name_b.c_str());
330
+ if (lora_a == NULL || lora_b == NULL) {
331
+ return false;
332
+ }
333
+
334
+ float scaling = lora->info.scale * (float)lora->lora_alpha / (float)lora->lora_r;
335
+
336
+ struct ggml_init_params params;
337
+ params.mem_size = GGML_OBJECT_SIZE + GGML_GRAPH_SIZE + ggml_tensor_overhead()*4 + GGML_MEM_ALIGN*5;
338
+ params.mem_buffer = NULL;
339
+ params.no_alloc = true;
340
+ struct ggml_context * ctx = NULL;
341
+ struct ggml_allocr * alloc = NULL;
342
+ struct ggml_cgraph * gf = NULL;
343
+
344
+ ctx = ggml_init(params);
345
+ alloc = ggml_allocr_new_measure(tensor_alignment);
346
+ gf = build_graph_lora(ctx, tensor, lora_a, lora_b, scaling);
347
+ size_t alloc_size = ggml_allocr_alloc_graph(alloc, gf);
348
+ ggml_allocr_free(alloc);
349
+ ggml_free(ctx);
350
+
351
+ static std::vector<uint8_t> data_compute;
352
+ data_compute.resize(alloc_size + tensor_alignment);
353
+
354
+ ctx = ggml_init(params);
355
+ alloc = ggml_allocr_new(data_compute.data(), data_compute.size(), tensor_alignment);
356
+ gf = build_graph_lora(ctx, tensor, lora_a, lora_b, scaling);
357
+ ggml_allocr_alloc_graph(alloc, gf);
358
+ ggml_allocr_free(alloc);
359
+
360
+ struct ggml_cplan cplan = ggml_graph_plan(gf, n_threads);
361
+ static std::vector<uint8_t> data_work;
362
+ data_work.resize(cplan.work_size);
363
+ cplan.work_data = data_work.data();
364
+
365
+ ggml_graph_compute(gf, &cplan);
366
+
367
+ ggml_free(ctx);
368
+ return true;
369
+ }
370
+
371
+ static void export_lora(struct export_lora_params * params) {
372
+ // load all loras
373
+ std::vector<struct lora_data *> loras;
374
+ for (size_t i = 0; i < params->lora.size(); ++i) {
375
+ struct lora_data * lora = load_lora(&params->lora[i]);
376
+ if (lora != NULL) {
377
+ loras.push_back(lora);
378
+ }
379
+ }
380
+ if (loras.size() == 0) {
381
+ fprintf(stderr, "warning: no lora adapters will be applied.\n");
382
+ }
383
+
384
+ // open input file
385
+ struct llama_file fin(params->fn_model_base.c_str(), "rb");
386
+ if (!fin.fp) {
387
+ die_fmt("Could not open file '%s'\n", params->fn_model_base.c_str());
388
+ }
389
+
390
+ // open base model gguf, read tensors without their data
391
+ struct ggml_context * ctx_in;
392
+ struct gguf_init_params params_gguf;
393
+ params_gguf.no_alloc = true;
394
+ params_gguf.ctx = &ctx_in;
395
+ struct gguf_context * gguf_in = gguf_init_from_file(params->fn_model_base.c_str(), params_gguf);
396
+
397
+ // create new gguf
398
+ struct gguf_context * gguf_out = gguf_init_empty();
399
+
400
+ // copy meta data from base model: kv and tensors
401
+ gguf_set_kv(gguf_out, gguf_in);
402
+ int n_tensors = gguf_get_n_tensors(gguf_in);
403
+ for (int i=0; i < n_tensors; ++i) {
404
+ const char * name = gguf_get_tensor_name(gguf_in, i);
405
+ struct ggml_tensor * tensor = ggml_get_tensor(ctx_in, name);
406
+ gguf_add_tensor(gguf_out, tensor);
407
+ }
408
+
409
+ // create output file
410
+ struct llama_file fout(params->fn_model_out.c_str(), "wb");
411
+ if (!fout.fp) {
412
+ die_fmt("Could not create file '%s'\n", params->fn_model_out.c_str());
413
+ }
414
+
415
+ // write gguf meta data
416
+ std::vector<uint8_t> meta;
417
+ meta.resize(gguf_get_meta_size(gguf_out));
418
+ gguf_get_meta_data(gguf_out, meta.data());
419
+ fout.write_raw(meta.data(), meta.size());
420
+
421
+ std::vector<uint8_t> data;
422
+ std::vector<uint8_t> padding;
423
+ for (int i=0; i < n_tensors; ++i) {
424
+ const char * name = gguf_get_tensor_name(gguf_in, i);
425
+ struct ggml_tensor * tensor = ggml_get_tensor(ctx_in, name);
426
+
427
+ // read tensor data
428
+ data.resize(ggml_nbytes(tensor));
429
+ tensor->data = data.data();
430
+ size_t offset = gguf_get_tensor_offset(gguf_in, i);
431
+ fin.seek(offset + meta.size(), SEEK_SET);
432
+ fin.read_raw(data.data(), data.size());
433
+
434
+ // apply all loras
435
+ for (size_t k = 0; k < loras.size(); ++k) {
436
+ apply_lora(tensor, loras[k], params->n_threads);
437
+ }
438
+
439
+ // write tensor data + padding
440
+ padding.clear();
441
+ padding.resize(GGML_PAD(data.size(), gguf_get_alignment(gguf_out)) - data.size(), 0);
442
+
443
+ GGML_ASSERT(fout.tell() == offset + meta.size());
444
+ // fout.seek(offset + meta.size(), SEEK_SET);
445
+ fout.write_raw(data.data(), data.size());
446
+ fout.write_raw(padding.data(), padding.size());
447
+
448
+ if (i % 2 == 0) {
449
+ printf(".");
450
+ }
451
+ }
452
+ printf("\n");
453
+
454
+ // close gguf
455
+ gguf_free(gguf_out);
456
+ gguf_free(gguf_in);
457
+
458
+ // free loras
459
+ for (size_t i = 0; i < loras.size(); ++i) {
460
+ free_lora(loras[i]);
461
+ }
462
+ }
463
+
464
+ int main(int argc, char ** argv) {
465
+ struct export_lora_params params = get_default_export_lora_params();
466
+
467
+ if (!export_lora_params_parse(argc, argv, &params)) {
468
+ return 1;
469
+ }
470
+
471
+ export_lora(&params);
472
+
473
+ return 0;
474
+ }
examples/finetune/CMakeLists.txt ADDED
@@ -0,0 +1,5 @@
1
+ set(TARGET finetune)
2
+ add_executable(${TARGET} finetune.cpp)
3
+ install(TARGETS ${TARGET} RUNTIME)
4
+ target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
5
+ target_compile_features(${TARGET} PRIVATE cxx_std_11)
examples/finetune/README.md ADDED
@@ -0,0 +1,90 @@
1
+ # finetune
2
+
3
+ Basic usage instructions:
4
+
5
+ ```bash
6
+ # get training data
7
+ wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/shakespeare.txt
8
+
9
+ # finetune LORA adapter
10
+ ./bin/finetune \
11
+ --model-base open-llama-3b-v2-q8_0.gguf \
12
+ --checkpoint-in chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.gguf \
13
+ --checkpoint-out chk-lora-open-llama-3b-v2-q8_0-shakespeare-ITERATION.gguf \
14
+ --lora-out lora-open-llama-3b-v2-q8_0-shakespeare-ITERATION.bin \
15
+ --train-data "shakespeare.txt" \
16
+ --save-every 10 \
17
+ --threads 6 --adam-iter 30 --batch 4 --ctx 64 \
18
+ --use-checkpointing
19
+
20
+ # predict
21
+ ./bin/main -m open-llama-3b-v2-q8_0.gguf --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin
22
+ ```
23
+
24
+ Finetune output files will be saved every N iterations (configurable with `--save-every N`).
25
+ The pattern 'ITERATION' in the output filenames will be replaced with the iteration number and with 'LATEST' for the latest output.
26
+ So in the above example, after 10 iterations these files will be written:
27
+ - chk-lora-open-llama-3b-v2-q8_0-shakespeare-10.gguf
28
+ - chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.gguf
29
+ - lora-open-llama-3b-v2-q8_0-shakespeare-10.bin
30
+ - lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin
31
+
32
+ After 10 more iterations:
33
+ - chk-lora-open-llama-3b-v2-q8_0-shakespeare-20.gguf
34
+ - chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.gguf
35
+ - lora-open-llama-3b-v2-q8_0-shakespeare-20.bin
36
+ - lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin
37
+
38
+ Checkpoint files (`--checkpoint-in FN`, `--checkpoint-out FN`) store the training process. When the input checkpoint file does not exist, it will begin finetuning a new randomly initialized adapter.
39
+
40
+ llama.cpp compatible LORA adapters will be saved with filename specified by `--lora-out FN`.
41
+ These LORA adapters can then be used by `main` together with the base model, like in the 'predict' example command above.
42
+
43
+ In `main` you can also load multiple LORA adapters, which will then be mixed together.
44
+
45
+ For example if you have two LORA adapters `lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin` and `lora-open-llama-3b-v2-q8_0-bible-LATEST.bin`, you can mix them together like this:
46
+
47
+ ```bash
48
+ ./bin/main -m open-llama-3b-v2-q8_0.gguf \
49
+ --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin \
50
+ --lora lora-open-llama-3b-v2-q8_0-bible-LATEST.bin
51
+ ```
52
+
53
+ You can change how strong each LORA adapter is applied to the base model by using `--lora-scaled FN SCALE` instead of `--lora FN`.
54
+
55
+ For example to apply 40% of the 'shakespeare' LORA adapter, 80% of the 'bible' LORA adapter and 100% of yet another one:
56
+
57
+ ```bash
58
+ ./bin/main -m open-llama-3b-v2-q8_0.gguf \
59
+ --lora-scaled lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin 0.4 \
60
+ --lora-scaled lora-open-llama-3b-v2-q8_0-bible-LATEST.bin 0.8 \
61
+ --lora lora-open-llama-3b-v2-q8_0-yet-another-one-LATEST.bin
62
+ ```
63
+
64
+ The scale numbers don't need to add up to one, and you can also use numbers greater than 1 to further increase the influence of an adapter. But making the values too big will sometimes result in worse output. Play around to find good values.
65
+
66
+ Gradient checkpointing reduces the memory requirements by ~50% but increases the runtime.
67
+ If you have enough RAM, you can make finetuning a bit faster by disabling checkpointing with `--no-checkpointing`.
68
+
69
+ The default LORA rank can be specified with `--lora-r N`.
70
+ The LORA rank can be configured for each model tensor type separately with these command line options:
71
+
72
+ ```bash
73
+ --lora-r N LORA r: default rank. Also specifies resulting scaling together with lora-alpha. (default 4)
74
+ --rank-att-norm N LORA rank for attention norm tensor (default 1)
75
+ --rank-ffn-norm N LORA rank for feed-forward norm tensor (default 1)
76
+ --rank-out-norm N LORA rank for output norm tensor (default 1)
77
+ --rank-tok-embd N LORA rank for token embeddings tensor (default 4)
78
+ --rank-out N LORA rank for output tensor (default 4)
79
+ --rank-wq N LORA rank for wq tensor (default 4)
80
+ --rank-wk N LORA rank for wk tensor (default 4)
81
+ --rank-wv N LORA rank for wv tensor (default 4)
82
+ --rank-wo N LORA rank for wo tensor (default 4)
83
+ --rank-w1 N LORA rank for w1 tensor (default 4)
84
+ --rank-w2 N LORA rank for w2 tensor (default 4)
85
+ --rank-w3 N LORA rank for w3 tensor (default 4)
86
+ ```
87
+
88
+ The LORA rank of 'norm' tensors should always be 1.
89
+
90
+ To see all available options use `finetune --help`.
examples/finetune/convert-finetune-checkpoint-to-gguf.py ADDED
@@ -0,0 +1,489 @@
1
+ #!/usr/bin/env python3
2
+ # finetune checkpoint --> gguf conversion
3
+
4
+ import argparse
5
+ import gguf
6
+ import os
7
+ import struct
8
+ import sys
9
+ import numpy as np
10
+ from pathlib import Path
11
+
12
+ # gguf constants
13
+ LLM_KV_OPTIMIZER_TYPE = "optimizer.type"
14
+ LLM_KV_OPTIMIZER_TYPE_ADAM = "adam"
15
+ LLM_KV_OPTIMIZER_TYPE_LBFGS = "lbfgs"
16
+ LLM_KV_OPTIMIZER_FILE_VERSION = "optimizer.file_version"
17
+ LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT = "optimizer.convergence_past_count"
18
+ LLM_KV_OPTIMIZER_PARAMETER_COUNT = "optimizer.parameter_count"
19
+ LLM_KV_OPTIMIZER_ITERATION_COUNT = "optimizer.iteration_count"
20
+ LLM_KV_OPTIMIZER_JUST_INITIALIZED = "optimizer.just_initialized"
21
+ LLM_KV_OPTIMIZER_ADAM_BEST_LOSS = "optimizer.adam.best_loss"
22
+ LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS = "optimizer.adam.previous_loss"
23
+ LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT = "optimizer.adam.no_improvement_count"
24
+ LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT = "optimizer.lbfgs.approx_hessian_count"
25
+ LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS = "optimizer.lbfgs.best_loss"
26
+ LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP = "optimizer.lbfgs.line_search_step"
27
+ LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J = "optimizer.lbfgs.line_search_j"
28
+ LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K = "optimizer.lbfgs.line_search_k"
29
+ LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END = "optimizer.lbfgs.line_search_end"
30
+ LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT = "optimizer.lbfgs.no_improvement_count"
31
+
32
+ LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS = "optimizer.adam.first_moments"
33
+ LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS = "optimizer.adam.second_moments"
34
+ LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES = "optimizer.adam.past_loss_values"
35
+
36
+ LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS = "optimizer.lbfgs.current_parameters"
37
+ LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS = "optimizer.lbfgs.previous_parameters"
38
+ LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS = "optimizer.lbfgs.current_gradients"
39
+ LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS = "optimizer.lbfgs.previous_gradients"
40
+ LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION = "optimizer.lbfgs.search_direction"
41
+ LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES = "optimizer.lbfgs.past_loss_values"
42
+ LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA = "optimizer.lbfgs.memory_alpha"
43
+ LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS = "optimizer.lbfgs.memory_ys"
44
+ LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S = "optimizer.lbfgs.memory_s"
45
+ LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y = "optimizer.lbfgs.memory_y"
46
+
47
+ LLM_KV_TRAINING_TYPE_TRAIN_MODEL = "train_model"
48
+ LLM_KV_TRAINING_TYPE_FINETUNE_LORA = "finetune_lora"
49
+ LLM_KV_TRAINING_TYPE = "training.type"
50
+ LLM_KV_TRAINING_FILE_VERSION = "training.file_version"
51
+ LLM_KV_TRAINING_ITERATION_COUNT = "training.iteration_count"
52
+ LLM_KV_TRAINING_SAMPLE_COUNT = "training.sample_count"
53
+ LLM_KV_TRAINING_TOKEN_COUNT = "training.token_count"
54
+
55
+ LLM_KV_TRAINING_LORA_RANK_TOKEN_EMBD = "training.lora.rank.token_embd"
56
+ LLM_KV_TRAINING_LORA_RANK_OUTPUT_NORM = "training.lora.rank.output_norm"
57
+ LLM_KV_TRAINING_LORA_RANK_OUTPUT = "training.lora.rank.output"
58
+ LLM_KV_TRAINING_LORA_RANK_ATTN_NORM = "training.lora.rank.attn_norm"
59
+ LLM_KV_TRAINING_LORA_RANK_ATTN_Q = "training.lora.rank.attn_q"
60
+ LLM_KV_TRAINING_LORA_RANK_ATTN_K = "training.lora.rank.attn_k"
61
+ LLM_KV_TRAINING_LORA_RANK_ATTN_V = "training.lora.rank.attn_v"
62
+ LLM_KV_TRAINING_LORA_RANK_ATTN_OUT = "training.lora.rank.attn_output"
63
+ LLM_KV_TRAINING_LORA_RANK_FFN_NORM = "training.lora.rank.ffn_norm"
64
+ LLM_KV_TRAINING_LORA_RANK_FFN_GATE = "training.lora.rank.ffn_gate"
65
+ LLM_KV_TRAINING_LORA_RANK_FFN_DOWN = "training.lora.rank.ffn_down"
66
+ LLM_KV_TRAINING_LORA_RANK_FFN_UP = "training.lora.rank.ffn_up"
67
+
68
+ class Tensor:
69
+ def __init__(self, dtype='f', ne=None):
70
+ if ne is None:
71
+ ne = []
72
+ self.dtype = dtype
73
+ self.ne = ne
74
+ self.nbytes = 0
75
+ if self.dtype == 'f':
76
+ if len(self.ne) == 0:
77
+ self.nbytes = 0
78
+ else:
79
+ self.nbytes = int(np.prod(self.ne)) * 4
80
+ else:
81
+ raise ValueError(f"Unhandled data type '{self.dtype}'")
82
+
83
+ def load(self, data, offset):
84
+ nd = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
85
+ namelen = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
86
+ dtype = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
87
+
88
+ assert(nd == len(self.ne))
89
+ ne = []
90
+ for d in range(nd):
91
+ n = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
92
+ ne.append(n)
93
+
94
+ if tuple(ne) != tuple(self.ne):
95
+ raise ValueError(f"Tensor.load: Expected number of elements {str(self.ne)} does not match what is read from file {str(ne)}")
96
+
97
+ if self.dtype == 'f':
98
+ assert(dtype == 0)
99
+ else:
100
+ raise ValueError(f"Unhandled data type '{self.dtype}'")
101
+
102
+ self.name = bytes(data[offset:offset+namelen]); offset += namelen
103
+ # 32-byte alignment
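+ # (0 - offset) & 31 is the number of padding bytes needed to reach the next
+ # multiple of 32, e.g. offset 5 -> padding 27 -> new offset 32; offset 64 -> padding 0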
104
+ offset += (0 - offset) & 31
105
+ self.data = data[offset:offset+self.nbytes]
106
+ offset += self.nbytes
107
+ return offset
108
+
109
+ def max_storage_size(self):
110
+ result = 0
111
+ result += 4 # nd
112
+ result += 4 # namelen
113
+ result += 4 # dtype
114
+ result += len(self.ne)*8 # ne
115
+ result += 48 # name (maximum as of commit 3b5515bbe0e2224425986ba24f1f5d84aa38dce9)
116
+ result += 31 # 32-byte alignment
117
+ result += self.nbytes
118
+ return result
119
+
120
+ def save_gguf(self, gguf_writer, name):
121
+ gguf_writer.add_tensor(
122
+ name=name,
123
+ tensor=self.data,
124
+ raw_shape=np.array(list(reversed(self.ne))),
125
+ raw_dtype=gguf.GGMLQuantizationType.F32)
126
+
127
+ class OptimizationContext:
128
+ def __init__(self):
129
+ pass
130
+
131
+ def load(self, data, offset):
132
+ self.version = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]
133
+ offset += 4
134
+
135
+ if self.version != 1:
136
+ raise ValueError('Invalid version of optimization context in checkpoint file')
137
+
138
+ self.past = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
139
+ self.lbfgs_m = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
140
+ self.nx = struct.unpack('N', bytes(data[offset:offset + 8]))[0]; offset += 8
141
+ self.iter = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
142
+ self.just_initialized = bool(struct.unpack('<i', bytes(data[offset:offset + 4]))[0]); offset += 4
143
+
144
+ self.adam_m = Tensor('f', [self.nx])
145
+ self.adam_v = Tensor('f', [self.nx])
146
+ self.adam_pf = Tensor('f', [self.past] if self.past > 0 else [])
147
+
148
+ self.lbfgs_x = Tensor('f', [self.nx])
149
+ self.lbfgs_xp = Tensor('f', [self.nx])
150
+ self.lbfgs_g = Tensor('f', [self.nx])
151
+ self.lbfgs_gp = Tensor('f', [self.nx])
152
+ self.lbfgs_d = Tensor('f', [self.nx])
153
+ self.lbfgs_pf = Tensor('f', [self.past] if self.past > 0 else [])
154
+ self.lbfgs_lmal = Tensor('f', [self.lbfgs_m])
155
+ self.lbfgs_lmys = Tensor('f', [self.lbfgs_m])
156
+ self.lbfgs_lms = Tensor('f', [self.nx, self.lbfgs_m])
157
+ self.lbfgs_lmy = Tensor('f', [self.nx, self.lbfgs_m])
158
+
159
+ # forgot to save type in version 1:
160
+ # guess self.type from number of remaining bytes
161
+ size_type_0 = 12 + sum([t.max_storage_size() for t in
162
+ [self.adam_m, self.adam_v]
163
+ +([self.adam_pf] if (self.past > 0) else [])])
164
+ size_type_1 = 24 + sum([t.max_storage_size() for t in
165
+ [self.lbfgs_x, self.lbfgs_xp, self.lbfgs_g,
166
+ self.lbfgs_gp, self.lbfgs_d, self.lbfgs_pf,
167
+ self.lbfgs_lmal, self.lbfgs_lmys,
168
+ self.lbfgs_lms, self.lbfgs_lmy]
169
+ +([self.lbfgs_pf] if (self.past > 0) else [])])
170
+ # due to alignment padding the size might not be exact
171
+ # but the difference in size for both types is significant,
172
+ # so we can just use whichever is closest
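+ # (the adam state is roughly 2*nx floats, while the lbfgs state is roughly
+ # (5 + 2*lbfgs_m)*nx floats plus a few small tensors, so the two candidate
+ # sizes are far apart)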
173
+ remaining = len(data) - offset
174
+ if abs(remaining - size_type_0) < abs(remaining - size_type_1):
175
+ self.type = 0
176
+ else:
177
+ self.type = 1
178
+
179
+ if self.type == 0:
180
+ offset = self.adam_m.load(data, offset)
181
+ offset = self.adam_v.load(data, offset)
182
+ offset = self.adam_pf.load(data,offset)
183
+
184
+ self.adam_fx_best = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
185
+ self.adam_fx_prev = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
186
+ self.adam_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
187
+
188
+ elif self.type == 1:
189
+ offset = self.lbfgs_x.load(data, offset)
190
+ offset = self.lbfgs_xp.load(data, offset)
191
+ offset = self.lbfgs_g.load(data, offset)
192
+ offset = self.lbfgs_gp.load(data, offset)
193
+ offset = self.lbfgs_d.load(data, offset)
194
+ offset = self.lbfgs_pf.load(data, offset)
195
+ offset = self.lbfgs_lmal.load(data, offset)
196
+ offset = self.lbfgs_lmys.load(data, offset)
197
+ offset = self.lbfgs_lms.load(data, offset)
198
+ offset = self.lbfgs_lmy.load(data, offset)
199
+
200
+ self.lbfgs_fx_best = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
201
+ self.lbfgs_step = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
202
+ self.lbfgs_j = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
203
+ self.lbfgs_k = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
204
+ self.lbfgs_end = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
205
+ self.lbfgs_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
206
+
207
+ else:
208
+ raise ValueError(f"Invalid optimizer type '{self.type}'")
209
+
210
+ return offset
211
+
212
+ def save_gguf(self, gguf_writer):
213
+ gguf_writer.add_uint32(LLM_KV_OPTIMIZER_FILE_VERSION, 0)
214
+ gguf_writer.add_uint32(LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT, self.past)
215
+ gguf_writer.add_uint64(LLM_KV_OPTIMIZER_PARAMETER_COUNT, self.nx)
216
+ gguf_writer.add_uint32(LLM_KV_OPTIMIZER_ITERATION_COUNT, self.iter)
217
+ gguf_writer.add_bool(LLM_KV_OPTIMIZER_JUST_INITIALIZED, self.just_initialized)
218
+
219
+ if self.type == 0:
220
+ gguf_writer.add_string(LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_ADAM)
221
+ gguf_writer.add_float32(LLM_KV_OPTIMIZER_ADAM_BEST_LOSS, self.adam_fx_best)
222
+ gguf_writer.add_float32(LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS, self.adam_fx_prev)
223
+ gguf_writer.add_uint32(LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT, self.adam_n_no_improvement)
224
+
225
+ self.adam_m.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS)
226
+ self.adam_v.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS)
227
+ if self.past > 0:
228
+ self.adam_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES)
229
+
230
+ elif self.type == 1:
231
+ gguf_writer.add_string(LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_LBFGS)
232
+ gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT, self.lbfgs_m)
233
+ gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS, self.lbfgs_fx_best)
234
+ gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP, self.lbfgs_step)
235
+ gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J, self.lbfgs_j)
236
+ gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K, self.lbfgs_k)
237
+ gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END, self.lbfgs_end)
238
+ gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT, self.lbfgs_n_no_improvement)
239
+
240
+ self.lbfgs_x.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS)
241
+ self.lbfgs_xp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS)
242
+ self.lbfgs_g.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS)
243
+ self.lbfgs_gp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS)
244
+ self.lbfgs_d.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION)
245
+ if self.past > 0:
246
+ self.lbfgs_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES)
247
+ self.lbfgs_lmal.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA)
248
+ self.lbfgs_lmys.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS)
249
+ self.lbfgs_lms.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S)
250
+ self.lbfgs_lmy.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y)
251
+ else:
252
+ raise ValueError('Unknown optimizer type')
253
+
254
+ class LoraParams:
255
+ def __init__(self):
256
+ pass
257
+
258
+ def load(self, data, offset):
259
+ self.n_rank_attention_norm = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
260
+ self.n_rank_wq = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
261
+ self.n_rank_wk = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
262
+ self.n_rank_wv = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
263
+ self.n_rank_wo = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
264
+ self.n_rank_ffn_norm = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
265
+ self.n_rank_w1 = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
266
+ self.n_rank_w2 = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
267
+ self.n_rank_w3 = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
268
+ self.n_rank_tok_embeddings = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
269
+ self.n_rank_norm = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
270
+ self.n_rank_output = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
271
+ return offset
272
+
273
+ def save_gguf(self, gguf_writer):
274
+ gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_TOKEN_EMBD, self.n_rank_tok_embeddings)
275
+ gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_OUTPUT_NORM, self.n_rank_norm)
276
+ gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_OUTPUT, self.n_rank_output)
277
+ gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_NORM, self.n_rank_attention_norm)
278
+ gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_Q, self.n_rank_wq)
279
+ gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_K, self.n_rank_wk)
280
+ gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_V, self.n_rank_wv)
281
+ gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_OUT, self.n_rank_wo)
282
+ gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_FFN_NORM, self.n_rank_ffn_norm)
283
+ gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_FFN_GATE, self.n_rank_w1)
284
+ gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_FFN_DOWN, self.n_rank_w2)
285
+ gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_FFN_UP, self.n_rank_w3)
286
+
287
+ class ModelParams:
288
+ def __init__(self, n_ff = None):
289
+ self.n_ff = n_ff
290
+
291
+ def load(self, data, offset):
292
+ self.n_vocab = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
293
+ self.n_embd = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
294
+ self.n_mult = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
295
+ self.n_head = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
296
+ self.n_layer = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
297
+ self.n_rot = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
298
+ return offset
299
+
300
+ def get_n_ff(self):
301
+ if self.n_ff is None:
302
+ # struct my_llama_model::get_n_ff in train-text-from-scratch.cpp commit 3b5515bbe0e2224425986ba24f1f5d84aa38dce9
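+ # e.g. n_embd=4096, n_mult=256: 2*(4*4096)//3 = 10922, rounded up to the
+ # next multiple of n_mult gives 11008 (the llama-7b feed-forward size)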
303
+ return ((2*(4*self.n_embd)//3 + self.n_mult - 1)//self.n_mult)*self.n_mult
304
+ else:
305
+ return self.n_ff
306
+
307
+ def save_gguf(self, gguf_writer):
308
+ # self.n_vocab not saved
309
+ gguf_writer.add_embedding_length(self.n_embd)
310
+ gguf_writer.add_head_count(self.n_head)
311
+ gguf_writer.add_block_count(self.n_layer)
312
+ gguf_writer.add_rope_dimension_count(self.n_rot)
313
+ gguf_writer.add_feed_forward_length(self.get_n_ff())
314
+
315
+ def tensor_name(key, bid=None, suffix=".weight"):
316
+ return gguf.MODEL_TENSOR_NAMES[gguf.MODEL_ARCH.LLAMA][key].format(bid=bid) + suffix
317
+
318
+ class Layer:
319
+ def __init__(self, params, lora_params, bid):
320
+ self.bid = bid
321
+ self.att_norm_a = Tensor('f', [lora_params.n_rank_attention_norm, params.n_embd])
322
+ self.att_norm_b = Tensor('f', [lora_params.n_rank_attention_norm, 1])
323
+ self.wq_a = Tensor('f', [lora_params.n_rank_wq, params.n_embd])
324
+ self.wq_b = Tensor('f', [lora_params.n_rank_wq, params.n_embd])
325
+ self.wk_a = Tensor('f', [lora_params.n_rank_wk, params.n_embd])
326
+ self.wk_b = Tensor('f', [lora_params.n_rank_wk, params.n_embd])
327
+ self.wv_a = Tensor('f', [lora_params.n_rank_wv, params.n_embd])
328
+ self.wv_b = Tensor('f', [lora_params.n_rank_wv, params.n_embd])
329
+ self.wo_a = Tensor('f', [lora_params.n_rank_wo, params.n_embd])
330
+ self.wo_b = Tensor('f', [lora_params.n_rank_wo, params.n_embd])
331
+ self.ffn_norm_a = Tensor('f', [lora_params.n_rank_ffn_norm, params.n_embd])
332
+ self.ffn_norm_b = Tensor('f', [lora_params.n_rank_ffn_norm, 1])
333
+ self.w1_a = Tensor('f', [lora_params.n_rank_w1, params.n_embd])
334
+ self.w1_b = Tensor('f', [lora_params.n_rank_w1, params.get_n_ff()])
335
+ self.w2_a = Tensor('f', [lora_params.n_rank_w2, params.get_n_ff()])
336
+ self.w2_b = Tensor('f', [lora_params.n_rank_w2, params.n_embd])
337
+ self.w3_a = Tensor('f', [lora_params.n_rank_w3, params.n_embd])
338
+ self.w3_b = Tensor('f', [lora_params.n_rank_w3, params.get_n_ff()])
339
+
340
+ def load(self, data, offset):
341
+ offset = self.att_norm_a.load(data, offset)
342
+ offset = self.att_norm_b.load(data, offset)
343
+ offset = self.wq_a.load(data, offset)
344
+ offset = self.wq_b.load(data, offset)
345
+ offset = self.wk_a.load(data, offset)
346
+ offset = self.wk_b.load(data, offset)
347
+ offset = self.wv_a.load(data, offset)
348
+ offset = self.wv_b.load(data, offset)
349
+ offset = self.wo_a.load(data, offset)
350
+ offset = self.wo_b.load(data, offset)
351
+ offset = self.ffn_norm_a.load(data, offset)
352
+ offset = self.ffn_norm_b.load(data, offset)
353
+ offset = self.w1_a.load(data, offset)
354
+ offset = self.w1_b.load(data, offset)
355
+ offset = self.w2_a.load(data, offset)
356
+ offset = self.w2_b.load(data, offset)
357
+ offset = self.w3_a.load(data, offset)
358
+ offset = self.w3_b.load(data, offset)
359
+ return offset
360
+
361
+ def save_gguf(self, gguf_writer):
362
+ self.att_norm_a.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_NORM, self.bid, ".weight.lora_a"))
363
+ self.att_norm_b.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_NORM, self.bid, ".weight.lora_b"))
364
+ self.wq_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_Q, self.bid, ".weight.lora_a"))
365
+ self.wq_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_Q, self.bid, ".weight.lora_b"))
366
+ self.wk_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_K, self.bid, ".weight.lora_a"))
367
+ self.wk_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_K, self.bid, ".weight.lora_b"))
368
+ self.wv_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_V, self.bid, ".weight.lora_a"))
369
+ self.wv_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_V, self.bid, ".weight.lora_b"))
370
+ self.wo_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, self.bid, ".weight.lora_a"))
371
+ self.wo_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, self.bid, ".weight.lora_b"))
372
+ self.ffn_norm_a.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_NORM, self.bid, ".weight.lora_a"))
373
+ self.ffn_norm_b.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_NORM, self.bid, ".weight.lora_b"))
374
+ self.w1_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_GATE, self.bid, ".weight.lora_a"))
375
+ self.w1_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_GATE, self.bid, ".weight.lora_b"))
376
+ self.w2_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_DOWN, self.bid, ".weight.lora_a"))
377
+ self.w2_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_DOWN, self.bid, ".weight.lora_b"))
378
+ self.w3_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_UP, self.bid, ".weight.lora_a"))
379
+ self.w3_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_UP, self.bid, ".weight.lora_b"))
380
+
381
+ class LoraModel:
382
+ def __init__(self, n_ff = None):
383
+ self.params = ModelParams(n_ff = n_ff)
384
+ self.lora_params = LoraParams()
385
+ self.layers = []
386
+
387
+ def load(self, data, offset):
388
+ offset = self.params.load(data, offset)
389
+ offset = self.lora_params.load(data, offset)
390
+
391
+ self.tok_embd_a = Tensor('f', [self.lora_params.n_rank_tok_embeddings, self.params.n_embd])
392
+ self.tok_embd_b = Tensor('f', [self.lora_params.n_rank_tok_embeddings, self.params.n_vocab])
393
+ self.norm_a = Tensor('f', [self.lora_params.n_rank_norm, self.params.n_embd])
394
+ self.norm_b = Tensor('f', [self.lora_params.n_rank_norm, 1])
395
+ self.output_a = Tensor('f', [self.lora_params.n_rank_output, self.params.n_embd])
396
+ self.output_b = Tensor('f', [self.lora_params.n_rank_output, self.params.n_vocab])
397
+
398
+ offset = self.tok_embd_a.load(data, offset)
399
+ offset = self.tok_embd_b.load(data, offset)
400
+ offset = self.norm_a.load(data, offset)
401
+ offset = self.norm_b.load(data, offset)
402
+ offset = self.output_a.load(data, offset)
403
+ offset = self.output_b.load(data, offset)
404
+
405
+ self.layers.clear()
406
+ for bid in range(self.params.n_layer):
407
+ layer = Layer(self.params, self.lora_params, bid)
408
+ offset = layer.load(data, offset)
409
+ self.layers.append(layer)
410
+
411
+ return offset
412
+
413
+ def save_gguf(self, gguf_writer):
414
+ self.params.save_gguf(gguf_writer)
415
+ self.lora_params.save_gguf(gguf_writer)
416
+
417
+ self.tok_embd_a.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD, suffix=".weight.lora_a"))
418
+ self.tok_embd_b.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD, suffix=".weight.lora_b"))
419
+ self.norm_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT_NORM, suffix=".weight.lora_a"))
420
+ self.norm_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT_NORM, suffix=".weight.lora_b"))
421
+ self.output_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT, suffix=".weight.lora_a"))
422
+ self.output_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT, suffix=".weight.lora_b"))
423
+
424
+ for layer in self.layers:
425
+ layer.save_gguf(gguf_writer)
426
+
427
+ class LoraCheckpoint:
428
+ def __init__(self, n_ff = None):
429
+ self.model = LoraModel(n_ff = n_ff)
430
+ self.opt_ctx = OptimizationContext()
431
+
432
+ def load(self, data, offset):
433
+ magic = bytes(reversed(data[offset:offset + 4])); offset += 4
434
+ if magic != b'ggcl':
435
+ raise ValueError(f"File header magic indicates that this is not a finetune-lora checkpoint file. Expected 'ggcl', got '{str(magic)}'")
436
+
437
+ self.version = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
438
+ if self.version != 0:
439
+ raise ValueError('Invalid version of checkpoint file')
440
+
441
+ self.train_its = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
442
+ self.train_samples = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
443
+ self.train_tokens = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
444
+
445
+ offset = self.model.load(data, offset)
446
+ offset = self.opt_ctx.load(data, offset)
447
+
448
+ return offset
449
+
450
+ def save_gguf(self, gguf_writer):
451
+ gguf_writer.add_file_type(gguf.GGMLQuantizationType.F32)
452
+ gguf_writer.add_layer_norm_rms_eps(1e-5)
453
+ gguf_writer.add_uint32(LLM_KV_TRAINING_FILE_VERSION, 0)
454
+ gguf_writer.add_string(LLM_KV_TRAINING_TYPE, LLM_KV_TRAINING_TYPE_FINETUNE_LORA)
455
+ gguf_writer.add_uint32(LLM_KV_TRAINING_ITERATION_COUNT, self.train_its)
456
+ gguf_writer.add_uint32(LLM_KV_TRAINING_SAMPLE_COUNT, self.train_samples)
457
+ gguf_writer.add_uint32(LLM_KV_TRAINING_TOKEN_COUNT, self.train_tokens)
458
+ self.model.save_gguf(gguf_writer)
459
+ self.opt_ctx.save_gguf(gguf_writer)
460
+
461
+ def handle_args():
462
+ parser = argparse.ArgumentParser(description = 'Convert finetune checkpoints to GGUF')
463
+ parser.add_argument('--input', '-i', type = Path, help = 'Input finetune checkpoint filename', required=True)
464
+ parser.add_argument('--output', '-o', type = Path, help = 'Output GGUF filename', required=True)
465
+ parser.add_argument('--ff', type = int, help = "Feedforward size; if not provided it is computed from n_mult. Provide this if you get 'ValueError: Tensor.load: Expected number of elements does not match what is read from file'", required=False)
466
+ return parser.parse_args()
467
+
468
+ def main():
469
+ cfg = handle_args()
470
+ print(cfg)
471
+ data = np.memmap(cfg.input, mode = 'r')
472
+ chk = LoraCheckpoint(n_ff = cfg.ff)
473
+ offset = 0
474
+ offset = chk.load(data, offset)
475
+ # we should have read all available data
476
+ assert(offset == len(data))
477
+
478
+ gguf_writer = gguf.GGUFWriter(cfg.output, gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA], use_temp_file = False)
479
+ chk.save_gguf(gguf_writer)
480
+ print(" gguf: write header")
481
+ gguf_writer.write_header_to_file()
482
+ print(" gguf: write metadata")
483
+ gguf_writer.write_kv_data_to_file()
484
+ print(" gguf: write tensors")
485
+ gguf_writer.write_tensors_to_file()
486
+ gguf_writer.close()
487
+
488
+ if __name__ == '__main__':
489
+ main()
examples/finetune/finetune.cpp ADDED
@@ -0,0 +1,1940 @@
1
+ #include "ggml.h"
2
+ #include "ggml-alloc.h"
3
+ #include "llama.h"
4
+ #include "common.h"
5
+ #include "train.h"
6
+ #include <unordered_map>
7
+ #include <vector>
8
+ #include <cassert>
9
+ #include <climits>
10
+ #include <cstring>
11
+ #include <cstdarg>
12
+ #include <ctime>
13
+ #include <random>
14
+ #include <stdexcept>
15
+ #include <algorithm>
16
+ #include <string>
17
+
18
+ #if defined(_MSC_VER)
19
+ #pragma warning(disable: 4244 4267) // possible loss of data
20
+ #endif
21
+
22
+ static const size_t tensor_alignment = 32;
23
+
24
+ struct my_llama_hparams {
25
+ uint32_t n_vocab = 32000;
26
+ uint32_t n_ctx = 512;
27
+ uint32_t n_embd = 4096;
28
+ uint32_t n_ff = 11008;
29
+ uint32_t n_head = 32;
30
+ uint32_t n_head_kv = 32;
31
+ uint32_t n_layer = 32;
32
+
33
+ // float f_norm_eps = 1e-5f; // falcon
34
+ float f_norm_rms_eps = 1e-5f; // llama
35
+
36
+ float rope_freq_base = 10000.0f;
37
+ float rope_freq_scale = 1.0f;
38
+
39
+ uint32_t n_gqa() const {
40
+ return n_head/n_head_kv;
41
+ }
42
+
43
+ uint32_t n_embd_head() const {
44
+ return n_embd/n_head;
45
+ }
46
+
47
+ uint32_t n_embd_gqa() const {
48
+ return n_embd/n_gqa();
49
+ }
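+ // e.g. with the defaults above (n_embd=4096, n_head=32, n_head_kv=32):
+ // n_embd_head() = 128, n_gqa() = 1, n_embd_gqa() = 4096; a grouped-query
+ // model with n_head_kv=8 would instead give n_gqa() = 4, n_embd_gqa() = 1024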
50
+
51
+ bool operator!=(const my_llama_hparams& other) const {
52
+ return memcmp(this, &other, sizeof(other));
53
+ }
54
+ };
55
+
56
+ struct my_llama_layer {
57
+ // normalization
58
+ struct ggml_tensor * attention_norm;
59
+
60
+ // attention
61
+ struct ggml_tensor * wq;
62
+ struct ggml_tensor * wk;
63
+ struct ggml_tensor * wv;
64
+ struct ggml_tensor * wo;
65
+
66
+ // normalization
67
+ struct ggml_tensor * ffn_norm;
68
+
69
+ // ff
70
+ struct ggml_tensor * w1;
71
+ struct ggml_tensor * w2;
72
+ struct ggml_tensor * w3;
73
+ };
74
+
75
+ struct my_llama_model {
76
+ struct my_llama_hparams hparams;
77
+
78
+ struct ggml_tensor * tok_embeddings;
79
+
80
+ struct ggml_tensor * norm;
81
+ struct ggml_tensor * output;
82
+
83
+ std::vector<my_llama_layer> layers;
84
+ };
85
+
86
+ struct my_llama_lora_hparams {
87
+ uint32_t lora_r = 1;
88
+ uint32_t lora_alpha = 1;
89
+ uint32_t n_rank_attention_norm = 1;
90
+ uint32_t n_rank_wq = 4;
91
+ uint32_t n_rank_wk = 4;
92
+ uint32_t n_rank_wv = 4;
93
+ uint32_t n_rank_wo = 4;
94
+ uint32_t n_rank_ffn_norm = 1;
95
+ uint32_t n_rank_w1 = 4;
96
+ uint32_t n_rank_w2 = 4;
97
+ uint32_t n_rank_w3 = 4;
98
+ uint32_t n_rank_tok_embeddings = 4;
99
+ uint32_t n_rank_norm = 1;
100
+ uint32_t n_rank_output = 4;
101
+
102
+ bool operator!=(const my_llama_lora_hparams& other) const {
103
+ return memcmp(this, &other, sizeof(other));
104
+ }
105
+ };
106
+
107
+ struct my_llama_lora_layer {
108
+ // normalization
109
+ struct ggml_tensor * attention_norm_a;
110
+ struct ggml_tensor * attention_norm_b;
111
+
112
+ // attention
113
+ struct ggml_tensor * wq_a;
114
+ struct ggml_tensor * wq_b;
115
+ struct ggml_tensor * wk_a;
116
+ struct ggml_tensor * wk_b;
117
+ struct ggml_tensor * wv_a;
118
+ struct ggml_tensor * wv_b;
119
+ struct ggml_tensor * wo_a;
120
+ struct ggml_tensor * wo_b;
121
+
122
+ // normalization
123
+ struct ggml_tensor * ffn_norm_a;
124
+ struct ggml_tensor * ffn_norm_b;
125
+
126
+ // ff
127
+ struct ggml_tensor * w1_a;
128
+ struct ggml_tensor * w1_b;
129
+ struct ggml_tensor * w2_a;
130
+ struct ggml_tensor * w2_b;
131
+ struct ggml_tensor * w3_a;
132
+ struct ggml_tensor * w3_b;
133
+ };
134
+
135
+ struct my_llama_lora {
136
+ struct ggml_context * ctx = NULL;
137
+ std::vector<uint8_t> data;
138
+
139
+ my_llama_lora_hparams hparams;
140
+
141
+ struct ggml_tensor * tok_embeddings_a;
142
+ struct ggml_tensor * tok_embeddings_b;
143
+
144
+ struct ggml_tensor * norm_a;
145
+ struct ggml_tensor * norm_b;
146
+ struct ggml_tensor * output_a;
147
+ struct ggml_tensor * output_b;
148
+
149
+ std::vector<my_llama_lora_layer> layers;
150
+ };
151
+
152
+ // gguf constants
153
+ static const char * LLM_KV_TRAINING_TYPE_FINETUNE_LORA = "finetune_lora";
154
+ static const char * LLM_KV_TRAINING_TYPE = "training.type";
155
+
156
+ static const char * LLM_KV_TRAINING_LORA_RANK_TOKEN_EMBD = "training.lora.rank.token_embd";
157
+ static const char * LLM_KV_TRAINING_LORA_RANK_OUTPUT_NORM = "training.lora.rank.output_norm";
158
+ static const char * LLM_KV_TRAINING_LORA_RANK_OUTPUT = "training.lora.rank.output";
159
+ static const char * LLM_KV_TRAINING_LORA_RANK_ATTN_NORM = "training.lora.rank.attn_norm";
160
+ static const char * LLM_KV_TRAINING_LORA_RANK_ATTN_Q = "training.lora.rank.attn_q";
161
+ static const char * LLM_KV_TRAINING_LORA_RANK_ATTN_K = "training.lora.rank.attn_k";
162
+ static const char * LLM_KV_TRAINING_LORA_RANK_ATTN_V = "training.lora.rank.attn_v";
163
+ static const char * LLM_KV_TRAINING_LORA_RANK_ATTN_OUT = "training.lora.rank.attn_output";
164
+ static const char * LLM_KV_TRAINING_LORA_RANK_FFN_NORM = "training.lora.rank.ffn_norm";
165
+ static const char * LLM_KV_TRAINING_LORA_RANK_FFN_GATE = "training.lora.rank.ffn_gate";
166
+ static const char * LLM_KV_TRAINING_LORA_RANK_FFN_DOWN = "training.lora.rank.ffn_down";
167
+ static const char * LLM_KV_TRAINING_LORA_RANK_FFN_UP = "training.lora.rank.ffn_up";
168
+
169
+ // gguf constants (sync with gguf.py)
170
+
171
+ static const char * LLM_KV_GENERAL_ARCHITECTURE = "general.architecture";
172
+ static const char * LLM_KV_GENERAL_FILE_TYPE = "general.file_type";
173
+
174
+ static const char * LLM_KV_CONTEXT_LENGTH = "%s.context_length";
175
+ static const char * LLM_KV_EMBEDDING_LENGTH = "%s.embedding_length";
176
+ static const char * LLM_KV_BLOCK_COUNT = "%s.block_count";
177
+ static const char * LLM_KV_FEED_FORWARD_LENGTH = "%s.feed_forward_length";
178
+ static const char * LLM_KV_ATTENTION_HEAD_COUNT = "%s.attention.head_count";
179
+ static const char * LLM_KV_ATTENTION_HEAD_COUNT_KV = "%s.attention.head_count_kv";
180
+ static const char * LLM_KV_ATTENTION_LAYERNORM_RMS_EPS = "%s.attention.layer_norm_rms_epsilon";
181
+ static const char * LLM_KV_ROPE_DIMENSION_COUNT = "%s.rope.dimension_count";
182
+ static const char * LLM_KV_ROPE_FREQ_BASE = "%s.rope.freq_base"; // TODO load in llama.cpp
183
+ static const char * LLM_KV_ROPE_SCALE_LINEAR = "%s.rope.scale_linear";
184
+
185
+ static const char * LLM_TENSOR_TOKEN_EMBD = "token_embd";
186
+ static const char * LLM_TENSOR_OUTPUT_NORM = "output_norm";
187
+ static const char * LLM_TENSOR_OUTPUT = "output";
188
+ static const char * LLM_TENSOR_ATTN_NORM = "blk.%d.attn_norm";
189
+ static const char * LLM_TENSOR_ATTN_Q = "blk.%d.attn_q";
190
+ static const char * LLM_TENSOR_ATTN_K = "blk.%d.attn_k";
191
+ static const char * LLM_TENSOR_ATTN_V = "blk.%d.attn_v";
192
+ static const char * LLM_TENSOR_ATTN_OUT = "blk.%d.attn_output";
193
+ static const char * LLM_TENSOR_FFN_NORM = "blk.%d.ffn_norm";
194
+ static const char * LLM_TENSOR_FFN_GATE = "blk.%d.ffn_gate";
195
+ static const char * LLM_TENSOR_FFN_DOWN = "blk.%d.ffn_down";
196
+ static const char * LLM_TENSOR_FFN_UP = "blk.%d.ffn_up";
197
+
198
+ static void print_params(struct my_llama_hparams * params) {
199
+ printf("%s: n_vocab: %u\n", __func__, params->n_vocab);
200
+ printf("%s: n_ctx: %u\n", __func__, params->n_ctx);
201
+ printf("%s: n_embd: %u\n", __func__, params->n_embd);
202
+ printf("%s: n_ff: %u\n", __func__, params->n_ff);
203
+ printf("%s: n_head: %u\n", __func__, params->n_head);
204
+ printf("%s: n_head_kv: %u\n", __func__, params->n_head_kv);
205
+ printf("%s: n_layer: %u\n", __func__, params->n_layer);
206
+ printf("%s: norm_rms_eps : %f\n", __func__, params->f_norm_rms_eps);
207
+ printf("%s: rope_freq_base : %f\n", __func__, params->rope_freq_base);
208
+ printf("%s: rope_freq_scale : %f\n", __func__, params->rope_freq_scale);
209
+ }
210
+
211
+ static void print_lora_params(struct my_llama_lora_hparams * params) {
212
+ printf("%s: n_rank_attention_norm : %u\n", __func__, params->n_rank_attention_norm);
213
+ printf("%s: n_rank_wq : %u\n", __func__, params->n_rank_wq);
214
+ printf("%s: n_rank_wk : %u\n", __func__, params->n_rank_wk);
215
+ printf("%s: n_rank_wv : %u\n", __func__, params->n_rank_wv);
216
+ printf("%s: n_rank_wo : %u\n", __func__, params->n_rank_wo);
217
+ printf("%s: n_rank_ffn_norm : %u\n", __func__, params->n_rank_ffn_norm);
218
+ printf("%s: n_rank_w1 : %u\n", __func__, params->n_rank_w1);
219
+ printf("%s: n_rank_w2 : %u\n", __func__, params->n_rank_w2);
220
+ printf("%s: n_rank_w3 : %u\n", __func__, params->n_rank_w3);
221
+ printf("%s: n_rank_tok_embeddings : %u\n", __func__, params->n_rank_tok_embeddings);
222
+ printf("%s: n_rank_norm : %u\n", __func__, params->n_rank_norm);
223
+ printf("%s: n_rank_output : %u\n", __func__, params->n_rank_output);
224
+ }
225
+
226
+ #define GGUF_GET_KEY(ctx, dst, func, type, req, key) \
227
+ { \
228
+ const std::string skey(key); \
229
+ const int kid = gguf_find_key(ctx, skey.c_str()); \
230
+ if (kid >= 0) { \
231
+ enum gguf_type ktype = gguf_get_kv_type(ctx, kid); \
232
+ if (ktype != (type)) { \
233
+ die_fmt("key %s has wrong type: %s", skey.c_str(), gguf_type_name(ktype)); \
234
+ } \
235
+ (dst) = func(ctx, kid); \
236
+ } else if (req) { \
237
+ die_fmt("key not found in model: %s", skey.c_str()); \
238
+ } \
239
+ }
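+ // GGUF_GET_KEY (above): look up `key` in the gguf context, check that its gguf
+ // type matches `type` and read it into `dst` via `func`; if the key is missing
+ // and `req` is true, abort with die_fmt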
240
+
241
+ static void load_model_hparams_gguf(struct gguf_context * ctx, struct my_llama_hparams * hparams, const char * expected_arch) {
242
+ std::string arch;
243
+
244
+ GGUF_GET_KEY(ctx, arch, gguf_get_val_str, GGUF_TYPE_STRING, true, LLM_KV_GENERAL_ARCHITECTURE);
245
+ if (expected_arch != NULL) {
246
+ if (arch != expected_arch) {
247
+ printf("%s: arch=%s expected_arch=%s\n", __func__, arch.c_str(), expected_arch);
248
+ }
249
+ GGML_ASSERT(arch == expected_arch);
250
+ }
251
+
252
+ std::vector<char> keybuf;
253
+ keybuf.resize(512);
254
+ auto kv = [&arch, &keybuf](const char * key) -> const char * {
255
+ snprintf(keybuf.data(), keybuf.size(), key, arch.c_str());
256
+ return keybuf.data();
257
+ };
258
+
259
+ GGUF_GET_KEY(ctx, hparams->n_embd, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_EMBEDDING_LENGTH));
260
+ GGUF_GET_KEY(ctx, hparams->n_ctx, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_CONTEXT_LENGTH));
261
+ GGUF_GET_KEY(ctx, hparams->n_ff, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_FEED_FORWARD_LENGTH));
262
+ GGUF_GET_KEY(ctx, hparams->n_head, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_ATTENTION_HEAD_COUNT));
263
+ GGUF_GET_KEY(ctx, hparams->n_layer, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_BLOCK_COUNT));
264
+
265
+ // n_head_kv is optional, default to n_head
266
+ hparams->n_head_kv = hparams->n_head;
267
+ GGUF_GET_KEY(ctx, hparams->n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV));
268
+
269
+ float rope_freq_scale = 1.0f;
270
+ GGUF_GET_KEY(ctx, hparams->f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
271
+ GGUF_GET_KEY(ctx, hparams->rope_freq_base, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE));
272
+ GGUF_GET_KEY(ctx, rope_freq_scale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR));
273
+ if (rope_freq_scale != 1.0f) {
274
+ hparams->rope_freq_scale = 1.0f / rope_freq_scale;
275
+ }
276
+ }
277
+
278
+ static void init_model(struct llama_model * input, struct my_llama_model * model, const char * fn_model, uint32_t n_ctx) {
279
+ auto & hparams = model->hparams;
280
+
281
+ std::vector<char> tn_buf;
282
+ tn_buf.resize(GGML_MAX_NAME);
283
+ auto tn = [&tn_buf](const char * key) -> const char * {
284
+ snprintf(tn_buf.data(), tn_buf.size(), "%s.weight", key);
285
+ return tn_buf.data();
286
+ };
287
+ auto tni = [&tn_buf](const char * key, int bid) -> const char * {
288
+ snprintf(tn_buf.data(), tn_buf.size(), key, bid);
289
+ std::string s = tn_buf.data();
290
+ snprintf(tn_buf.data(), tn_buf.size(), "%s.weight", s.c_str());
291
+ return tn_buf.data();
292
+ };
293
+
294
+
295
+ // get parameters directly from gguf file
296
+ {
297
+ struct gguf_init_params params = {
298
+ /*.no_alloc = */ false,
299
+ /*.ctx = */ NULL,
300
+ };
301
+ struct gguf_context * mctx = gguf_init_from_file(fn_model, params);
302
+
303
+ load_model_hparams_gguf(mctx, &hparams, "llama");
304
+
305
+ gguf_free(mctx);
306
+ }
307
+ hparams.n_vocab = llama_n_vocab(input);
308
+ hparams.n_ctx = n_ctx;
309
+
310
+ // get tensors from llama_model (possibly mmapped)
311
+ model->tok_embeddings = llama_get_model_tensor(input, tn(LLM_TENSOR_TOKEN_EMBD));
312
+ model->norm = llama_get_model_tensor(input, tn(LLM_TENSOR_OUTPUT_NORM));
313
+ model->output = llama_get_model_tensor(input, tn(LLM_TENSOR_OUTPUT));
314
+
315
+ assert_shape_2d(model->tok_embeddings, hparams.n_embd, hparams.n_vocab);
316
+ assert_shape_1d(model->norm, hparams.n_embd);
317
+ assert_shape_2d(model->output, hparams.n_embd, hparams.n_vocab);
318
+
319
+ model->layers.resize(hparams.n_layer);
320
+ for (uint32_t i = 0; i < hparams.n_layer; ++i) {
321
+ auto & layer = model->layers[i];
322
+
323
+ layer.attention_norm = llama_get_model_tensor(input, tni(LLM_TENSOR_ATTN_NORM, i));
324
+ layer.wq = llama_get_model_tensor(input, tni(LLM_TENSOR_ATTN_Q, i));
325
+ layer.wk = llama_get_model_tensor(input, tni(LLM_TENSOR_ATTN_K, i));
326
+ layer.wv = llama_get_model_tensor(input, tni(LLM_TENSOR_ATTN_V, i));
327
+ layer.wo = llama_get_model_tensor(input, tni(LLM_TENSOR_ATTN_OUT, i));
328
+ layer.ffn_norm = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_NORM, i));
329
+ layer.w1 = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_GATE, i));
330
+ layer.w2 = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_DOWN, i));
331
+ layer.w3 = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_UP, i));
332
+
333
+ assert_shape_1d(layer.attention_norm, hparams.n_embd);
334
+ assert_shape_2d(layer.wq, hparams.n_embd, hparams.n_embd);
335
+ assert_shape_2d(layer.wk, hparams.n_embd, hparams.n_embd);
336
+ assert_shape_2d(layer.wv, hparams.n_embd, hparams.n_embd);
337
+ assert_shape_2d(layer.wo, hparams.n_embd, hparams.n_embd);
338
+ assert_shape_1d(layer.ffn_norm, hparams.n_embd);
339
+ assert_shape_2d(layer.w1, hparams.n_embd, hparams.n_ff);
340
+ assert_shape_2d(layer.w2, hparams.n_ff, hparams.n_embd);
341
+ assert_shape_2d(layer.w3, hparams.n_embd, hparams.n_ff);
342
+ }
343
+ }
344
+
345
+ static void set_param_lora(struct my_llama_lora * lora) {
346
+ const uint32_t n_layer = lora->layers.size();
347
+
348
+ struct ggml_context* ctx = lora->ctx;
349
+
350
+ ggml_set_param(ctx, lora->tok_embeddings_a);
351
+ ggml_set_param(ctx, lora->tok_embeddings_b);
352
+ ggml_set_param(ctx, lora->norm_a);
353
+ ggml_set_param(ctx, lora->norm_b);
354
+ ggml_set_param(ctx, lora->output_a);
355
+ ggml_set_param(ctx, lora->output_b);
356
+
357
+ for (uint32_t i = 0; i < n_layer; ++i) {
358
+ auto & layer = lora->layers[i];
359
+
360
+ ggml_set_param(ctx, layer.attention_norm_a);
361
+ ggml_set_param(ctx, layer.attention_norm_b);
362
+ ggml_set_param(ctx, layer.wq_a);
363
+ ggml_set_param(ctx, layer.wq_b);
364
+ ggml_set_param(ctx, layer.wk_a);
365
+ ggml_set_param(ctx, layer.wk_b);
366
+ ggml_set_param(ctx, layer.wv_a);
367
+ ggml_set_param(ctx, layer.wv_b);
368
+ ggml_set_param(ctx, layer.wo_a);
369
+ ggml_set_param(ctx, layer.wo_b);
370
+ ggml_set_param(ctx, layer.ffn_norm_a);
371
+ ggml_set_param(ctx, layer.ffn_norm_b);
372
+ ggml_set_param(ctx, layer.w1_a);
373
+ ggml_set_param(ctx, layer.w1_b);
374
+ ggml_set_param(ctx, layer.w2_a);
375
+ ggml_set_param(ctx, layer.w2_b);
376
+ ggml_set_param(ctx, layer.w3_a);
377
+ ggml_set_param(ctx, layer.w3_b);
378
+ }
379
+ }
380
+
381
+ static void alloc_lora(struct ggml_allocr * alloc, struct my_llama_lora * lora) {
382
+ ggml_allocr_alloc(alloc, lora->tok_embeddings_a);
383
+ ggml_allocr_alloc(alloc, lora->tok_embeddings_b);
384
+ ggml_allocr_alloc(alloc, lora->norm_a);
385
+ ggml_allocr_alloc(alloc, lora->norm_b);
386
+ ggml_allocr_alloc(alloc, lora->output_a);
387
+ ggml_allocr_alloc(alloc, lora->output_b);
388
+ for (uint32_t i = 0; i < lora->layers.size(); ++i) {
389
+ auto & layer = lora->layers[i];
390
+ ggml_allocr_alloc(alloc, layer.attention_norm_a);
391
+ ggml_allocr_alloc(alloc, layer.attention_norm_b);
392
+ ggml_allocr_alloc(alloc, layer.wq_a);
393
+ ggml_allocr_alloc(alloc, layer.wq_b);
394
+ ggml_allocr_alloc(alloc, layer.wk_a);
395
+ ggml_allocr_alloc(alloc, layer.wk_b);
396
+ ggml_allocr_alloc(alloc, layer.wv_a);
397
+ ggml_allocr_alloc(alloc, layer.wv_b);
398
+ ggml_allocr_alloc(alloc, layer.wo_a);
399
+ ggml_allocr_alloc(alloc, layer.wo_b);
400
+ ggml_allocr_alloc(alloc, layer.ffn_norm_a);
401
+ ggml_allocr_alloc(alloc, layer.ffn_norm_b);
402
+ ggml_allocr_alloc(alloc, layer.w1_a);
403
+ ggml_allocr_alloc(alloc, layer.w1_b);
404
+ ggml_allocr_alloc(alloc, layer.w2_a);
405
+ ggml_allocr_alloc(alloc, layer.w2_b);
406
+ ggml_allocr_alloc(alloc, layer.w3_a);
407
+ ggml_allocr_alloc(alloc, layer.w3_b);
408
+ }
409
+ ggml_allocr_alloc(alloc, lora->tok_embeddings_a->grad);
410
+ ggml_allocr_alloc(alloc, lora->tok_embeddings_b->grad);
411
+ ggml_allocr_alloc(alloc, lora->norm_a->grad);
412
+ ggml_allocr_alloc(alloc, lora->norm_b->grad);
413
+ ggml_allocr_alloc(alloc, lora->output_a->grad);
414
+ ggml_allocr_alloc(alloc, lora->output_b->grad);
415
+ for (uint32_t i = 0; i < lora->layers.size(); ++i) {
416
+ auto & layer = lora->layers[i];
417
+ ggml_allocr_alloc(alloc, layer.attention_norm_a->grad);
418
+ ggml_allocr_alloc(alloc, layer.attention_norm_b->grad);
419
+ ggml_allocr_alloc(alloc, layer.wq_a->grad);
420
+ ggml_allocr_alloc(alloc, layer.wq_b->grad);
421
+ ggml_allocr_alloc(alloc, layer.wk_a->grad);
422
+ ggml_allocr_alloc(alloc, layer.wk_b->grad);
423
+ ggml_allocr_alloc(alloc, layer.wv_a->grad);
424
+ ggml_allocr_alloc(alloc, layer.wv_b->grad);
425
+ ggml_allocr_alloc(alloc, layer.wo_a->grad);
426
+ ggml_allocr_alloc(alloc, layer.wo_b->grad);
427
+ ggml_allocr_alloc(alloc, layer.ffn_norm_a->grad);
428
+ ggml_allocr_alloc(alloc, layer.ffn_norm_b->grad);
429
+ ggml_allocr_alloc(alloc, layer.w1_a->grad);
430
+ ggml_allocr_alloc(alloc, layer.w1_b->grad);
431
+ ggml_allocr_alloc(alloc, layer.w2_a->grad);
432
+ ggml_allocr_alloc(alloc, layer.w2_b->grad);
433
+ ggml_allocr_alloc(alloc, layer.w3_a->grad);
434
+ ggml_allocr_alloc(alloc, layer.w3_b->grad);
435
+ }
436
+ }
437
+
438
+ static void init_lora(const struct my_llama_model * model, struct my_llama_lora * lora) {
439
+ const auto & lparams = lora->hparams;
440
+
441
+ const uint32_t n_embd = model->hparams.n_embd;
442
+ const uint32_t n_embd_gqa = model->hparams.n_embd_gqa();
443
+ const uint32_t n_layer = model->hparams.n_layer;
444
+ const uint32_t n_vocab = model->hparams.n_vocab;
445
+ const uint32_t n_ff = model->hparams.n_ff;
446
+
447
+ std::vector<char> tn_buf;
448
+ tn_buf.resize(GGML_MAX_NAME);
449
+ auto tn = [&tn_buf](const char * key, const char * suffix) -> const char * {
450
+ snprintf(tn_buf.data(), tn_buf.size(), "%s%s", key, suffix);
451
+ return tn_buf.data();
452
+ };
453
+ auto tni = [&tn_buf](const char * key, const char * suffix, int bid) -> const char * {
454
+ snprintf(tn_buf.data(), tn_buf.size(), key, bid);
455
+ std::string s = tn_buf.data();
456
+ snprintf(tn_buf.data(), tn_buf.size(), "%s%s", s.c_str(), suffix);
457
+ return tn_buf.data();
458
+ };
459
+
460
+ // context for lora tensors without their data
461
+ struct ggml_init_params ctx_lora_params;
462
+ ctx_lora_params.mem_size = ggml_tensor_overhead()*2*(6 + n_layer*18);
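+ // 6 + n_layer*18 is the number of lora tensors created below (6 global a/b
+ // tensors plus 18 per layer); the factor 2 leaves room for the gradient
+ // tensors that ggml_set_param adds for each of them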
463
+ ctx_lora_params.mem_buffer = NULL;
464
+ ctx_lora_params.no_alloc = true;
465
+
466
+ struct ggml_context * ctx = ggml_init(ctx_lora_params);
467
+ lora->ctx = ctx;
468
+
469
+ lora->tok_embeddings_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_tok_embeddings, n_embd);
470
+ lora->tok_embeddings_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_tok_embeddings, n_vocab);
471
+ lora->norm_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_norm, n_embd);
472
+ lora->norm_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_norm, 1);
473
+ lora->output_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_output, n_embd);
474
+ lora->output_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_output, n_vocab);
475
+
476
+ ggml_set_name(lora->tok_embeddings_a, tn(LLM_TENSOR_TOKEN_EMBD, ".weight.lora_a"));
477
+ ggml_set_name(lora->tok_embeddings_b, tn(LLM_TENSOR_TOKEN_EMBD, ".weight.lora_b"));
478
+ ggml_set_name(lora->norm_a, tn(LLM_TENSOR_OUTPUT_NORM, ".weight.lora_a"));
479
+ ggml_set_name(lora->norm_b, tn(LLM_TENSOR_OUTPUT_NORM, ".weight.lora_b"));
480
+ ggml_set_name(lora->output_a, tn(LLM_TENSOR_OUTPUT, ".weight.lora_a"));
481
+ ggml_set_name(lora->output_b, tn(LLM_TENSOR_OUTPUT, ".weight.lora_b"));
482
+
483
+ lora->layers.resize(n_layer);
484
+ for (uint32_t i = 0; i < n_layer; ++i) {
485
+ auto & layer = lora->layers[i];
486
+
487
+ layer.attention_norm_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_attention_norm, n_embd);
488
+ layer.attention_norm_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_attention_norm, 1);
489
+
490
+ layer.wq_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wq, n_embd);
491
+ layer.wq_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wq, n_embd);
492
+ layer.wk_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wk, n_embd);
493
+ layer.wk_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wk, n_embd_gqa);
494
+ layer.wv_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wv, n_embd);
495
+ layer.wv_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wv, n_embd_gqa);
496
+ layer.wo_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wo, n_embd);
497
+ layer.wo_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wo, n_embd);
498
+
499
+ layer.ffn_norm_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_norm, n_embd);
500
+ layer.ffn_norm_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_norm, 1);
501
+
502
+ layer.w1_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w1, n_embd);
503
+ layer.w1_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w1, n_ff);
504
+ layer.w2_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w2, n_ff);
505
+ layer.w2_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w2, n_embd);
506
+ layer.w3_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w3, n_embd);
507
+ layer.w3_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w3, n_ff);
508
+
509
+ ggml_set_name(layer.attention_norm_a, tni(LLM_TENSOR_ATTN_NORM, ".weight.lora_a", i));
510
+ ggml_set_name(layer.attention_norm_b, tni(LLM_TENSOR_ATTN_NORM, ".weight.lora_b", i));
511
+ ggml_set_name(layer.wq_a, tni(LLM_TENSOR_ATTN_Q, ".weight.lora_a", i));
512
+ ggml_set_name(layer.wq_b, tni(LLM_TENSOR_ATTN_Q, ".weight.lora_b", i));
513
+ ggml_set_name(layer.wk_a, tni(LLM_TENSOR_ATTN_K, ".weight.lora_a", i));
514
+ ggml_set_name(layer.wk_b, tni(LLM_TENSOR_ATTN_K, ".weight.lora_b", i));
515
+ ggml_set_name(layer.wv_a, tni(LLM_TENSOR_ATTN_V, ".weight.lora_a", i));
516
+ ggml_set_name(layer.wv_b, tni(LLM_TENSOR_ATTN_V, ".weight.lora_b", i));
517
+ ggml_set_name(layer.wo_a, tni(LLM_TENSOR_ATTN_OUT, ".weight.lora_a", i));
518
+ ggml_set_name(layer.wo_b, tni(LLM_TENSOR_ATTN_OUT, ".weight.lora_b", i));
519
+ ggml_set_name(layer.ffn_norm_a, tni(LLM_TENSOR_FFN_NORM, ".weight.lora_a", i));
520
+ ggml_set_name(layer.ffn_norm_b, tni(LLM_TENSOR_FFN_NORM, ".weight.lora_b", i));
521
+ ggml_set_name(layer.w1_a, tni(LLM_TENSOR_FFN_GATE, ".weight.lora_a", i));
522
+ ggml_set_name(layer.w1_b, tni(LLM_TENSOR_FFN_GATE, ".weight.lora_b", i));
523
+ ggml_set_name(layer.w2_a, tni(LLM_TENSOR_FFN_DOWN, ".weight.lora_a", i));
524
+ ggml_set_name(layer.w2_b, tni(LLM_TENSOR_FFN_DOWN, ".weight.lora_b", i));
525
+ ggml_set_name(layer.w3_a, tni(LLM_TENSOR_FFN_UP, ".weight.lora_a", i));
526
+ ggml_set_name(layer.w3_b, tni(LLM_TENSOR_FFN_UP, ".weight.lora_b", i));
527
+ }
528
+
529
+ set_param_lora(lora);
530
+
531
+ // measure data size
532
+ struct ggml_allocr * alloc = NULL;
533
+ alloc = ggml_allocr_new_measure(tensor_alignment);
534
+ alloc_lora(alloc, lora);
535
+
536
+ // allocate data
537
+ lora->data.resize(ggml_allocr_max_size(alloc) + tensor_alignment);
538
+ ggml_allocr_free(alloc);
539
+ alloc = ggml_allocr_new(lora->data.data(), lora->data.size(), tensor_alignment);
540
+ alloc_lora(alloc, lora);
541
+ ggml_allocr_free(alloc);
542
+ }
543
+
544
+ static void randomize_lora(struct my_llama_lora * lora, int seed, float mean, float std, float min, float max) {
545
+ const uint32_t n_layer = lora->layers.size();
546
+
547
+ struct random_normal_distribution * rnd = init_random_normal_distribution(seed, mean, std, min, max);
548
+
549
+ randomize_tensor_normal(lora->tok_embeddings_a, rnd);
550
+ randomize_tensor_normal(lora->tok_embeddings_b, rnd);
551
+ randomize_tensor_normal(lora->norm_a, rnd);
552
+ randomize_tensor_normal(lora->norm_b, rnd);
553
+ randomize_tensor_normal(lora->output_a, rnd);
554
+ randomize_tensor_normal(lora->output_b, rnd);
555
+
556
+ for (uint32_t i = 0; i < n_layer; ++i) {
557
+ auto & layer = lora->layers[i];
558
+ randomize_tensor_normal(layer.attention_norm_a, rnd);
559
+ randomize_tensor_normal(layer.attention_norm_b, rnd);
560
+
561
+ randomize_tensor_normal(layer.wq_a, rnd);
562
+ randomize_tensor_normal(layer.wq_b, rnd);
563
+ randomize_tensor_normal(layer.wk_a, rnd);
564
+ randomize_tensor_normal(layer.wk_b, rnd);
565
+ randomize_tensor_normal(layer.wv_a, rnd);
566
+ randomize_tensor_normal(layer.wv_b, rnd);
567
+ randomize_tensor_normal(layer.wo_a, rnd);
568
+ randomize_tensor_normal(layer.wo_b, rnd);
569
+
570
+ randomize_tensor_normal(layer.ffn_norm_a, rnd);
571
+ randomize_tensor_normal(layer.ffn_norm_b, rnd);
572
+
573
+ randomize_tensor_normal(layer.w1_a, rnd);
574
+ randomize_tensor_normal(layer.w1_b, rnd);
575
+ randomize_tensor_normal(layer.w2_a, rnd);
576
+ randomize_tensor_normal(layer.w2_b, rnd);
577
+ randomize_tensor_normal(layer.w3_a, rnd);
578
+ randomize_tensor_normal(layer.w3_b, rnd);
579
+ }
580
+
581
+ free_random_normal_distribution(rnd);
582
+ }
583
+
584
+ static struct ggml_tensor * llama_build_lora_finetune_graphs(
585
+ struct my_llama_model * model,
586
+ struct my_llama_lora * lora,
587
+ struct ggml_allocr * alloc,
588
+ struct ggml_context * ctx,
589
+ struct ggml_cgraph * gf,
590
+ struct ggml_cgraph * gb,
591
+ struct ggml_cgraph * gb_tmp,
592
+ struct ggml_tensor * * logits,
593
+ struct ggml_tensor * tokens_input,
594
+ struct ggml_tensor * targets,
595
+ const int n_tokens,
596
+ const int n_batch,
597
+ const bool enable_flash_attn,
598
+ const bool enable_checkpointing) {
599
+
600
+ ggml_set_scratch(ctx, { 0, 0, nullptr, });
601
+ const int n_past = 0;
602
+ const int N = n_tokens;
603
+ const auto & hparams = model->hparams;
604
+ const int n_ctx = hparams.n_ctx;
605
+ const int n_vocab = hparams.n_vocab;
606
+ const int n_embd = hparams.n_embd;
607
+ const int n_layer = hparams.n_layer;
608
+ const int n_head = hparams.n_head;
609
+ const int n_head_kv = hparams.n_head_kv;
610
+ const int n_ff = hparams.n_ff;
611
+ const int n_rot = hparams.n_embd_head();
612
+ const int n_embd_head = hparams.n_embd_head();
613
+ const int n_embd_gqa = hparams.n_embd_gqa();
614
+ const float rms_norm_eps = hparams.f_norm_rms_eps;
615
+ const float rope_freq_base = hparams.rope_freq_base;
616
+ const float rope_freq_scale = hparams.rope_freq_scale;
617
+
618
+ GGML_ASSERT((size_t) n_layer == lora->layers.size());
619
+
620
+ auto set_name = [](struct ggml_tensor * t, const char * n) {
621
+ ggml_set_name(t, n);
622
+ if (t->grad) {
623
+ ggml_format_name(t->grad, "%s->grad", n);
624
+ }
625
+ };
626
+
627
+ // KQ_pos - contains the positions
628
+ struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, N);
629
+ ggml_allocr_alloc(alloc, KQ_pos);
630
+ if (!ggml_allocr_is_measure(alloc)) {
631
+ int * data = (int *) KQ_pos->data;
632
+ for (int i = 0; i < N; ++i) {
633
+ data[i] = n_past + i;
634
+ }
635
+ }
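+ // KQ_pos ends up holding the absolute token positions 0..N-1 (n_past is
+ // always 0 here); it is consumed by the rope lambda below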
636
+
637
+ // rope has so many parameters that we make a custom function for it
638
+ auto rope = [ctx, KQ_pos, n_rot, n_ctx, rope_freq_base, rope_freq_scale]
639
+ (struct ggml_tensor * t) -> struct ggml_tensor * {
640
+ // not capturing these, to silence warnings
641
+ const int rope_mode = 0;
642
+
643
+ return ggml_rope_custom(ctx,
644
+ t, KQ_pos, n_rot, rope_mode, n_ctx,
645
+ rope_freq_base, rope_freq_scale);
646
+ };
647
+
648
+ set_name(tokens_input, "tokens_input");
649
+ set_name(targets, "targets");
650
+
651
+ GGML_ASSERT(tokens_input->type == GGML_TYPE_I32);
652
+
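+ // descriptive note: add_to_f32 merges the LoRA update into the frozen base tensor 'a';
+ // 'b' is the low-rank delta (the mul_mat of the lora A/B factors at the call sites below).
+ // quantized base weights are added with a cast to F32 so the merged weight is produced in
+ // float and gradients can still flow to the LoRA factors.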
653
+ auto add_to_f32 = [] (struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) {
654
+ if (ggml_is_quantized(a->type)) {
655
+ return ggml_add_cast(ctx, a, b, GGML_TYPE_F32);
656
+ } else if (a->type == GGML_TYPE_F32) {
657
+ return ggml_add(ctx, a, b);
658
+ } else {
659
+ die_fmt("%s: Finetuning on tensors with type '%s' is not yet supported.\n",
660
+ __func__, ggml_type_name(a->type));
661
+ }
662
+ };
663
+
664
+ struct ggml_tensor * tok_embeddings = add_to_f32(ctx, model->tok_embeddings, ggml_mul_mat(ctx, lora->tok_embeddings_a, lora->tok_embeddings_b));
665
+ struct ggml_tensor * norm = add_to_f32(ctx, model->norm, ggml_mul_mat(ctx, lora->norm_a, lora->norm_b));
666
+ struct ggml_tensor * output = add_to_f32(ctx, model->output, ggml_mul_mat(ctx, lora->output_a, lora->output_b));
667
+
668
+ struct ggml_tensor * t00 = ggml_reshape_1d(ctx, tokens_input, N*n_batch); set_name(t00, "t00"); assert_shape_1d(t00, N*n_batch);
669
+ struct ggml_tensor * t01 = ggml_get_rows(ctx, tok_embeddings, t00); set_name(t01, "t01"); assert_shape_2d(t01, n_embd, N*n_batch);
670
+
671
+ struct ggml_tensor * cur = t01;
672
+
673
+ std::vector<struct ggml_tensor *> checkpoints;
674
+ if (enable_checkpointing) {
675
+ checkpoints.push_back(tokens_input);
676
+ checkpoints.push_back(targets);
677
+ checkpoints.push_back(t00);
678
+ checkpoints.push_back(t01);
679
+ }
680
+
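+ // kv_scale = 1/sqrt(n_embd/n_head) = 1/sqrt(d_head), the standard attention scaling;
+ // it only needs to be computed explicitly when flash attention is disabled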
681
+ struct ggml_tensor * kv_scale = NULL;
682
+ if (!enable_flash_attn) {
683
+ kv_scale = ggml_new_f32(ctx, 1.0f/sqrtf(float(n_embd)/n_head));
684
+ }
685
+
686
+ for (int il = 0; il < n_layer; ++il) {
687
+ struct my_llama_layer & layer = model->layers[il];
688
+ struct my_llama_lora_layer & llayer = lora->layers[il];
689
+
690
+ struct ggml_tensor * attention_norm = add_to_f32(ctx, layer.attention_norm, ggml_mul_mat(ctx, llayer.attention_norm_a, llayer.attention_norm_b));
691
+ struct ggml_tensor * ffn_norm = add_to_f32(ctx, layer.ffn_norm, ggml_mul_mat(ctx, llayer.ffn_norm_a, llayer.ffn_norm_b));
692
+ struct ggml_tensor * wq = add_to_f32(ctx, layer.wq, ggml_mul_mat(ctx, llayer.wq_a, llayer.wq_b));
693
+ struct ggml_tensor * wk = add_to_f32(ctx, layer.wk, ggml_mul_mat(ctx, llayer.wk_a, llayer.wk_b));
694
+ struct ggml_tensor * wv = add_to_f32(ctx, layer.wv, ggml_mul_mat(ctx, llayer.wv_a, llayer.wv_b));
695
+ struct ggml_tensor * wo = add_to_f32(ctx, layer.wo, ggml_mul_mat(ctx, llayer.wo_a, llayer.wo_b));
696
+ struct ggml_tensor * w1 = add_to_f32(ctx, layer.w1, ggml_mul_mat(ctx, llayer.w1_a, llayer.w1_b));
697
+ struct ggml_tensor * w2 = add_to_f32(ctx, layer.w2, ggml_mul_mat(ctx, llayer.w2_a, llayer.w2_b));
698
+ struct ggml_tensor * w3 = add_to_f32(ctx, layer.w3, ggml_mul_mat(ctx, llayer.w3_a, llayer.w3_b));
699
+
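+ // standard llama forward pass for this layer, using the LoRA-merged weights above;
+ // the intermediates t02..t30 are named and shape-asserted to make the graph easier to debug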
700
+ struct ggml_tensor * t02 = ggml_rms_norm (ctx, cur, rms_norm_eps); set_name(t02, "t02"); assert_shape_2d(t02, n_embd, N*n_batch);
701
+ struct ggml_tensor * t03 = ggml_repeat (ctx, attention_norm, t02); set_name(t03, "t03"); assert_shape_2d(t03, n_embd, N*n_batch);
702
+ struct ggml_tensor * t04 = ggml_mul (ctx, t03, t02); set_name(t04, "t04"); assert_shape_2d(t04, n_embd, N*n_batch);
703
+ struct ggml_tensor * t05 = ggml_mul_mat (ctx, wq, t04); set_name(t05, "t05"); assert_shape_2d(t05, n_embd, N*n_batch);
704
+ struct ggml_tensor * t06 = ggml_reshape_4d (ctx, t05, n_embd_head, n_head, N, n_batch); set_name(t06, "t06"); assert_shape_4d(t06, n_embd_head, n_head, N, n_batch);
705
+ struct ggml_tensor * t07 = rope (t06); set_name(t07, "t07"); assert_shape_4d(t07, n_embd_head, n_head, N, n_batch);
706
+ struct ggml_tensor * t08 = ggml_mul_mat (ctx, wk, t04); set_name(t08, "t08"); assert_shape_2d(t08, n_embd_gqa, N*n_batch);
707
+ struct ggml_tensor * t09 = ggml_reshape_4d (ctx, t08, n_embd_head, n_head_kv, N, n_batch); set_name(t09, "t09"); assert_shape_4d(t09, n_embd_head, n_head_kv, N, n_batch);
708
+ struct ggml_tensor * t10 = rope (t09); set_name(t10, "t10"); assert_shape_4d(t10, n_embd_head, n_head_kv, N, n_batch);
709
+
710
+ struct ggml_tensor * t11;
711
+ if (ggml_is_quantized(wv->type)) {
712
+ struct ggml_tensor * t11_1 = ggml_mul_mat (ctx, wv, t04); set_name(t11_1, "t11_1"); assert_shape_2d(t11_1, n_embd_gqa, N*n_batch);
713
+ struct ggml_tensor * t11_2 = ggml_transpose(ctx, t11_1); set_name(t11_2, "t11_2"); assert_shape_2d(t11_2, N*n_batch, n_embd_gqa);
714
+ t11 = ggml_cont (ctx, t11_2); set_name(t11, "t11"); assert_shape_2d(t11, N*n_batch, n_embd_gqa);
715
+ } else {
716
+ t11 = ggml_mul_mat (ctx, t04, wv); set_name(t11, "t11"); assert_shape_2d(t11, N*n_batch, n_embd_gqa);
717
+ }
718
+
719
+ struct ggml_tensor * t12 = ggml_reshape_4d (ctx, t11, N, n_batch, n_embd_head, n_head_kv); set_name(t12, "t12"); assert_shape_4d(t12, N, n_batch, n_embd_head, n_head_kv);
720
+ struct ggml_tensor * t13 = ggml_permute (ctx, t07, 0, 2, 1, 3); set_name(t13, "t13"); assert_shape_4d(t13, n_embd_head, N, n_head, n_batch);
721
+ struct ggml_tensor * t14 = ggml_permute (ctx, t10, 0, 2, 1, 3); set_name(t14, "t14"); assert_shape_4d(t14, n_embd_head, N, n_head_kv, n_batch);
722
+ struct ggml_tensor * t15 = ggml_permute (ctx, t12, 0, 3, 1, 2); set_name(t15, "t15"); assert_shape_4d(t15, N, n_embd_head, n_head_kv, n_batch);
723
+ struct ggml_tensor * t16;
724
+ if (enable_flash_attn) {
725
+ t16 = ggml_flash_attn(ctx, t13, t14, t15, true); set_name(t16, "t16"); assert_shape_4d(t16, n_embd_head, N, n_head, n_batch);
726
+ } else {
727
+ struct ggml_tensor * t16_0 = ggml_mul_mat (ctx, t14, t13); set_name(t16_0, "t16_0"); assert_shape_4d(t16_0, N, N, n_head, n_batch);
728
+ struct ggml_tensor * t16_1 = ggml_scale_inplace (ctx, t16_0, kv_scale); set_name(t16_1, "t16_1"); assert_shape_4d(t16_1, N, N, n_head, n_batch);
729
+ struct ggml_tensor * t16_2 = ggml_diag_mask_inf_inplace(ctx, t16_1, n_past); set_name(t16_2, "t16_2"); assert_shape_4d(t16_2, N, N, n_head, n_batch);
730
+ struct ggml_tensor * t16_3 = ggml_soft_max_inplace (ctx, t16_2); set_name(t16_3, "t16_3"); assert_shape_4d(t16_3, N, N, n_head, n_batch);
731
+ t16 = ggml_mul_mat(ctx, t15, t16_3); set_name(t16, "t16"); assert_shape_4d(t16, n_embd_head, N, n_head, n_batch);
732
+ }
733
+ struct ggml_tensor * t17 = ggml_permute (ctx, t16, 0, 2, 1, 3); set_name(t17, "t17"); assert_shape_4d(t17, n_embd_head, n_head, N, n_batch);
734
+ struct ggml_tensor * t18 = ggml_cont (ctx, t17); set_name(t18, "t18"); assert_shape_4d(t18, n_embd_head, n_head, N, n_batch);
735
+ struct ggml_tensor * t19 = ggml_reshape_2d (ctx, t18, n_embd, N*n_batch); set_name(t19, "t19"); assert_shape_2d(t19, n_embd, N*n_batch);
736
+ struct ggml_tensor * t20 = ggml_mul_mat (ctx, wo, t19); set_name(t20, "t20"); assert_shape_2d(t20, n_embd, N*n_batch);
737
+ struct ggml_tensor * t21 = ggml_add (ctx, t20, cur); set_name(t21, "t21"); assert_shape_2d(t21, n_embd, N*n_batch);
738
+ struct ggml_tensor * t22 = ggml_rms_norm (ctx, t21, rms_norm_eps); set_name(t22, "t22"); assert_shape_2d(t22, n_embd, N*n_batch);
739
+ struct ggml_tensor * t23 = ggml_repeat (ctx, ffn_norm, t22); set_name(t23, "t23"); assert_shape_2d(t23, n_embd, N*n_batch);
740
+ struct ggml_tensor * t24 = ggml_mul (ctx, t23, t22); set_name(t24, "t24"); assert_shape_2d(t24, n_embd, N*n_batch);
741
+ struct ggml_tensor * t25 = ggml_mul_mat (ctx, w3, t24); set_name(t25, "t25"); assert_shape_2d(t25, n_ff, N*n_batch);
742
+ struct ggml_tensor * t26 = ggml_mul_mat (ctx, w1, t24); set_name(t26, "t26"); assert_shape_2d(t26, n_ff, N*n_batch);
743
+ struct ggml_tensor * t27 = ggml_silu (ctx, t26); set_name(t27, "t27"); assert_shape_2d(t27, n_ff, N*n_batch);
744
+ struct ggml_tensor * t28 = ggml_mul (ctx, t27, t25); set_name(t28, "t28"); assert_shape_2d(t28, n_ff, N*n_batch);
745
+ struct ggml_tensor * t29 = ggml_mul_mat (ctx, w2, t28); set_name(t29, "t29"); assert_shape_2d(t29, n_embd, N*n_batch);
746
+ struct ggml_tensor * t30 = ggml_add (ctx, t29, t21); set_name(t30, "t30"); assert_shape_2d(t30, n_embd, N*n_batch);
747
+ cur = t30;
748
+ if (enable_checkpointing) {
749
+ checkpoints.push_back(cur);
750
+ }
751
+ }
752
+ struct ggml_tensor * t31 = ggml_rms_norm (ctx, cur, rms_norm_eps); set_name(t31, "t31"); assert_shape_2d(t31, n_embd, N*n_batch);
753
+ struct ggml_tensor * t32 = ggml_repeat (ctx, norm, t31); set_name(t32, "t32"); assert_shape_2d(t32, n_embd, N*n_batch);
754
+ struct ggml_tensor * t33 = ggml_mul (ctx, t32, t31); set_name(t33, "t33"); assert_shape_2d(t33, n_embd, N*n_batch);
755
+ struct ggml_tensor * t34 = ggml_mul_mat (ctx, output, t33); set_name(t34, "t34"); assert_shape_2d(t34, n_vocab, N*n_batch);
756
+ struct ggml_tensor * t35 = ggml_reshape_3d (ctx, t34, n_vocab, N, n_batch); set_name(t35, "t35"); assert_shape_3d(t35, n_vocab, N, n_batch);
757
+ struct ggml_tensor * t36 = ggml_cross_entropy_loss(ctx, t35, targets); set_name(t36, "t36"); assert_shape_1d(t36, 1);
758
+
759
+ if (enable_checkpointing) {
760
+ checkpoints.push_back(t31);
761
+ checkpoints.push_back(t32);
762
+ checkpoints.push_back(t33);
763
+ checkpoints.push_back(t34);
764
+ checkpoints.push_back(t35);
765
+ checkpoints.push_back(t36);
766
+ }
767
+
768
+ ggml_build_forward_expand(gf, t36);
769
+
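+ // with checkpointing enabled, the backward graph recomputes activations between the
+ // checkpoint tensors instead of keeping them all, trading compute for memory;
+ // otherwise the backward graph is expanded directly from a copy of the forward graph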
770
+ if (enable_checkpointing) {
771
+ ggml_build_backward_gradient_checkpointing(ctx, gf, gb, gb_tmp, checkpoints.data(), (int) checkpoints.size());
772
+ } else {
773
+ *gb = *gf;
774
+ ggml_build_backward_expand(ctx, gf, gb, true);
775
+ }
776
+
777
+ GGML_ASSERT(alloc != NULL);
778
+
779
+ // make sure some tensors are not reallocated by inserting new temporary nodes that depend on them
780
+ int n_leafs_before = gb->n_leafs;
781
+ int n_nodes_before = gb->n_nodes;
782
+ struct ggml_tensor * one = ggml_new_f32(ctx, 1.0f);
783
+ // output tensors
784
+ ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t35, one));
785
+ ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36, one));
786
+ // input gradient
787
+ ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36->grad, one));
788
+ GGML_ASSERT(t36->grad->data == NULL && t36->grad->view_src == NULL);
789
+ ggml_allocr_alloc(alloc, t36->grad);
790
+ // KQ_pos
791
+ ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, KQ_pos, one));
792
+
793
+ // make sure the base model's tensor data cannot be used in viewable operations
794
+ ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->tok_embeddings, one));
795
+ ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->norm, one));
796
+ ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->output, one));
797
+ for (int il = 0; il < n_layer; ++il) {
798
+ struct my_llama_layer & layer = model->layers[il];
799
+ ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.attention_norm, one));
800
+ ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_norm, one));
801
+ ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wq, one));
802
+ ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wk, one));
803
+ ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wv, one));
804
+ ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wo, one));
805
+ ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w1, one));
806
+ ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w2, one));
807
+ ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w3, one));
808
+ }
809
+
810
+ // allocating checkpoints in one block to reduce memory fragmentation
811
+ // note: they will be freed in reverse order
812
+ for (unsigned int i = 0; i < checkpoints.size(); ++i) {
813
+ if (checkpoints[i]->data == NULL && checkpoints[i]->view_src == NULL) {
814
+ ggml_allocr_alloc(alloc, checkpoints[i]);
815
+ }
816
+ }
817
+
818
+ ggml_allocr_alloc_graph(alloc, gb);
819
+
820
+ // remove the additional nodes and leafs
821
+ for (int i = n_leafs_before; i < gb->n_leafs; ++i) {
822
+ gb->leafs[i] = NULL;
823
+ }
824
+ for (int i = n_nodes_before; i < gb->n_nodes; ++i) {
825
+ gb->nodes[i] = NULL;
826
+ }
827
+ gb->n_leafs = n_leafs_before;
828
+ gb->n_nodes = n_nodes_before;
829
+
830
+ *logits = t35;
831
+ return t36;
832
+ }
833
+
834
+ static void load_llama_lora_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct my_llama_model * model, struct my_llama_lora * lora) {
835
+ // NOTE: gguf_context must be initialized with f_ggml_ctx and no_alloc=false, otherwise tensor data cannot be read
836
+
837
+ std::string arch;
838
+
839
+ std::vector<char> keybuf;
840
+ keybuf.resize(512);
841
+
842
+ GGUF_GET_KEY(fctx, arch, gguf_get_val_str, GGUF_TYPE_STRING, true, LLM_KV_GENERAL_ARCHITECTURE);
843
+ GGML_ASSERT(arch == "llama");
844
+
845
+ uint32_t ftype_u;
846
+ GGUF_GET_KEY(fctx, ftype_u, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_GENERAL_FILE_TYPE);
847
+ GGML_ASSERT((enum llama_ftype) ftype_u == LLAMA_FTYPE_ALL_F32);
848
+
849
+ struct my_llama_hparams hparams;
850
+ load_model_hparams_gguf(fctx, &hparams, arch.c_str());
851
+
852
+ // parameters that define tensor shapes must match
853
+ GGML_ASSERT(hparams.n_embd == model->hparams.n_embd);
854
+ GGML_ASSERT(hparams.n_ff == model->hparams.n_ff);
855
+ GGML_ASSERT(hparams.n_head == model->hparams.n_head);
856
+ GGML_ASSERT(hparams.n_head_kv == model->hparams.n_head_kv);
857
+ GGML_ASSERT(hparams.n_layer == model->hparams.n_layer);
858
+
859
+ GGUF_GET_KEY(fctx, lora->hparams.n_rank_tok_embeddings, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_TOKEN_EMBD);
860
+ GGUF_GET_KEY(fctx, lora->hparams.n_rank_norm, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_OUTPUT_NORM);
861
+ GGUF_GET_KEY(fctx, lora->hparams.n_rank_output, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_OUTPUT);
862
+ GGUF_GET_KEY(fctx, lora->hparams.n_rank_attention_norm, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_ATTN_NORM);
863
+ GGUF_GET_KEY(fctx, lora->hparams.n_rank_wq, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_ATTN_Q);
864
+ GGUF_GET_KEY(fctx, lora->hparams.n_rank_wk, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_ATTN_K);
865
+ GGUF_GET_KEY(fctx, lora->hparams.n_rank_wv, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_ATTN_V);
866
+ GGUF_GET_KEY(fctx, lora->hparams.n_rank_wo, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_ATTN_OUT);
867
+ GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_norm, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_NORM);
868
+ GGUF_GET_KEY(fctx, lora->hparams.n_rank_w1, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_GATE);
869
+ GGUF_GET_KEY(fctx, lora->hparams.n_rank_w2, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_DOWN);
870
+ GGUF_GET_KEY(fctx, lora->hparams.n_rank_w3, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_UP);
871
+
872
+ init_lora(model, lora);
873
+
874
+ copy_tensor_by_name(lora->tok_embeddings_a, f_ggml_ctx, ggml_get_name(lora->tok_embeddings_a));
875
+ copy_tensor_by_name(lora->tok_embeddings_b, f_ggml_ctx, ggml_get_name(lora->tok_embeddings_b));
876
+ copy_tensor_by_name(lora->norm_a, f_ggml_ctx, ggml_get_name(lora->norm_a));
877
+ copy_tensor_by_name(lora->norm_b, f_ggml_ctx, ggml_get_name(lora->norm_b));
878
+ copy_tensor_by_name(lora->output_a, f_ggml_ctx, ggml_get_name(lora->output_a));
879
+ copy_tensor_by_name(lora->output_b, f_ggml_ctx, ggml_get_name(lora->output_b));
880
+
881
+ for (uint32_t i = 0; i < lora->layers.size(); ++i) {
882
+ auto & layer = lora->layers[i];
883
+ copy_tensor_by_name(layer.attention_norm_a, f_ggml_ctx, ggml_get_name(layer.attention_norm_a));
884
+ copy_tensor_by_name(layer.attention_norm_b, f_ggml_ctx, ggml_get_name(layer.attention_norm_b));
885
+ copy_tensor_by_name(layer.wq_a, f_ggml_ctx, ggml_get_name(layer.wq_a));
886
+ copy_tensor_by_name(layer.wq_b, f_ggml_ctx, ggml_get_name(layer.wq_b));
887
+ copy_tensor_by_name(layer.wk_a, f_ggml_ctx, ggml_get_name(layer.wk_a));
888
+ copy_tensor_by_name(layer.wk_b, f_ggml_ctx, ggml_get_name(layer.wk_b));
889
+ copy_tensor_by_name(layer.wv_a, f_ggml_ctx, ggml_get_name(layer.wv_a));
890
+ copy_tensor_by_name(layer.wv_b, f_ggml_ctx, ggml_get_name(layer.wv_b));
891
+ copy_tensor_by_name(layer.wo_a, f_ggml_ctx, ggml_get_name(layer.wo_a));
892
+ copy_tensor_by_name(layer.wo_b, f_ggml_ctx, ggml_get_name(layer.wo_b));
893
+ copy_tensor_by_name(layer.ffn_norm_a, f_ggml_ctx, ggml_get_name(layer.ffn_norm_a));
894
+ copy_tensor_by_name(layer.ffn_norm_b, f_ggml_ctx, ggml_get_name(layer.ffn_norm_b));
895
+ copy_tensor_by_name(layer.w1_a, f_ggml_ctx, ggml_get_name(layer.w1_a));
896
+ copy_tensor_by_name(layer.w1_b, f_ggml_ctx, ggml_get_name(layer.w1_b));
897
+ copy_tensor_by_name(layer.w2_a, f_ggml_ctx, ggml_get_name(layer.w2_a));
898
+ copy_tensor_by_name(layer.w2_b, f_ggml_ctx, ggml_get_name(layer.w2_b));
899
+ copy_tensor_by_name(layer.w3_a, f_ggml_ctx, ggml_get_name(layer.w3_a));
900
+ copy_tensor_by_name(layer.w3_b, f_ggml_ctx, ggml_get_name(layer.w3_b));
901
+ }
902
+ }
903
+
904
+ static void save_llama_lora_gguf(struct gguf_context * fctx, struct my_llama_model * model, struct my_llama_lora * lora) {
905
+ const char * arch = "llama";
906
+ enum llama_ftype ftype = LLAMA_FTYPE_ALL_F32;
907
+
908
+ std::vector<char> keybuf;
909
+ keybuf.resize(512);
910
+ auto kv = [arch, &keybuf](const char * key) -> const char * {
911
+ snprintf(keybuf.data(), keybuf.size(), key, arch);
912
+ return keybuf.data();
913
+ };
914
+
915
+ gguf_set_val_str(fctx, LLM_KV_GENERAL_ARCHITECTURE, arch);
916
+ gguf_set_val_u32(fctx, LLM_KV_GENERAL_FILE_TYPE, ftype);
917
+
918
+ gguf_set_val_u32(fctx, kv(LLM_KV_CONTEXT_LENGTH), model->hparams.n_ctx);
919
+ gguf_set_val_u32(fctx, kv(LLM_KV_EMBEDDING_LENGTH), model->hparams.n_embd);
920
+ gguf_set_val_u32(fctx, kv(LLM_KV_FEED_FORWARD_LENGTH), model->hparams.n_ff);
921
+ gguf_set_val_u32(fctx, kv(LLM_KV_ATTENTION_HEAD_COUNT), model->hparams.n_head);
922
+ gguf_set_val_u32(fctx, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV), model->hparams.n_head_kv);
923
+ gguf_set_val_u32(fctx, kv(LLM_KV_BLOCK_COUNT), model->hparams.n_layer);
924
+ gguf_set_val_u32(fctx, kv(LLM_KV_ROPE_DIMENSION_COUNT), model->hparams.n_embd_head());
925
+ gguf_set_val_f32(fctx, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS), model->hparams.f_norm_rms_eps);
926
+ gguf_set_val_f32(fctx, kv(LLM_KV_ROPE_FREQ_BASE), model->hparams.rope_freq_base);
927
+ gguf_set_val_f32(fctx, kv(LLM_KV_ROPE_SCALE_LINEAR), model->hparams.rope_freq_scale);
928
+
929
+ gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_TOKEN_EMBD, lora->hparams.n_rank_tok_embeddings);
930
+ gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_OUTPUT_NORM, lora->hparams.n_rank_norm);
931
+ gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_OUTPUT, lora->hparams.n_rank_output);
932
+ gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_ATTN_NORM, lora->hparams.n_rank_attention_norm);
933
+ gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_ATTN_Q, lora->hparams.n_rank_wq);
934
+ gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_ATTN_K, lora->hparams.n_rank_wk);
935
+ gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_ATTN_V, lora->hparams.n_rank_wv);
936
+ gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_ATTN_OUT, lora->hparams.n_rank_wo);
937
+ gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_NORM, lora->hparams.n_rank_ffn_norm);
938
+ gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_GATE, lora->hparams.n_rank_w1);
939
+ gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_DOWN, lora->hparams.n_rank_w2);
940
+ gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_UP, lora->hparams.n_rank_w3);
941
+
942
+ gguf_add_tensor(fctx, lora->tok_embeddings_a);
943
+ gguf_add_tensor(fctx, lora->tok_embeddings_b);
944
+ gguf_add_tensor(fctx, lora->norm_a);
945
+ gguf_add_tensor(fctx, lora->norm_b);
946
+ gguf_add_tensor(fctx, lora->output_a);
947
+ gguf_add_tensor(fctx, lora->output_b);
948
+
949
+ for (uint32_t i = 0; i < lora->layers.size(); ++i) {
950
+ auto & layer = lora->layers[i];
951
+
952
+ gguf_add_tensor(fctx, layer.attention_norm_a);
953
+ gguf_add_tensor(fctx, layer.attention_norm_b);
954
+ gguf_add_tensor(fctx, layer.wq_a);
955
+ gguf_add_tensor(fctx, layer.wq_b);
956
+ gguf_add_tensor(fctx, layer.wk_a);
957
+ gguf_add_tensor(fctx, layer.wk_b);
958
+ gguf_add_tensor(fctx, layer.wv_a);
959
+ gguf_add_tensor(fctx, layer.wv_b);
960
+ gguf_add_tensor(fctx, layer.wo_a);
961
+ gguf_add_tensor(fctx, layer.wo_b);
962
+ gguf_add_tensor(fctx, layer.ffn_norm_a);
963
+ gguf_add_tensor(fctx, layer.ffn_norm_b);
964
+ gguf_add_tensor(fctx, layer.w1_a);
965
+ gguf_add_tensor(fctx, layer.w1_b);
966
+ gguf_add_tensor(fctx, layer.w2_a);
967
+ gguf_add_tensor(fctx, layer.w2_b);
968
+ gguf_add_tensor(fctx, layer.w3_a);
969
+ gguf_add_tensor(fctx, layer.w3_b);
970
+ }
971
+ }
972
+
973
+ static void load_checkpoint_lora_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct my_llama_model * model, struct my_llama_lora * lora, struct train_state * train) {
974
+ std::string train_type = LLM_KV_TRAINING_TYPE_FINETUNE_LORA;
975
+ GGUF_GET_KEY(fctx, train_type, gguf_get_val_str, GGUF_TYPE_STRING, false, LLM_KV_TRAINING_TYPE);
976
+ GGML_ASSERT(train_type == LLM_KV_TRAINING_TYPE_FINETUNE_LORA);
977
+
978
+ load_train_state_gguf(fctx, f_ggml_ctx, train);
979
+ load_llama_lora_gguf(fctx, f_ggml_ctx, model, lora);
980
+ }
981
+
982
+ static void save_checkpoint_lora_gguf(struct gguf_context * fctx, struct my_llama_model * model, struct my_llama_lora * lora, struct train_state * train) {
983
+ gguf_set_val_str(fctx, LLM_KV_TRAINING_TYPE, LLM_KV_TRAINING_TYPE_FINETUNE_LORA);
984
+ save_llama_lora_gguf(fctx, model, lora);
985
+ save_train_state_gguf(fctx, train);
986
+ }
987
+
988
+ static bool load_checkpoint_lora_file(const char * filename, struct my_llama_model * model, struct my_llama_lora * lora, struct train_state * train) {
989
+ struct ggml_context * f_ggml_ctx;
990
+ struct gguf_init_params params;
991
+ params.no_alloc = false;
992
+ params.ctx = &f_ggml_ctx;
993
+ struct gguf_context * fctx = gguf_init_from_file(filename, params);
994
+ if (fctx == NULL) {
995
+ return false;
996
+ }
997
+
998
+ load_checkpoint_lora_gguf(fctx, f_ggml_ctx, model, lora, train);
999
+
1000
+ gguf_free(fctx);
1001
+ return true;
1002
+ }
1003
+
1004
+ static void save_checkpoint_lora_file(const char * filename, struct my_llama_model * model, struct my_llama_lora * lora, struct train_state * train) {
1005
+ printf("%s: saving to %s\n", __func__, filename);
1006
+ struct gguf_context * fctx = gguf_init_empty();
1007
+
1008
+ save_checkpoint_lora_gguf(fctx, model, lora, train);
1009
+
1010
+ // write file
1011
+ const bool only_meta = false;
1012
+ gguf_write_to_file(fctx, filename, only_meta);
1013
+ gguf_free(fctx);
1014
+ }
1015
+
1016
+ struct llama_file {
1017
+ // use FILE * so we don't have to re-open the file to mmap
1018
+ FILE * fp;
1019
+ size_t size;
1020
+
1021
+ llama_file(const char * fname, const char * mode) {
1022
+ fp = std::fopen(fname, mode);
1023
+ if (fp == NULL) {
1024
+ size = 0;
1025
+ } else {
1026
+ seek(0, SEEK_END);
1027
+ size = tell();
1028
+ seek(0, SEEK_SET);
1029
+ }
1030
+ }
1031
+
1032
+ size_t tell() const {
1033
+ #ifdef _WIN32
1034
+ __int64 ret = _ftelli64(fp);
1035
+ #else
1036
+ long ret = std::ftell(fp);
1037
+ #endif
1038
+ GGML_ASSERT(ret != -1); // this really shouldn't fail
1039
+ return (size_t) ret;
1040
+ }
1041
+
1042
+ void seek(size_t offset, int whence) {
1043
+ #ifdef _WIN32
1044
+ int ret = _fseeki64(fp, (__int64) offset, whence);
1045
+ #else
1046
+ int ret = std::fseek(fp, (long) offset, whence);
1047
+ #endif
1048
+ GGML_ASSERT(ret == 0); // same
1049
+ }
1050
+
1051
+ void read_raw(void * ptr, size_t size) {
1052
+ if (size == 0) {
1053
+ return;
1054
+ }
1055
+ errno = 0;
1056
+ std::size_t ret = std::fread(ptr, size, 1, fp);
1057
+ if (ferror(fp)) {
1058
+ die_fmt("read error: %s", strerror(errno));
1059
+ }
1060
+ if (ret != 1) {
1061
+ die("unexpectedly reached end of file");
1062
+ }
1063
+ }
1064
+
1065
+ std::uint32_t read_u32() {
1066
+ std::uint32_t ret;
1067
+ read_raw(&ret, sizeof(ret));
1068
+ return ret;
1069
+ }
1070
+
1071
+ std::string read_string(std::uint32_t len) {
1072
+ std::vector<char> chars(len);
1073
+ read_raw(chars.data(), len);
1074
+ return std::string(chars.data(), len);
1075
+ }
1076
+
1077
+ void write_raw(const void * ptr, size_t size) {
1078
+ if (size == 0) {
1079
+ return;
1080
+ }
1081
+ errno = 0;
1082
+ size_t ret = std::fwrite(ptr, size, 1, fp);
1083
+ if (ret != 1) {
1084
+ die_fmt("write error: %s", strerror(errno));
1085
+ }
1086
+ }
1087
+
1088
+ void write_u32(std::uint32_t val) {
1089
+ write_raw(&val, sizeof(val));
1090
+ }
1091
+
1092
+ ~llama_file() {
1093
+ if (fp) {
1094
+ std::fclose(fp);
1095
+ }
1096
+ }
1097
+ };
1098
+
1099
+ static void write_tensor(struct llama_file * file, struct ggml_tensor * tensor, const char * name) {
1100
+ if (tensor == NULL) {
1101
+ file->write_u32(0);
1102
+ file->write_u32(0);
1103
+ file->write_u32(GGML_TYPE_F32);
1104
+ file->seek((0-file->tell()) & 31, SEEK_CUR);
1105
+ return;
1106
+ }
1107
+ if (name == NULL) {
1108
+ name = ggml_get_name(tensor);
1109
+ }
1110
+ uint32_t name_len = strlen(name);
1111
+ uint32_t nd = tensor->n_dims;
1112
+ uint32_t ne[4] = { (uint32_t)tensor->ne[0],
1113
+ (uint32_t)tensor->ne[1],
1114
+ (uint32_t)tensor->ne[2],
1115
+ (uint32_t)tensor->ne[3] };
1116
+ file->write_u32(nd);
1117
+ file->write_u32(name_len);
1118
+ file->write_u32(tensor->type);
1119
+ file->write_raw(ne, sizeof(ne[0]) * nd);
1120
+ file->write_raw(name, name_len);
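+ // pad to the next 32-byte boundary before writing the raw tensor data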
1121
+ file->seek((0-file->tell()) & 31, SEEK_CUR);
1122
+ file->write_raw(tensor->data, ggml_nbytes(tensor));
1123
+ }
1124
+
1125
+ static void save_as_llama_lora(const char * filename, struct my_llama_lora * lora) {
1126
+ printf("%s: saving to %s\n", __func__, filename);
1127
+ struct llama_file file(filename, "wb");
1128
+ if (file.fp == NULL) {
1129
+ return;
1130
+ }
1131
+
1132
+ std::vector<char> tn_buf;
1133
+ tn_buf.resize(GGML_MAX_NAME);
1134
+
1135
+ auto tn = [&tn_buf](const char * key, const char * suffix) -> const char * {
1136
+ snprintf(tn_buf.data(), tn_buf.size(), "%s%s", key, suffix);
1137
+ return tn_buf.data();
1138
+ };
1139
+
1140
+ auto tni = [&tn_buf](const char * key, int bid, const char * suffix) -> const char * {
1141
+ snprintf(tn_buf.data(), tn_buf.size(), key, bid);
1142
+ std::string s = tn_buf.data();
1143
+ snprintf(tn_buf.data(), tn_buf.size(), "%s%s", s.c_str(), suffix);
1144
+ return tn_buf.data();
1145
+ };
1146
+
1147
+ uint32_t LLAMA_FILE_MAGIC_LORA = 0x67676C61; // 'ggla'
1148
+ // write_magic
1149
+ file.write_u32(LLAMA_FILE_MAGIC_LORA); // magic
1150
+ file.write_u32(1); // version
1151
+ // write_hparams
1152
+ file.write_u32(lora->hparams.lora_r);
1153
+ file.write_u32(lora->hparams.lora_alpha);
1154
+ // write tensors
1155
+ write_tensor(&file, lora->tok_embeddings_a, tn(LLM_TENSOR_TOKEN_EMBD, ".weight.loraA"));
1156
+ write_tensor(&file, lora->tok_embeddings_b, tn(LLM_TENSOR_TOKEN_EMBD, ".weight.loraB"));
1157
+ write_tensor(&file, lora->norm_a, tn(LLM_TENSOR_OUTPUT_NORM, ".weight.loraA"));
1158
+ write_tensor(&file, lora->norm_b, tn(LLM_TENSOR_OUTPUT_NORM, ".weight.loraB"));
1159
+ write_tensor(&file, lora->output_a, tn(LLM_TENSOR_OUTPUT, ".weight.loraA"));
1160
+ write_tensor(&file, lora->output_b, tn(LLM_TENSOR_OUTPUT, ".weight.loraB"));
1161
+ for (uint32_t i = 0; i < lora->layers.size(); ++i) {
1162
+ auto & layer = lora->layers[i];
1163
+ write_tensor(&file, layer.attention_norm_a, tni(LLM_TENSOR_ATTN_NORM, i, ".weight.loraA"));
1164
+ write_tensor(&file, layer.attention_norm_b, tni(LLM_TENSOR_ATTN_NORM, i, ".weight.loraB"));
1165
+ write_tensor(&file, layer.wq_a, tni(LLM_TENSOR_ATTN_Q, i, ".weight.loraA"));
1166
+ write_tensor(&file, layer.wq_b, tni(LLM_TENSOR_ATTN_Q, i, ".weight.loraB"));
1167
+ write_tensor(&file, layer.wk_a, tni(LLM_TENSOR_ATTN_K, i, ".weight.loraA"));
1168
+ write_tensor(&file, layer.wk_b, tni(LLM_TENSOR_ATTN_K, i, ".weight.loraB"));
1169
+ write_tensor(&file, layer.wv_a, tni(LLM_TENSOR_ATTN_V, i, ".weight.loraA"));
1170
+ write_tensor(&file, layer.wv_b, tni(LLM_TENSOR_ATTN_V, i, ".weight.loraB"));
1171
+ write_tensor(&file, layer.wo_a, tni(LLM_TENSOR_ATTN_OUT, i, ".weight.loraA"));
1172
+ write_tensor(&file, layer.wo_b, tni(LLM_TENSOR_ATTN_OUT, i, ".weight.loraB"));
1173
+ write_tensor(&file, layer.ffn_norm_a, tni(LLM_TENSOR_FFN_NORM, i, ".weight.loraA"));
1174
+ write_tensor(&file, layer.ffn_norm_b, tni(LLM_TENSOR_FFN_NORM, i, ".weight.loraB"));
1175
+ write_tensor(&file, layer.w1_a, tni(LLM_TENSOR_FFN_GATE, i, ".weight.loraA"));
1176
+ write_tensor(&file, layer.w1_b, tni(LLM_TENSOR_FFN_GATE, i, ".weight.loraB"));
1177
+ write_tensor(&file, layer.w2_a, tni(LLM_TENSOR_FFN_DOWN, i, ".weight.loraA"));
1178
+ write_tensor(&file, layer.w2_b, tni(LLM_TENSOR_FFN_DOWN, i, ".weight.loraB"));
1179
+ write_tensor(&file, layer.w3_a, tni(LLM_TENSOR_FFN_UP, i, ".weight.loraA"));
1180
+ write_tensor(&file, layer.w3_b, tni(LLM_TENSOR_FFN_UP, i, ".weight.loraB"));
1181
+ }
1182
+ }
1183
+
1184
+ struct train_params {
1185
+ struct train_params_common common;
1186
+
1187
+ const char * fn_model_base;
1188
+ const char * fn_lora_out;
1189
+
1190
+ bool only_write_lora;
1191
+
1192
+ float f_norm_rms_eps;
1193
+ float rope_freq_base;
1194
+ float rope_freq_scale;
1195
+
1196
+ bool custom_f_norm_rms_eps;
1197
+ bool custom_rope_freq_base;
1198
+ bool custom_rope_freq_scale;
1199
+
1200
+ int32_t lora_r;
1201
+ int32_t lora_alpha;
1202
+ bool custom_lora_alpha;
1203
+
1204
+ uint32_t n_rank_attention_norm;
1205
+ uint32_t n_rank_wq;
1206
+ uint32_t n_rank_wk;
1207
+ uint32_t n_rank_wv;
1208
+ uint32_t n_rank_wo;
1209
+ uint32_t n_rank_ffn_norm;
1210
+ uint32_t n_rank_w1;
1211
+ uint32_t n_rank_w2;
1212
+ uint32_t n_rank_w3;
1213
+ uint32_t n_rank_tok_embeddings;
1214
+ uint32_t n_rank_norm;
1215
+ uint32_t n_rank_output;
1216
+
1217
+ bool custom_n_rank_attention_norm;
1218
+ bool custom_n_rank_wq;
1219
+ bool custom_n_rank_wk;
1220
+ bool custom_n_rank_wv;
1221
+ bool custom_n_rank_wo;
1222
+ bool custom_n_rank_ffn_norm;
1223
+ bool custom_n_rank_w1;
1224
+ bool custom_n_rank_w2;
1225
+ bool custom_n_rank_w3;
1226
+ bool custom_n_rank_tok_embeddings;
1227
+ bool custom_n_rank_norm;
1228
+ bool custom_n_rank_output;
1229
+ };
1230
+
1231
+ static struct train_params get_default_train_params() {
1232
+ struct train_params params;
1233
+ params.common = get_default_train_params_common();
1234
+ params.fn_model_base = "";
1235
+ params.fn_lora_out = "ggml-lora-ITERATION-f32.gguf";
1236
+
1237
+ params.only_write_lora = false;
1238
+
1239
+ params.f_norm_rms_eps = 1e-5f;
1240
+ params.rope_freq_base = 10000.0f;
1241
+ params.rope_freq_scale = 1.0f;
1242
+
1243
+ params.custom_f_norm_rms_eps = false;
1244
+ params.custom_rope_freq_base = false;
1245
+ params.custom_rope_freq_scale = false;
1246
+
1247
+ params.lora_r = 4;
1248
+ params.lora_alpha = 4;
1249
+ params.custom_lora_alpha = false;
1250
+
1251
+ params.n_rank_attention_norm = 1;
1252
+ params.n_rank_wq = 4;
1253
+ params.n_rank_wk = 4;
1254
+ params.n_rank_wv = 4;
1255
+ params.n_rank_wo = 4;
1256
+ params.n_rank_ffn_norm = 1;
1257
+ params.n_rank_w1 = 4;
1258
+ params.n_rank_w2 = 4;
1259
+ params.n_rank_w3 = 4;
1260
+ params.n_rank_tok_embeddings = 4;
1261
+ params.n_rank_norm = 1;
1262
+ params.n_rank_output = 4;
1263
+
1264
+ params.custom_n_rank_attention_norm = false;
1265
+ params.custom_n_rank_wq = false;
1266
+ params.custom_n_rank_wk = false;
1267
+ params.custom_n_rank_wv = false;
1268
+ params.custom_n_rank_wo = false;
1269
+ params.custom_n_rank_ffn_norm = false;
1270
+ params.custom_n_rank_w1 = false;
1271
+ params.custom_n_rank_w2 = false;
1272
+ params.custom_n_rank_w3 = false;
1273
+ params.custom_n_rank_tok_embeddings = false;
1274
+ params.custom_n_rank_norm = false;
1275
+ params.custom_n_rank_output = false;
1276
+
1277
+ return params;
1278
+ }
1279
+
1280
+ static void train_print_usage(int argc, char ** argv, const struct train_params * params) {
1281
+ fprintf(stderr, "usage: %s [options]\n", argv[0]);
1282
+ fprintf(stderr, "\n");
1283
+ fprintf(stderr, "options:\n");
1284
+ fprintf(stderr, " -h, --help show this help message and exit\n");
1285
+
1286
+ fprintf(stderr, " --model-base FNAME model path from which to load base model (default '%s')\n", params->fn_model_base);
1287
+ fprintf(stderr, " --lora-out FNAME path to save llama lora (default '%s')\n", params->fn_lora_out);
1288
+ fprintf(stderr, " --only-write-lora only save llama lora, don't do any training. use this if you only want to convert a checkpoint to a lora adapter.\n");
1289
+ fprintf(stderr, " --norm-rms-eps F RMS-Norm epsilon value (default %f)\n", params->f_norm_rms_eps);
1290
+ fprintf(stderr, " --rope-freq-base F Frequency base for ROPE (default %f)\n", params->rope_freq_base);
1291
+ fprintf(stderr, " --rope-freq-scale F Frequency scale for ROPE (default %f)\n", params->rope_freq_scale);
1292
+ fprintf(stderr, " --lora-alpha N LORA alpha : resulting LORA scaling is alpha/r. (default %d)\n", params->lora_alpha);
1293
+ fprintf(stderr, " --lora-r N LORA r: default rank. Also specifies resulting scaling together with lora-alpha. (default %d)\n", params->lora_r);
1294
+ fprintf(stderr, " --rank-att-norm N LORA rank for attention norm tensor, overrides default rank. Norm tensors should generally have rank 1.\n");
1295
+ fprintf(stderr, " --rank-ffn-norm N LORA rank for feed-forward norm tensor, overrides default rank. Norm tensors should generally have rank 1.\n");
1296
+ fprintf(stderr, " --rank-out-norm N LORA rank for output norm tensor, overrides default rank. Norm tensors should generally have rank 1.\n");
1297
+ fprintf(stderr, " --rank-tok-embd N LORA rank for token embeddings tensor, overrides default rank.\n");
1298
+ fprintf(stderr, " --rank-out N LORA rank for output tensor, overrides default rank.\n");
1299
+ fprintf(stderr, " --rank-wq N LORA rank for wq tensor, overrides default rank.\n");
1300
+ fprintf(stderr, " --rank-wk N LORA rank for wk tensor, overrides default rank.\n");
1301
+ fprintf(stderr, " --rank-wv N LORA rank for wv tensor, overrides default rank.\n");
1302
+ fprintf(stderr, " --rank-wo N LORA rank for wo tensor, overrides default rank.\n");
1303
+ fprintf(stderr, " --rank-w1 N LORA rank for w1 tensor, overrides default rank.\n");
1304
+ fprintf(stderr, " --rank-w2 N LORA rank for w2 tensor, overrides default rank.\n");
1305
+ fprintf(stderr, " --rank-w3 N LORA rank for w3 tensor, overrides default rank.\n");
1306
+
1307
+ print_common_train_usage(argc, argv, &params->common);
1308
+ }
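+ // example invocation (file names are placeholders; the remaining options come from the
+ // common training arguments in common/train.cpp, e.g. --train-data):
+ //   ./finetune --model-base base-model-q8_0.gguf --train-data data.txt \
+ //              --lora-out lora.gguf --lora-r 8 --lora-alpha 8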
1309
+
1310
+ static bool train_params_parse(int argc, char ** argv, struct train_params * params) {
1311
+ bool invalid_param = false;
1312
+ std::string arg;
1313
+ struct train_params default_params = get_default_train_params();
1314
+ const std::string arg_prefix = "--";
1315
+
1316
+ for (int i = 1; i < argc; i++) {
1317
+ arg = argv[i];
1318
+ if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
1319
+ std::replace(arg.begin(), arg.end(), '_', '-');
1320
+ }
1321
+
1322
+ if (consume_common_train_arg(argc, argv, &i, &params->common, &invalid_param)) {
1323
+ if (invalid_param) {
1324
+ break;
1325
+ } else if (params->common.print_usage) {
1326
+ train_print_usage(argc, argv, &default_params);
1327
+ exit(0);
1328
+ }
1329
+ } else if (arg == "--model-base") {
1330
+ if (++i >= argc) {
1331
+ invalid_param = true;
1332
+ break;
1333
+ }
1334
+ params->fn_model_base = argv[i];
1335
+ } else if (arg == "--lora-out") {
1336
+ if (++i >= argc) {
1337
+ invalid_param = true;
1338
+ break;
1339
+ }
1340
+ params->fn_lora_out = argv[i];
1341
+ } else if (arg == "--only-write-lora") {
1342
+ params->only_write_lora = true;
1343
+ } else if (arg == "--norm-rms-eps") {
1344
+ if (++i >= argc) {
1345
+ invalid_param = true;
1346
+ break;
1347
+ }
1348
+ params->f_norm_rms_eps = std::stof(argv[i]);
1349
+ params->custom_f_norm_rms_eps = true;
1350
+ } else if (arg == "--rope-freq-base") {
1351
+ if (++i >= argc) {
1352
+ invalid_param = true;
1353
+ break;
1354
+ }
1355
+ params->rope_freq_base = std::stof(argv[i]);
1356
+ params->custom_rope_freq_base = true;
1357
+ } else if (arg == "--rope-freq-scale") {
1358
+ if (++i >= argc) {
1359
+ invalid_param = true;
1360
+ break;
1361
+ }
1362
+ params->rope_freq_scale = std::stof(argv[i]);
1363
+ params->custom_rope_freq_scale = true;
1364
+ } else if (arg == "--lora-alpha") {
1365
+ if (++i >= argc) {
1366
+ invalid_param = true;
1367
+ break;
1368
+ }
1369
+ params->lora_alpha = std::stoi(argv[i]);
1370
+ params->custom_lora_alpha = true;
1371
+ } else if (arg == "--lora-r") {
1372
+ if (++i >= argc) {
1373
+ invalid_param = true;
1374
+ break;
1375
+ }
1376
+ params->lora_r = std::stoi(argv[i]);
1377
+ } else if (arg == "--rank-att-norm") {
1378
+ if (++i >= argc) {
1379
+ invalid_param = true;
1380
+ break;
1381
+ }
1382
+ params->n_rank_attention_norm = std::stoi(argv[i]);
1383
+ params->custom_n_rank_attention_norm = true;
1384
+ } else if (arg == "--rank-ffn-norm") {
1385
+ if (++i >= argc) {
1386
+ invalid_param = true;
1387
+ break;
1388
+ }
1389
+ params->n_rank_ffn_norm = std::stoi(argv[i]);
1390
+ params->custom_n_rank_ffn_norm = true;
1391
+ } else if (arg == "--rank-out-norm") {
1392
+ if (++i >= argc) {
1393
+ invalid_param = true;
1394
+ break;
1395
+ }
1396
+ params->n_rank_norm = std::stoi(argv[i]);
1397
+ params->custom_n_rank_norm = true;
1398
+ } else if (arg == "--rank-tok-embd") {
1399
+ if (++i >= argc) {
1400
+ invalid_param = true;
1401
+ break;
1402
+ }
1403
+ params->n_rank_tok_embeddings = std::stoi(argv[i]);
1404
+ params->custom_n_rank_tok_embeddings = true;
1405
+ } else if (arg == "--rank-out") {
1406
+ if (++i >= argc) {
1407
+ invalid_param = true;
1408
+ break;
1409
+ }
1410
+ params->n_rank_output = std::stoi(argv[i]);
1411
+ params->custom_n_rank_output = true;
1412
+ } else if (arg == "--rank-wq") {
1413
+ if (++i >= argc) {
1414
+ invalid_param = true;
1415
+ break;
1416
+ }
1417
+ params->n_rank_wq = std::stoi(argv[i]);
1418
+ params->custom_n_rank_wq = true;
1419
+ } else if (arg == "--rank-wk") {
1420
+ if (++i >= argc) {
1421
+ invalid_param = true;
1422
+ break;
1423
+ }
1424
+ params->n_rank_wk = std::stoi(argv[i]);
1425
+ params->custom_n_rank_wk = true;
1426
+ } else if (arg == "--rank-wv") {
1427
+ if (++i >= argc) {
1428
+ invalid_param = true;
1429
+ break;
1430
+ }
1431
+ params->n_rank_wv = std::stoi(argv[i]);
1432
+ params->custom_n_rank_wv = true;
1433
+ } else if (arg == "--rank-wo") {
1434
+ if (++i >= argc) {
1435
+ invalid_param = true;
1436
+ break;
1437
+ }
1438
+ params->n_rank_wo = std::stoi(argv[i]);
1439
+ params->custom_n_rank_wo = true;
1440
+ } else if (arg == "--rank-w1") {
1441
+ if (++i >= argc) {
1442
+ invalid_param = true;
1443
+ break;
1444
+ }
1445
+ params->n_rank_w1 = std::stoi(argv[i]);
1446
+ params->custom_n_rank_w1 = true;
1447
+ } else if (arg == "--rank-w2") {
1448
+ if (++i >= argc) {
1449
+ invalid_param = true;
1450
+ break;
1451
+ }
1452
+ params->n_rank_w2 = std::stoi(argv[i]);
1453
+ params->custom_n_rank_w2 = true;
1454
+ } else if (arg == "--rank-w3") {
1455
+ if (++i >= argc) {
1456
+ invalid_param = true;
1457
+ break;
1458
+ }
1459
+ params->n_rank_w3 = std::stoi(argv[i]);
1460
+ params->custom_n_rank_w3 = true;
1461
+ } else {
1462
+ fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
1463
+ train_print_usage(argc, argv, &default_params);
1464
+ exit(1);
1465
+ }
1466
+ }
1467
+ if (invalid_param) {
1468
+ fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
1469
+ train_print_usage(argc, argv, &default_params);
1470
+ exit(1);
1471
+ }
1472
+ finish_processing_train_args(&params->common);
1473
+ return true;
1474
+ }
1475
+
1476
+ struct save_train_files_data {
1477
+ const char * fn_checkpoint_out;
1478
+ const char * fn_lora_out;
1479
+ const char * pattern_fn_it;
1480
+ const char * fn_latest;
1481
+ struct my_llama_model * model;
1482
+ struct my_llama_lora * lora;
1483
+ };
1484
+
1485
+ static void save_train_files(void * vdata, struct train_state * train) {
1486
+ struct save_train_files_data * data = (struct save_train_files_data *) vdata;
1487
+
1488
+ int64_t iter = train->opt->iter;
1489
+
1490
+ if (strlen(data->fn_checkpoint_out) > 0) {
1491
+ save_checkpoint_lora_file(get_train_filename(data->fn_checkpoint_out, data->pattern_fn_it, data->fn_latest, iter).c_str(), data->model, data->lora, train);
1492
+ save_checkpoint_lora_file(get_train_filename(data->fn_checkpoint_out, data->pattern_fn_it, data->fn_latest, -1 ).c_str(), data->model, data->lora, train);
1493
+ }
1494
+ if (strlen(data->fn_lora_out) > 0) {
1495
+ save_as_llama_lora(get_train_filename(data->fn_lora_out, data->pattern_fn_it, data->fn_latest, iter).c_str(), data->lora);
1496
+ save_as_llama_lora(get_train_filename(data->fn_lora_out, data->pattern_fn_it, data->fn_latest, -1 ).c_str(), data->lora);
1497
+ }
1498
+ }
1499
+
1500
+ static int64_t get_parameter_count(struct my_llama_lora* lora) {
1501
+ int64_t nx = 0;
1502
+ nx += ggml_nelements(lora->tok_embeddings_a);
1503
+ nx += ggml_nelements(lora->tok_embeddings_b);
1504
+ nx += ggml_nelements(lora->norm_a);
1505
+ nx += ggml_nelements(lora->norm_b);
1506
+ nx += ggml_nelements(lora->output_a);
1507
+ nx += ggml_nelements(lora->output_b);
1508
+
1509
+ for (uint32_t i = 0; i < lora->layers.size(); ++i) {
1510
+ auto & layer = lora->layers[i];
1511
+ nx += ggml_nelements(layer.attention_norm_a);
1512
+ nx += ggml_nelements(layer.attention_norm_b);
1513
+ nx += ggml_nelements(layer.wq_a);
1514
+ nx += ggml_nelements(layer.wq_b);
1515
+ nx += ggml_nelements(layer.wk_a);
1516
+ nx += ggml_nelements(layer.wk_b);
1517
+ nx += ggml_nelements(layer.wv_a);
1518
+ nx += ggml_nelements(layer.wv_b);
1519
+ nx += ggml_nelements(layer.wo_a);
1520
+ nx += ggml_nelements(layer.wo_b);
1521
+ nx += ggml_nelements(layer.ffn_norm_a);
1522
+ nx += ggml_nelements(layer.ffn_norm_b);
1523
+ nx += ggml_nelements(layer.w1_a);
1524
+ nx += ggml_nelements(layer.w1_b);
1525
+ nx += ggml_nelements(layer.w2_a);
1526
+ nx += ggml_nelements(layer.w2_b);
1527
+ nx += ggml_nelements(layer.w3_a);
1528
+ nx += ggml_nelements(layer.w3_b);
1529
+ }
1530
+ return nx;
1531
+ }
1532
+
1533
+ int main(int argc, char ** argv) {
1534
+ struct train_params params = get_default_train_params();
1535
+
1536
+ if (!train_params_parse(argc, argv, &params)) {
1537
+ return 1;
1538
+ }
1539
+
1540
+ if (params.common.seed == LLAMA_DEFAULT_SEED) {
1541
+ params.common.seed = time(NULL);
1542
+ }
1543
+ printf("%s: seed: %u\n", __func__, params.common.seed);
1544
+ srand(params.common.seed);
1545
+
1546
+ struct llama_model_params llama_mparams = llama_model_default_params();
1547
+ llama_mparams.vocab_only = false;
1548
+
1549
+ printf("%s: model base = '%s'\n", __func__, params.fn_model_base);
1550
+ struct llama_model * lmodel = llama_load_model_from_file(params.fn_model_base, llama_mparams);
1551
+
1552
+ struct llama_context_params llama_cparams = llama_context_default_params();
1553
+ struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_cparams);
1554
+
1555
+ struct my_llama_model model;
1556
+ init_model(lmodel, &model, params.fn_model_base, params.common.n_ctx);
1557
+
1558
+ struct my_llama_lora lora;
1559
+
1560
+ struct train_state * train = init_train_state();
1561
+ struct ggml_opt_context * opt = train->opt;
1562
+
1563
+ // set params from command line
1564
+ if (params.custom_f_norm_rms_eps) {
1565
+ model.hparams.f_norm_rms_eps = params.f_norm_rms_eps;
1566
+ }
1567
+ if (params.custom_rope_freq_base) {
1568
+ model.hparams.rope_freq_base = params.rope_freq_base;
1569
+ }
1570
+ if (params.custom_rope_freq_scale) {
1571
+ model.hparams.rope_freq_scale = params.rope_freq_scale;
1572
+ }
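+ // lora_alpha defaults to lora_r, so the effective LoRA scale alpha/r is 1.0 unless overridden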
1573
+ lora.hparams.lora_r = params.lora_r;
1574
+ lora.hparams.lora_alpha = params.custom_lora_alpha ? params.lora_alpha : params.lora_r;
1575
+ uint32_t n_rank_attention_norm = params.custom_n_rank_attention_norm ? params.n_rank_attention_norm : 1;
1576
+ uint32_t n_rank_wq = params.custom_n_rank_wq ? params.n_rank_wq : params.lora_r;
1577
+ uint32_t n_rank_wk = params.custom_n_rank_wk ? params.n_rank_wk : params.lora_r;
1578
+ uint32_t n_rank_wv = params.custom_n_rank_wv ? params.n_rank_wv : params.lora_r;
1579
+ uint32_t n_rank_wo = params.custom_n_rank_wo ? params.n_rank_wo : params.lora_r;
1580
+ uint32_t n_rank_ffn_norm = params.custom_n_rank_ffn_norm ? params.n_rank_ffn_norm : 1;
1581
+ uint32_t n_rank_w1 = params.custom_n_rank_w1 ? params.n_rank_w1 : params.lora_r;
1582
+ uint32_t n_rank_w2 = params.custom_n_rank_w2 ? params.n_rank_w2 : params.lora_r;
1583
+ uint32_t n_rank_w3 = params.custom_n_rank_w3 ? params.n_rank_w3 : params.lora_r;
1584
+ uint32_t n_rank_tok_embeddings = params.custom_n_rank_tok_embeddings ? params.n_rank_tok_embeddings : params.lora_r;
1585
+ uint32_t n_rank_norm = params.custom_n_rank_norm ? params.n_rank_norm : 1;
1586
+ uint32_t n_rank_output = params.custom_n_rank_output ? params.n_rank_output : params.lora_r;
1587
+ lora.hparams.n_rank_attention_norm = n_rank_attention_norm;
1588
+ lora.hparams.n_rank_wq = n_rank_wq;
1589
+ lora.hparams.n_rank_wk = n_rank_wk;
1590
+ lora.hparams.n_rank_wv = n_rank_wv;
1591
+ lora.hparams.n_rank_wo = n_rank_wo;
1592
+ lora.hparams.n_rank_ffn_norm = n_rank_ffn_norm;
1593
+ lora.hparams.n_rank_w1 = n_rank_w1;
1594
+ lora.hparams.n_rank_w2 = n_rank_w2;
1595
+ lora.hparams.n_rank_w3 = n_rank_w3;
1596
+ lora.hparams.n_rank_tok_embeddings = n_rank_tok_embeddings;
1597
+ lora.hparams.n_rank_norm = n_rank_norm;
1598
+ lora.hparams.n_rank_output = n_rank_output;
1599
+
1600
+ // set opt params from command line
1601
+ opt->params = ggml_opt_default_params(GGML_OPT_ADAM);
1602
+ opt->params.print_forward_graph = false;
1603
+ opt->params.print_backward_graph = false;
1604
+ opt->params.n_threads = params.common.n_threads;
1605
+ opt->params.past = params.common.opt_past;
1606
+ opt->params.delta = params.common.opt_delta;
1607
+ opt->params.max_no_improvement = params.common.opt_max_no_improvement;
1608
+ opt->params.n_gradient_accumulation = params.common.n_gradient_accumulation;
1609
+ opt->params.adam.n_iter = params.common.adam_n_iter;
1610
+ opt->params.adam.sched = 1.0f;
1611
+ opt->params.adam.alpha = params.common.adam_alpha;
1612
+ opt->params.adam.decay = params.common.adam_decay;
1613
+ opt->params.adam.decay_min_ndim = params.common.adam_decay_min_ndim;
1614
+ opt->params.adam.beta1 = params.common.adam_beta1;
1615
+ opt->params.adam.beta2 = params.common.adam_beta2;
1616
+ opt->params.adam.gclip = params.common.adam_gclip;
1617
+ opt->params.adam.eps_f = params.common.adam_eps_f;
1618
+
1619
+ ggml_allocr * alloc = NULL;
1620
+
1621
+ printf("%s: init model\n", __func__);
1622
+ bool existed = load_checkpoint_lora_file(params.common.fn_checkpoint_in, &model, &lora, train);
1623
+
1624
+ if (existed) {
1625
+ // overwrite last n_ctx with user-provided n_ctx
1626
+ if (params.common.custom_n_ctx) {
1627
+ model.hparams.n_ctx = params.common.n_ctx;
1628
+ }
1629
+
1630
+ const bool opt_param_count_changed = (
1631
+ (lora.hparams.n_rank_attention_norm != n_rank_attention_norm)
1632
+ || (lora.hparams.n_rank_wq != n_rank_wq)
1633
+ || (lora.hparams.n_rank_wk != n_rank_wk)
1634
+ || (lora.hparams.n_rank_wv != n_rank_wv)
1635
+ || (lora.hparams.n_rank_wo != n_rank_wo)
1636
+ || (lora.hparams.n_rank_ffn_norm != n_rank_ffn_norm)
1637
+ || (lora.hparams.n_rank_w1 != n_rank_w1)
1638
+ || (lora.hparams.n_rank_w2 != n_rank_w2)
1639
+ || (lora.hparams.n_rank_w3 != n_rank_w3)
1640
+ || (lora.hparams.n_rank_tok_embeddings != n_rank_tok_embeddings)
1641
+ || (lora.hparams.n_rank_norm != n_rank_norm)
1642
+ || (lora.hparams.n_rank_output != n_rank_output)
1643
+ );
1644
+
1645
+ const bool opt_past_changed = opt->params.past != params.common.opt_past;
1646
+
1647
+ if (opt_param_count_changed) {
1648
+ print_lora_params(&lora.hparams);
1649
+ die("Provided rank differs from checkpoint file. To use different rank start finetune from scratch with empty input checkpoint, e.g --checkpoint-in ''. Aborting.");
1650
+ // need to discard previous optimizer gradient statistics and opt_init with new shapes
1651
+ // TODO
1652
+ }
1653
+ if (opt_past_changed) {
1654
+ die("Optimizer parameter '--opt-past N' differs from checkpoint file. To use different value finetune from scratch with empty input checkpoint, e.g --checkpoint-in ''. Aborting");
1655
+ // need to discard previous optimizer past function value statistics and opt_init with new shapes
1656
+ // TODO
1657
+ }
1658
+ } else { // existed == false
1659
+ init_lora(&model, &lora);
1660
+ randomize_lora(&lora, params.common.seed, 0.0f, 1.0f, -1.0f, +1.0f);
1661
+ if (!params.only_write_lora) {
1662
+ ggml_opt_init(opt->ctx, opt, opt->params, get_parameter_count(&lora));
1663
+ }
1664
+ }
1665
+ opt->iter = train->train_its;
1666
+
1667
+ print_params(&model.hparams);
1668
+ print_lora_params(&lora.hparams);
1669
+ printf("%s: total train_iterations %llu\n", __func__, (long long unsigned) train->train_its);
1670
+ printf("%s: seen train_samples %llu\n", __func__, (long long unsigned) train->train_samples);
1671
+ printf("%s: seen train_tokens %llu\n", __func__, (long long unsigned) train->train_tokens);
1672
+ printf("%s: completed train_epochs %llu\n", __func__, (long long unsigned) train->train_epochs);
1673
+ printf("%s: lora_size = %zu bytes (%.1f MB)\n", __func__, (ggml_used_mem(lora.ctx) + lora.data.size()), (float) (ggml_used_mem(lora.ctx) + lora.data.size()) / (1024.0f*1024.0f));
1674
+
1675
+ if (params.only_write_lora) {
1676
+ save_train_files_data save_data;
1677
+ save_data.fn_checkpoint_out = "";
1678
+ save_data.fn_lora_out = params.fn_lora_out;
1679
+ save_data.pattern_fn_it = params.common.pattern_fn_it;
1680
+ save_data.fn_latest = params.common.fn_latest;
1681
+ save_data.model = &model;
1682
+ save_data.lora = &lora;
1683
+
1684
+ save_train_files(&save_data, train);
1685
+
1686
+ free_train_state(train);
1687
+ ggml_free(lora.ctx);
1688
+ llama_free(lctx);
1689
+ llama_free_model(lmodel);
1690
+ return 0;
1691
+ }
1692
+
1693
+ printf("%s: opt_size = %zu bytes (%.1f MB)\n", __func__, ggml_get_mem_size(opt->ctx), (float) ggml_get_mem_size(opt->ctx) / (1024.0f*1024.0f));
1694
+ printf("%s: opt iter %d\n", __func__, opt->iter);
1695
+
1696
+ int n_tokens = model.hparams.n_ctx;
1697
+ int n_vocab = model.hparams.n_vocab;
1698
+ int n_batch = params.common.n_batch;
1699
+
1700
+
1701
+ std::vector<uint8_t> mem_input_data;
1702
+ std::vector<uint8_t> mem_compute_data;
1703
+
1704
+ // context for input tensors without their data
1705
+ struct ggml_init_params ctx_input_params = {
1706
+ ggml_tensor_overhead() * 2, // mem_size
1707
+ NULL, // mem_buffer
1708
+ true, // no_alloc
1709
+ };
1710
+ struct ggml_context * ctx_input = ggml_init(ctx_input_params);
1711
+
1712
+ // the input tensors
1713
+ struct ggml_tensor * tokens_input = ggml_new_tensor_2d(ctx_input, GGML_TYPE_I32, n_tokens, n_batch);
1714
+ struct ggml_tensor * target_probs = ggml_new_tensor_3d(ctx_input, GGML_TYPE_F32, n_vocab, n_tokens, n_batch);
1715
+
1716
+ // measure required memory for input tensors
1717
+ alloc = ggml_allocr_new_measure(tensor_alignment);
1718
+ ggml_allocr_alloc(alloc, tokens_input);
1719
+ ggml_allocr_alloc(alloc, target_probs);
1720
+ size_t max_input_size = ggml_allocr_max_size(alloc) + tensor_alignment;
1721
+ ggml_allocr_free(alloc);
1722
+ printf("%s: input_size = %zu bytes (%.1f MB)\n", __func__, max_input_size, (float) max_input_size / (1024.0f*1024.0f));
1723
+
1724
+ // allocate input tensors
1725
+ mem_input_data.resize(max_input_size);
1726
+ alloc = ggml_allocr_new(mem_input_data.data(), mem_input_data.size(), tensor_alignment);
1727
+ ggml_allocr_alloc(alloc, tokens_input);
1728
+ ggml_allocr_alloc(alloc, target_probs);
1729
+ ggml_allocr_free(alloc);
1730
+
1731
+ // context for compute tensors without their data
1732
+ size_t estimated_compute_size_wo_data = (
1733
+ ggml_tensor_overhead()*GGML_MAX_NODES*2
1734
+ + (GGML_OBJECT_SIZE+GGML_GRAPH_SIZE)*(
1735
+ params.common.use_checkpointing ? 3 : 2
1736
+ )
1737
+ );
1738
+ struct ggml_init_params ctx_compute_params = {
1739
+ estimated_compute_size_wo_data, // mem_size
1740
+ NULL, // mem_buffer
1741
+ true, // no_alloc
1742
+ };
1743
+ struct ggml_context * ctx_compute = NULL;
1744
+
1745
+ struct ggml_tensor * loss = NULL;
1746
+ struct ggml_tensor * logits = NULL;
1747
+
1748
+ struct ggml_cgraph * gf = NULL;
1749
+ struct ggml_cgraph * gb = NULL;
1750
+ struct ggml_cgraph * gb_tmp = NULL;
1751
+
1752
+ // measure required memory for compute tensors
1753
+ size_t best_compute_size = SIZE_MAX;
1754
+ enum ggml_cgraph_eval_order best_order = GGML_CGRAPH_EVAL_ORDER_COUNT;
1755
+ // find best evaluation order
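+ // (each candidate order builds the full graphs under a measuring allocator;
+ // the order with the smallest peak allocation is kept and rebuilt for real below)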
1756
+ for (unsigned order = 0; order < (unsigned) GGML_CGRAPH_EVAL_ORDER_COUNT; ++order) {
1757
+ ctx_compute = ggml_init(ctx_compute_params);
1758
+ alloc = ggml_allocr_new_measure(tensor_alignment);
1759
+ gf = ggml_new_graph(ctx_compute);
1760
+ gf->order = (enum ggml_cgraph_eval_order) order;
1761
+ gb = ggml_new_graph(ctx_compute);
1762
+ gb_tmp = params.common.use_checkpointing
1763
+ ? ggml_new_graph(ctx_compute)
1764
+ : NULL;
1765
+ loss = llama_build_lora_finetune_graphs(
1766
+ &model, &lora, alloc, ctx_compute,
1767
+ gf, gb, gb_tmp,
1768
+ &logits, tokens_input, target_probs,
1769
+ n_tokens, n_batch,
1770
+ params.common.use_flash,
1771
+ params.common.use_checkpointing
1772
+ );
1773
+ size_t max_compute_size = ggml_allocr_max_size(alloc) + tensor_alignment;
1774
+ if (max_compute_size < best_compute_size) {
1775
+ best_compute_size = max_compute_size;
1776
+ best_order = gf->order;
1777
+ }
1778
+ ggml_allocr_free(alloc);
1779
+ ggml_free(ctx_compute);
1780
+ }
1781
+ size_t max_compute_size = best_compute_size;
1782
+ printf("%s: compute_size = %zu bytes (%.1f MB)\n", __func__, max_compute_size, (float) max_compute_size / (1024.0f*1024.0f));
1783
+ printf("%s: evaluation order = %s\n", __func__,
1784
+ (best_order == GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT) ? "LEFT_TO_RIGHT" :
1785
+ (best_order == GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT) ? "RIGHT_TO_LEFT" :
1786
+ "invalid");
1787
+
1788
+ // allocate compute tensors
1789
+ mem_compute_data.resize(max_compute_size);
1790
+ ctx_compute = ggml_init(ctx_compute_params);
1791
+ alloc = ggml_allocr_new(mem_compute_data.data(), mem_compute_data.size(), tensor_alignment);
1792
+ gf = ggml_new_graph(ctx_compute);
1793
+ gf->order = best_order;
1794
+ gb = ggml_new_graph(ctx_compute);
1795
+ gb_tmp = params.common.use_checkpointing
1796
+ ? ggml_new_graph(ctx_compute)
1797
+ : NULL;
1798
+ loss = llama_build_lora_finetune_graphs(
1799
+ &model, &lora, alloc, ctx_compute,
1800
+ gf, gb, gb_tmp,
1801
+ &logits, tokens_input, target_probs,
1802
+ n_tokens, n_batch,
1803
+ params.common.use_flash,
1804
+ params.common.use_checkpointing
1805
+ );
1806
+ ggml_allocr_free(alloc);
1807
+
1808
+ // tokenize data
1809
+ std::vector<llama_token> train_tokens;
1810
+ std::vector<size_t> train_samples_begin;
1811
+ std::vector<size_t> train_samples_size;
1812
+ printf("%s: tokenize training data\n", __func__);
1813
+ tokenize_file(lctx,
1814
+ params.common.fn_train_data,
1815
+ params.common.sample_start,
1816
+ params.common.include_sample_start,
1817
+ params.common.overlapping_samples,
1818
+ n_tokens,
1819
+ train_tokens,
1820
+ train_samples_begin,
1821
+ train_samples_size);
1822
+ GGML_ASSERT(train_samples_begin.size() == train_samples_size.size());
1823
+
1824
+ printf("%s: number of training tokens: %zu\n", __func__, train_tokens.size());
1825
+
1826
+ std::vector<size_t> token_noccurs;
1827
+ token_noccurs.resize(model.hparams.n_vocab, 0);
1828
+ for (unsigned int i = 0; i < train_tokens.size(); ++i) {
1829
+ ++token_noccurs[train_tokens[i]];
1830
+ }
1831
+ int n_unique_tokens = 0;
1832
+ for (unsigned int i = 0; i < token_noccurs.size(); ++i) {
1833
+ if (token_noccurs[i] == 0) continue;
1834
+ ++n_unique_tokens;
1835
+ }
1836
+ printf("%s: number of unique tokens: %d\n", __func__, n_unique_tokens);
1837
+
1838
+ size_t shuffle_samples_hash = compute_samples_hash(params.common.fn_train_data, train_samples_begin.data(), train_samples_size.data(), train_samples_size.size());
1839
+ const bool changed_train_data = (shuffle_samples_hash != train->shuffle_samples_hash) || (train->shuffle_sample_count != train_samples_size.size());
1840
+ if (changed_train_data) {
1841
+ printf("%s: train data seems to have changed. restarting shuffled epoch.\n", __func__);
1842
+ }
1843
+ if (params.common.force_reshuffle) {
1844
+ printf("%s: forced reshuffling of data. restarting with newly shuffled epoch.\n", __func__);
1845
+ }
1846
+ if ((train->shuffle_rng_state_current == "") || changed_train_data || params.common.force_reshuffle) {
1847
+ train->shuffle_rng_state_current = mt19937_seed_to_state(params.common.seed);
1848
+ train->shuffle_sample_count = train_samples_size.size();
1849
+ train->shuffle_next_sample = 0;
1850
+ train->shuffle_samples_hash = shuffle_samples_hash;
1851
+ }
1852
+ std::vector<size_t> train_shuffled_samples_offs;
1853
+ std::vector<size_t> train_shuffled_samples_begin;
1854
+ std::vector<size_t> train_shuffled_samples_size;
1855
+ train_shuffled_samples_offs.resize(train_samples_begin.size());
1856
+ train_shuffled_samples_begin.resize(train_samples_begin.size());
1857
+ train_shuffled_samples_size.resize(train_samples_size.size());
1858
+ train->shuffle_rng_state_next = shuffle_samples(
1859
+ train->shuffle_rng_state_current,
1860
+ train_shuffled_samples_offs.data(),
1861
+ train_shuffled_samples_begin.data(),
1862
+ train_shuffled_samples_size.data(),
1863
+ train_samples_begin.data(),
1864
+ train_samples_size.data(),
1865
+ train_samples_size.size());
1866
+
1867
+ printf("%s: begin training\n", __func__);
1868
+
1869
+ save_train_files_data save_data;
1870
+ save_data.fn_checkpoint_out = params.common.fn_checkpoint_out;
1871
+ save_data.fn_lora_out = params.fn_lora_out;
1872
+ save_data.pattern_fn_it = params.common.pattern_fn_it;
1873
+ save_data.fn_latest = params.common.fn_latest;
1874
+ save_data.model = &model;
1875
+ save_data.lora = &lora;
1876
+
1877
+ struct train_opt_callback_data opt_cb_data;
1878
+ opt_cb_data.params = &params.common;
1879
+ opt_cb_data.train = train;
1880
+ opt_cb_data.save_cb = &save_train_files;
1881
+ opt_cb_data.save_data = &save_data;
1882
+ opt_cb_data.lctx = lctx;
1883
+ opt_cb_data.last_save_iter = opt->iter;
1884
+ opt_cb_data.tokens_data = train_tokens.data();
1885
+ opt_cb_data.tokens_size = train_tokens.size();
1886
+ opt_cb_data.samples_begin = train_samples_begin.data();
1887
+ opt_cb_data.samples_size = train_samples_size.data();
1888
+ opt_cb_data.shuffled_samples_offs = train_shuffled_samples_offs.data();
1889
+ opt_cb_data.shuffled_samples_begin = train_shuffled_samples_begin.data();
1890
+ opt_cb_data.shuffled_samples_size = train_shuffled_samples_size.data();
1891
+ opt_cb_data.samples_count = train_samples_size.size();
1892
+ opt_cb_data.tokens_input = tokens_input;
1893
+ opt_cb_data.target_probs = target_probs;
1894
+ opt_cb_data.first_iter = opt->iter;
1895
+ opt_cb_data.first_epoch = train->train_epochs;
1896
+ opt_cb_data.iter_at_last_epoch = -1;
1897
+ opt_cb_data.last_time = ggml_time_ms();
1898
+ opt_cb_data.millis_per_iter = 0.0;
1899
+
1900
+ // measure required memory for work buffer
1901
+ size_t max_work_size = ggml_graph_plan(gb, params.common.n_threads).work_size + GGML_OBJECT_SIZE;
1902
+ printf("%s: work_size = %zu bytes (%.1f MB)\n", __func__, max_work_size, (float) max_work_size / (1024.0f*1024.0f));
1903
+
1904
+ // context for work buffer
1905
+ struct ggml_init_params ctx_work_params = {
1906
+ max_work_size, // mem_size
1907
+ NULL, // mem_buffer
1908
+ false, // no_alloc
1909
+ };
1910
+ struct ggml_context * ctx_work = ggml_init(ctx_work_params);
1911
+
1912
+ int64_t t0 = ggml_time_ms();
1913
+
1914
+ ggml_opt_resume_g(ctx_work, opt, loss, gf, gb, &train_opt_callback, (void *) &opt_cb_data);
1915
+
1916
+ ggml_free(ctx_work);
1917
+ ggml_free(ctx_compute);
1918
+ ggml_free(ctx_input);
1919
+
1920
+ int64_t t1 = ggml_time_ms();
1921
+ printf("%s: total training time: ", __func__);
1922
+ print_duration((double) (t1 - t0));
1923
+ printf("\n");
1924
+
1925
+ int new_iters = opt->iter - opt_cb_data.last_save_iter;
1926
+ if (new_iters > 0) {
1927
+ train->train_its += new_iters;
1928
+ train->train_tokens += new_iters * opt->params.n_gradient_accumulation * n_batch * n_tokens;
1929
+
1930
+ save_train_files(&save_data, train);
1931
+ opt_cb_data.last_save_iter = opt->iter;
1932
+ }
1933
+
1934
+ ggml_free(opt->ctx);
1935
+ free_train_state(train);
1936
+ ggml_free(lora.ctx);
1937
+ llama_free(lctx);
1938
+ llama_free_model(lmodel);
1939
+ return 0;
1940
+ }
examples/gptneox-wip/falcon-main.cpp CHANGED
@@ -367,10 +367,10 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt2bpe_
367
  keyidx = gguf_find_key(ggufctx, "general.architecture");
368
  if (keyidx != -1) { printf("%s: model architecture = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
369
  keyidx = gguf_find_key(ggufctx, "general.file_type");
370
- if (keyidx != -1) { printf("%s: model file type = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
371
  keyidx = gguf_find_key(ggufctx, "gptneox.tensor_data_layout");
372
  if (keyidx != -1) { printf("%s: model data layout = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
373
- keyidx = gguf_find_key(ggufctx, "general.source.hugginface.repository");
374
  if (keyidx != -1) { printf("%s: model source HF repo = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
375
  }
376
 
 
367
  keyidx = gguf_find_key(ggufctx, "general.architecture");
368
  if (keyidx != -1) { printf("%s: model architecture = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
369
  keyidx = gguf_find_key(ggufctx, "general.file_type");
370
+ if (keyidx != -1) { printf("%s: model file type = %" PRIu32 "\n", __func__, gguf_get_val_u32(ggufctx, keyidx)); }
371
  keyidx = gguf_find_key(ggufctx, "gptneox.tensor_data_layout");
372
  if (keyidx != -1) { printf("%s: model data layout = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
373
+ keyidx = gguf_find_key(ggufctx, "general.source.huggingface.repository");
374
  if (keyidx != -1) { printf("%s: model source HF repo = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
375
  }
376
 
examples/gptneox-wip/gptneox-main.cpp CHANGED
@@ -380,10 +380,10 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2
380
  keyidx = gguf_find_key(ggufctx, "general.architecture");
381
  if (keyidx != -1) { printf("%s: model architecture = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
382
  keyidx = gguf_find_key(ggufctx, "general.file_type");
383
- if (keyidx != -1) { printf("%s: model file type = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
384
  keyidx = gguf_find_key(ggufctx, "gptneox.tensor_data_layout");
385
  if (keyidx != -1) { printf("%s: model data layout = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
386
- keyidx = gguf_find_key(ggufctx, "general.source.hugginface.repository");
387
  if (keyidx != -1) { printf("%s: model source HF repo = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
388
  }
389
 
 
380
  keyidx = gguf_find_key(ggufctx, "general.architecture");
381
  if (keyidx != -1) { printf("%s: model architecture = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
382
  keyidx = gguf_find_key(ggufctx, "general.file_type");
383
+ if (keyidx != -1) { printf("%s: model file type = %" PRIu32 "\n", __func__, gguf_get_val_u32(ggufctx, keyidx)); }
384
  keyidx = gguf_find_key(ggufctx, "gptneox.tensor_data_layout");
385
  if (keyidx != -1) { printf("%s: model data layout = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
386
+ keyidx = gguf_find_key(ggufctx, "general.source.huggingface.repository");
387
  if (keyidx != -1) { printf("%s: model source HF repo = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
388
  }
389
 
examples/llama-bench/README.md ADDED
@@ -0,0 +1,271 @@
1
+ # llama.cpp/examples/llama-bench
2
+
3
+ Performance testing tool for llama.cpp.
4
+
5
+ ## Table of contents
6
+
7
+ 1. [Syntax](#syntax)
8
+ 2. [Examples](#examples)
9
+ 1. [Text generation with different models](#text-generation-with-different-models)
10
+ 2. [Prompt processing with different batch sizes](#prompt-processing-with-different-batch-sizes)
11
+ 3. [Different numbers of threads](#different-numbers-of-threads)
12
+ 4. [Different numbers of layers offloaded to the GPU](#different-numbers-of-layers-offloaded-to-the-gpu)
13
+ 3. [Output formats](#output-formats)
14
+ 1. [Markdown](#markdown)
15
+ 2. [CSV](#csv)
16
+ 3. [JSON](#json)
17
+ 4. [SQL](#sql)
18
+
19
+ ## Syntax
20
+
21
+ ```
22
+ usage: ./llama-bench [options]
23
+
24
+ options:
25
+ -h, --help
26
+ -m, --model <filename> (default: models/7B/ggml-model-q4_0.gguf)
27
+ -p, --n-prompt <n> (default: 512)
28
+ -n, --n-gen <n> (default: 128)
29
+ -b, --batch-size <n> (default: 512)
30
+ --memory-f32 <0|1> (default: 0)
31
+ -t, --threads <n> (default: 16)
32
+ -ngl N, --n-gpu-layers <n> (default: 99)
33
+ -mg i, --main-gpu <i> (default: 0)
34
+ -mmq, --mul-mat-q <0|1> (default: 1)
35
+ -ts, --tensor_split <ts0/ts1/..>
36
+ -r, --repetitions <n> (default: 5)
37
+ -o, --output <csv|json|md|sql> (default: md)
38
+ -v, --verbose (default: 0)
39
+
40
+ Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.
41
+ ```
42
+
43
+ llama-bench can perform two types of tests:
44
+
45
+ - Prompt processing (pp): processing a prompt in batches (`-p`)
46
+ - Text generation (tg): generating a sequence of tokens (`-n`)
47
+
48
+ With the exception of `-r`, `-o` and `-v`, all options can be specified multiple times to run multiple tests. Each pp and tg test is run with all combinations of the specified options. To specify multiple values for an option, the values can be separated by commas (e.g. `-n 16,32`), or the option can be specified multiple times (e.g. `-n 16 -n 32`).
49
+
50
+ Each test is repeated the number of times given by `-r`, and the results are averaged. The results are given in average tokens per second (t/s) and standard deviation. Some output formats (e.g. json) also include the individual results of each repetition.
51
+
52
+ For a description of the other options, see the [main example](../main/README.md).
53
+
54
+ ## Examples
55
+
56
+ ### Text generation with different models
57
+
58
+ ```sh
59
+ $ ./llama-bench -m models/7B/ggml-model-q4_0.gguf -m models/13B/ggml-model-q4_0.gguf -p 0 -n 128,256,512
60
+ ```
61
+
62
+ | model | size | params | backend | ngl | test | t/s |
63
+ | ------------------------------ | ---------: | ---------: | ---------- | --: | ---------- | ---------------: |
64
+ | llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 128 | 132.19 ± 0.55 |
65
+ | llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 256 | 129.37 ± 0.54 |
66
+ | llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 512 | 123.83 ± 0.25 |
67
+ | llama 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | 99 | tg 128 | 82.17 ± 0.31 |
68
+ | llama 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | 99 | tg 256 | 80.74 ± 0.23 |
69
+ | llama 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | 99 | tg 512 | 78.08 ± 0.07 |
70
+
71
+ ### Prompt processing with different batch sizes
72
+
73
+ ```sh
74
+ $ ./llama-bench -n 0 -p 1024 -b 128,256,512,1024
75
+ ```
76
+
77
+ | model | size | params | backend | ngl | n_batch | test | t/s |
78
+ | ------------------------------ | ---------: | ---------: | ---------- | --: | ---------: | ---------- | ---------------: |
79
+ | llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 128 | pp 1024 | 1436.51 ± 3.66 |
80
+ | llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 256 | pp 1024 | 1932.43 ± 23.48 |
81
+ | llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 512 | pp 1024 | 2254.45 ± 15.59 |
82
+ | llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 1024 | pp 1024 | 2498.61 ± 13.58 |
83
+
84
+ ### Different numbers of threads
85
+
86
+ ```sh
87
+ $ ./llama-bench -n 0 -n 16 -p 64 -t 1,2,4,8,16,32
88
+ ```
89
+
90
+ | model | size | params | backend | threads | test | t/s |
91
+ | ------------------------------ | ---------: | ---------: | ---------- | ---------: | ---------- | ---------------: |
92
+ | llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 1 | pp 64 | 6.17 ± 0.07 |
93
+ | llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 1 | tg 16 | 4.05 ± 0.02 |
94
+ | llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 2 | pp 64 | 12.31 ± 0.13 |
95
+ | llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 2 | tg 16 | 7.80 ± 0.07 |
96
+ | llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 4 | pp 64 | 23.18 ± 0.06 |
97
+ | llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 4 | tg 16 | 12.22 ± 0.07 |
98
+ | llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 8 | pp 64 | 32.29 ± 1.21 |
99
+ | llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 8 | tg 16 | 16.71 ± 0.66 |
100
+ | llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 16 | pp 64 | 33.52 ± 0.03 |
101
+ | llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 16 | tg 16 | 15.32 ± 0.05 |
102
+ | llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 32 | pp 64 | 59.00 ± 1.11 |
103
+ | llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 32 | tg 16 | 16.41 ± 0.79 |
104
+
105
+ ### Different numbers of layers offloaded to the GPU
106
+
107
+ ```sh
108
+ $ ./llama-bench -ngl 10,20,30,31,32,33,34,35
109
+ ```
110
+
111
+ | model | size | params | backend | ngl | test | t/s |
112
+ | ------------------------------ | ---------: | ---------: | ---------- | --: | ---------- | ---------------: |
113
+ | llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 10 | pp 512 | 373.36 ± 2.25 |
114
+ | llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 10 | tg 128 | 13.45 ± 0.93 |
115
+ | llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 20 | pp 512 | 472.65 ± 1.25 |
116
+ | llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 20 | tg 128 | 21.36 ± 1.94 |
117
+ | llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 30 | pp 512 | 631.87 ± 11.25 |
118
+ | llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 30 | tg 128 | 40.04 ± 1.82 |
119
+ | llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 31 | pp 512 | 657.89 ± 5.08 |
120
+ | llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 31 | tg 128 | 48.19 ± 0.81 |
121
+ | llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 32 | pp 512 | 688.26 ± 3.29 |
122
+ | llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 32 | tg 128 | 54.78 ± 0.65 |
123
+ | llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 33 | pp 512 | 704.27 ± 2.24 |
124
+ | llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 33 | tg 128 | 60.62 ± 1.76 |
125
+ | llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 34 | pp 512 | 881.34 ± 5.40 |
126
+ | llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 34 | tg 128 | 71.76 ± 0.23 |
127
+ | llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 35 | pp 512 | 2400.01 ± 7.72 |
128
+ | llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 35 | tg 128 | 131.66 ± 0.49 |
129
+
130
+ ## Output formats
131
+
132
+ By default, llama-bench outputs the results in markdown format. The results can be output in other formats by using the `-o` option.
133
+
134
+ ### Markdown
135
+
136
+ ```sh
137
+ $ ./llama-bench -o md
138
+ ```
139
+
140
+ | model | size | params | backend | ngl | test | t/s |
141
+ | ------------------------------ | ---------: | ---------: | ---------- | --: | ---------- | ---------------: |
142
+ | llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | pp 512 | 2368.80 ± 93.24 |
143
+ | llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 128 | 131.42 ± 0.59 |
144
+
145
+ ### CSV
146
+
147
+ ```sh
148
+ $ ./llama-bench -o csv
149
+ ```
150
+
151
+ ```csv
152
+ build_commit,build_number,cuda,opencl,metal,gpu_blas,blas,cpu_info,gpu_info,model_filename,model_type,model_size,model_n_params,n_batch,n_threads,f16_kv,n_gpu_layers,main_gpu,mul_mat_q,tensor_split,n_prompt,n_gen,test_time,avg_ns,stddev_ns,avg_ts,stddev_ts
153
+ "3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","llama 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","512","0","2023-09-23T12:09:01Z","212155977","732372","2413.341687","8.305961"
154
+ "3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","llama 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","0","128","2023-09-23T12:09:02Z","969320879","2728399","132.052051","0.371342"
155
+ ```
156
+
157
+ ### JSON
158
+
159
+ ```sh
160
+ $ ./llama-bench -o json
161
+ ```
162
+
163
+ ```json
164
+ [
165
+ {
166
+ "build_commit": "3469684",
167
+ "build_number": 1275,
168
+ "cuda": true,
169
+ "opencl": false,
170
+ "metal": false,
171
+ "gpu_blas": true,
172
+ "blas": true,
173
+ "cpu_info": "13th Gen Intel(R) Core(TM) i9-13900K",
174
+ "gpu_info": "NVIDIA GeForce RTX 3090 Ti",
175
+ "model_filename": "models/7B/ggml-model-q4_0.gguf",
176
+ "model_type": "llama 7B mostly Q4_0",
177
+ "model_size": 3825065984,
178
+ "model_n_params": 6738415616,
179
+ "n_batch": 512,
180
+ "n_threads": 16,
181
+ "f16_kv": true,
182
+ "n_gpu_layers": 99,
183
+ "main_gpu": 0,
184
+ "mul_mat_q": true,
185
+ "tensor_split": "0.00",
186
+ "n_prompt": 512,
187
+ "n_gen": 0,
188
+ "test_time": "2023-09-23T12:09:57Z",
189
+ "avg_ns": 212365953,
190
+ "stddev_ns": 985423,
191
+ "avg_ts": 2410.974041,
192
+ "stddev_ts": 11.163766,
193
+ "samples_ns": [ 213837238, 211635853, 212328053, 211329715, 212698907 ],
194
+ "samples_ts": [ 2394.34, 2419.25, 2411.36, 2422.75, 2407.16 ]
195
+ },
196
+ {
197
+ "build_commit": "3469684",
198
+ "build_number": 1275,
199
+ "cuda": true,
200
+ "opencl": false,
201
+ "metal": false,
202
+ "gpu_blas": true,
203
+ "blas": true,
204
+ "cpu_info": "13th Gen Intel(R) Core(TM) i9-13900K",
205
+ "gpu_info": "NVIDIA GeForce RTX 3090 Ti",
206
+ "model_filename": "models/7B/ggml-model-q4_0.gguf",
207
+ "model_type": "llama 7B mostly Q4_0",
208
+ "model_size": 3825065984,
209
+ "model_n_params": 6738415616,
210
+ "n_batch": 512,
211
+ "n_threads": 16,
212
+ "f16_kv": true,
213
+ "n_gpu_layers": 99,
214
+ "main_gpu": 0,
215
+ "mul_mat_q": true,
216
+ "tensor_split": "0.00",
217
+ "n_prompt": 0,
218
+ "n_gen": 128,
219
+ "test_time": "2023-09-23T12:09:59Z",
220
+ "avg_ns": 977425219,
221
+ "stddev_ns": 9268593,
222
+ "avg_ts": 130.965708,
223
+ "stddev_ts": 1.238924,
224
+ "samples_ns": [ 984472709, 974901233, 989474741, 970729355, 967548060 ],
225
+ "samples_ts": [ 130.019, 131.295, 129.362, 131.86, 132.293 ]
226
+ }
227
+ ]
228
+ ```
229
+
230
+ ### SQL
231
+
232
+ SQL output is suitable for importing into a SQLite database. The generated statements can be piped directly into the `sqlite3` command line tool; see the usage example after the sample output below.
233
+
234
+ ```sh
235
+ $ ./llama-bench -o sql
236
+ ```
237
+
238
+ ```sql
239
+ CREATE TABLE IF NOT EXISTS test (
240
+ build_commit TEXT,
241
+ build_number INTEGER,
242
+ cuda INTEGER,
243
+ opencl INTEGER,
244
+ metal INTEGER,
245
+ gpu_blas INTEGER,
246
+ blas INTEGER,
247
+ cpu_info TEXT,
248
+ gpu_info TEXT,
249
+ model_filename TEXT,
250
+ model_type TEXT,
251
+ model_size INTEGER,
252
+ model_n_params INTEGER,
253
+ n_batch INTEGER,
254
+ n_threads INTEGER,
255
+ f16_kv INTEGER,
256
+ n_gpu_layers INTEGER,
257
+ main_gpu INTEGER,
258
+ mul_mat_q INTEGER,
259
+ tensor_split TEXT,
260
+ n_prompt INTEGER,
261
+ n_gen INTEGER,
262
+ test_time TEXT,
263
+ avg_ns INTEGER,
264
+ stddev_ns INTEGER,
265
+ avg_ts REAL,
266
+ stddev_ts REAL
267
+ );
268
+
269
+ INSERT INTO test (build_commit, build_number, cuda, opencl, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '512', '0', '2023-09-23T12:10:30Z', '212693772', '743623', '2407.240204', '8.409634');
270
+ INSERT INTO test (build_commit, build_number, cuda, opencl, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '0', '128', '2023-09-23T12:10:31Z', '977925003', '4037361', '130.891159', '0.537692');
271
+ ```
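
For example, to collect results into a local database and query them afterwards (the database filename here is arbitrary):

```sh
$ ./llama-bench -o sql | sqlite3 llama-bench.sqlite
$ sqlite3 llama-bench.sqlite 'SELECT model_type, n_prompt, n_gen, avg_ts, stddev_ts FROM test;'
```

Because the generated script uses `CREATE TABLE IF NOT EXISTS`, repeated runs append additional rows to the same `test` table.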
examples/llama-bench/llama-bench.cpp CHANGED
@@ -132,7 +132,6 @@ struct cmd_params {
132
  std::vector<int> n_gpu_layers;
133
  std::vector<int> main_gpu;
134
  std::vector<bool> mul_mat_q;
135
- std::vector<bool> low_vram;
136
  std::vector<std::array<float, LLAMA_MAX_DEVICES>> tensor_split;
137
  int reps;
138
  bool verbose;
@@ -149,7 +148,6 @@ static const cmd_params cmd_params_defaults = {
149
  /* n_gpu_layers */ {99},
150
  /* main_gpu */ {0},
151
  /* mul_mat_q */ {true},
152
- /* low_vram */ {false},
153
  /* tensor_split */ {{}},
154
  /* reps */ 5,
155
  /* verbose */ false,
@@ -167,9 +165,8 @@ static void print_usage(int /* argc */, char ** argv) {
167
  printf(" -b, --batch-size <n> (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
168
  printf(" --memory-f32 <0|1> (default: %s)\n", join(cmd_params_defaults.f32_kv, ",").c_str());
169
  printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
170
- printf(" -ngl N, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
171
- printf(" -mg i, --main-gpu <n> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
172
- printf(" -lv, --low-vram <0|1> (default: %s)\n", join(cmd_params_defaults.low_vram, ",").c_str());
173
  printf(" -mmq, --mul-mat-q <0|1> (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str());
174
  printf(" -ts, --tensor_split <ts0/ts1/..> \n");
175
  printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
@@ -255,13 +252,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
255
  break;
256
  }
257
  params.main_gpu = split<int>(argv[i], split_delim);
258
- } else if (arg == "-lv" || arg == "--low-vram") {
259
- if (++i >= argc) {
260
- invalid_param = true;
261
- break;
262
- }
263
- auto p = split<bool>(argv[i], split_delim);
264
- params.low_vram.insert(params.low_vram.end(), p.begin(), p.end());
265
  } else if (arg == "-mmq" || arg == "--mul-mat-q") {
266
  if (++i >= argc) {
267
  invalid_param = true;
@@ -336,7 +326,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
336
  if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; }
337
  if (params.main_gpu.empty()) { params.main_gpu = cmd_params_defaults.main_gpu; }
338
  if (params.mul_mat_q.empty()) { params.mul_mat_q = cmd_params_defaults.mul_mat_q; }
339
- if (params.low_vram.empty()) { params.low_vram = cmd_params_defaults.low_vram; }
340
  if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; }
341
  if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; }
342
 
@@ -353,21 +342,34 @@ struct cmd_params_instance {
353
  int n_gpu_layers;
354
  int main_gpu;
355
  bool mul_mat_q;
356
- bool low_vram;
357
  std::array<float, LLAMA_MAX_DEVICES> tensor_split;
358
 
359
- llama_context_params to_llama_params() const {
360
- llama_context_params lparams = llama_context_default_params();
361
- lparams.n_ctx = n_prompt + n_gen;
362
- lparams.n_batch = n_batch;
363
- lparams.f16_kv = !f32_kv;
364
- lparams.n_gpu_layers = n_gpu_layers;
365
- lparams.main_gpu = main_gpu;
366
- lparams.mul_mat_q = mul_mat_q;
367
- lparams.low_vram = low_vram;
368
- lparams.tensor_split = tensor_split.data();
 
 
 
 
 
 
 
 
 
 
 
 
 
 
369
 
370
- return lparams;
371
  }
372
  };
373
 
@@ -375,13 +377,12 @@ static std::vector<cmd_params_instance> get_cmd_params_instances_int(const cmd_p
375
  std::vector<cmd_params_instance> instances;
376
 
377
  for (const auto & m : params.model)
378
- for (const auto & nb : params.n_batch)
379
- for (const auto & fk : params.f32_kv)
380
  for (const auto & nl : params.n_gpu_layers)
381
  for (const auto & mg : params.main_gpu)
382
- for (const auto & mmq : params.mul_mat_q)
383
- for (const auto & lv : params.low_vram)
384
  for (const auto & ts : params.tensor_split)
 
 
 
385
  for (const auto & nt : params.n_threads) {
386
  cmd_params_instance instance = {
387
  /* .model = */ m,
@@ -393,7 +394,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances_int(const cmd_p
393
  /* .n_gpu_layers = */ nl,
394
  /* .main_gpu = */ mg,
395
  /* .mul_mat_q = */ mmq,
396
- /* .low_vram = */ lv,
397
  /* .tensor_split = */ ts,
398
  };
399
  instances.push_back(instance);
@@ -404,6 +404,56 @@ static std::vector<cmd_params_instance> get_cmd_params_instances_int(const cmd_p
404
  static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_params & params) {
405
  std::vector<cmd_params_instance> instances;
406
 
407
  for (const auto & n_prompt : params.n_prompt) {
408
  if (n_prompt == 0) {
409
  continue;
@@ -419,6 +469,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
419
  auto instances_gen = get_cmd_params_instances_int(params, n_gen, 0);
420
  instances.insert(instances.end(), instances_gen.begin(), instances_gen.end());
421
  }
 
422
 
423
  return instances;
424
  }
@@ -443,7 +494,6 @@ struct test {
443
  int n_gpu_layers;
444
  int main_gpu;
445
  bool mul_mat_q;
446
- bool low_vram;
447
  std::array<float, LLAMA_MAX_DEVICES> tensor_split;
448
  int n_prompt;
449
  int n_gen;
@@ -463,7 +513,6 @@ struct test {
463
  n_gpu_layers = inst.n_gpu_layers;
464
  main_gpu = inst.main_gpu;
465
  mul_mat_q = inst.mul_mat_q;
466
- low_vram = inst.low_vram;
467
  tensor_split = inst.tensor_split;
468
  n_prompt = inst.n_prompt;
469
  n_gen = inst.n_gen;
@@ -524,7 +573,7 @@ struct test {
524
  "cpu_info", "gpu_info",
525
  "model_filename", "model_type", "model_size", "model_n_params",
526
  "n_batch", "n_threads", "f16_kv",
527
- "n_gpu_layers", "main_gpu", "mul_mat_q", "low_vram", "tensor_split",
528
  "n_prompt", "n_gen", "test_time",
529
  "avg_ns", "stddev_ns",
530
  "avg_ts", "stddev_ts"
@@ -543,7 +592,7 @@ struct test {
543
  return INT;
544
  }
545
  if (field == "cuda" || field == "opencl" || field == "metal" || field == "gpu_blas" || field == "blas" ||
546
- field == "f16_kv" || field == "mul_mat_q" || field == "low_vram") {
547
  return BOOL;
548
  }
549
  if (field == "avg_ts" || field == "stddev_ts") {
@@ -574,7 +623,7 @@ struct test {
574
  cpu_info, gpu_info,
575
  model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
576
  std::to_string(n_batch), std::to_string(n_threads), std::to_string(!f32_kv),
577
- std::to_string(n_gpu_layers), std::to_string(main_gpu), std::to_string(mul_mat_q), std::to_string(low_vram), tensor_split_str,
578
  std::to_string(n_prompt), std::to_string(n_gen), test_time,
579
  std::to_string(avg_ns()), std::to_string(stdev_ns()),
580
  std::to_string(avg_ts()), std::to_string(stdev_ts())
@@ -606,9 +655,9 @@ struct printer {
606
  virtual ~printer() {}
607
 
608
  FILE * fout;
609
- virtual void print_header(const cmd_params & params) { (void) params; };
610
  virtual void print_test(const test & t) = 0;
611
- virtual void print_footer() { };
612
  };
613
 
614
  struct csv_printer : public printer {
@@ -766,9 +815,6 @@ struct markdown_printer : public printer {
766
  if (params.mul_mat_q.size() > 1 || params.mul_mat_q != cmd_params_defaults.mul_mat_q) {
767
  fields.push_back("mul_mat_q");
768
  }
769
- if (params.low_vram.size() > 1 || params.low_vram != cmd_params_defaults.low_vram) {
770
- fields.push_back("low_vram");
771
- }
772
  if (params.tensor_split.size() > 1 || params.tensor_split != cmd_params_defaults.tensor_split) {
773
  fields.push_back("tensor_split");
774
  }
@@ -889,21 +935,27 @@ struct sql_printer : public printer {
889
  static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_batch, int n_threads) {
890
  std::vector<llama_token> tokens(n_batch, llama_token_bos(ctx));
891
  int n_processed = 0;
 
 
 
892
  while (n_processed < n_prompt) {
893
  int n_tokens = std::min(n_prompt - n_processed, n_batch);
894
- llama_eval(ctx, tokens.data(), n_tokens, n_past + n_processed, n_threads);
895
  n_processed += n_tokens;
896
  }
897
  }
898
 
899
  static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads) {
900
  llama_token token = llama_token_bos(ctx);
 
 
 
901
  for (int i = 0; i < n_gen; i++) {
902
- llama_eval(ctx, &token, 1, n_past + i, n_threads);
903
  }
904
  }
905
 
906
- static void llama_null_log_callback(enum llama_log_level level, const char * text, void * user_data) {
907
  (void) level;
908
  (void) text;
909
  (void) user_data;
@@ -958,17 +1010,25 @@ int main(int argc, char ** argv) {
958
 
959
  std::vector<cmd_params_instance> params_instances = get_cmd_params_instances(params);
960
 
 
 
 
961
  for (const auto & inst : params_instances) {
962
- // TODO: keep the model between tests when possible
963
- llama_context_params lparams = inst.to_llama_params();
 
 
 
964
 
965
- llama_model * lmodel = llama_load_model_from_file(inst.model.c_str(), lparams);
966
- if (lmodel == NULL) {
967
- fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, inst.model.c_str());
968
- return 1;
 
 
969
  }
970
 
971
- llama_context * ctx = llama_new_context_with_model(lmodel, lparams);
972
  if (ctx == NULL) {
973
  fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, inst.model.c_str());
974
  llama_free_model(lmodel);
@@ -977,6 +1037,8 @@ int main(int argc, char ** argv) {
977
 
978
  test t(inst, lmodel, ctx);
979
 
 
 
980
  // warmup run
981
  if (t.n_prompt > 0) {
982
  test_prompt(ctx, std::min(2, t.n_batch), 0, t.n_batch, t.n_threads);
@@ -986,6 +1048,8 @@ int main(int argc, char ** argv) {
986
  }
987
 
988
  for (int i = 0; i < params.reps; i++) {
 
 
989
  uint64_t t_start = get_time_ns();
990
  if (t.n_prompt > 0) {
991
  test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
@@ -1002,9 +1066,10 @@ int main(int argc, char ** argv) {
1002
  llama_print_timings(ctx);
1003
 
1004
  llama_free(ctx);
1005
- llama_free_model(lmodel);
1006
  }
1007
 
 
 
1008
  p->print_footer();
1009
 
1010
  llama_backend_free();
 
132
  std::vector<int> n_gpu_layers;
133
  std::vector<int> main_gpu;
134
  std::vector<bool> mul_mat_q;
 
135
  std::vector<std::array<float, LLAMA_MAX_DEVICES>> tensor_split;
136
  int reps;
137
  bool verbose;
 
148
  /* n_gpu_layers */ {99},
149
  /* main_gpu */ {0},
150
  /* mul_mat_q */ {true},
 
151
  /* tensor_split */ {{}},
152
  /* reps */ 5,
153
  /* verbose */ false,
 
165
  printf(" -b, --batch-size <n> (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
166
  printf(" --memory-f32 <0|1> (default: %s)\n", join(cmd_params_defaults.f32_kv, ",").c_str());
167
  printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
168
+ printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
169
+ printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
 
170
  printf(" -mmq, --mul-mat-q <0|1> (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str());
171
  printf(" -ts, --tensor_split <ts0/ts1/..> \n");
172
  printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
 
252
  break;
253
  }
254
  params.main_gpu = split<int>(argv[i], split_delim);
 
 
 
 
 
 
 
255
  } else if (arg == "-mmq" || arg == "--mul-mat-q") {
256
  if (++i >= argc) {
257
  invalid_param = true;
 
326
  if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; }
327
  if (params.main_gpu.empty()) { params.main_gpu = cmd_params_defaults.main_gpu; }
328
  if (params.mul_mat_q.empty()) { params.mul_mat_q = cmd_params_defaults.mul_mat_q; }
 
329
  if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; }
330
  if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; }
331
 
 
342
  int n_gpu_layers;
343
  int main_gpu;
344
  bool mul_mat_q;
 
345
  std::array<float, LLAMA_MAX_DEVICES> tensor_split;
346
 
347
+ llama_model_params to_llama_mparams() const {
348
+ llama_model_params mparams = llama_model_default_params();
349
+
350
+ mparams.n_gpu_layers = n_gpu_layers;
351
+ mparams.main_gpu = main_gpu;
352
+ mparams.tensor_split = tensor_split.data();
353
+
354
+ return mparams;
355
+ }
356
+
357
+ bool equal_mparams(const cmd_params_instance & other) const {
358
+ return model == other.model &&
359
+ n_gpu_layers == other.n_gpu_layers &&
360
+ main_gpu == other.main_gpu &&
361
+ tensor_split == other.tensor_split;
362
+ }
363
+
364
+ llama_context_params to_llama_cparams() const {
365
+ llama_context_params cparams = llama_context_default_params();
366
+
367
+ cparams.n_ctx = n_prompt + n_gen;
368
+ cparams.n_batch = n_batch;
369
+ cparams.f16_kv = !f32_kv;
370
+ cparams.mul_mat_q = mul_mat_q;
371
 
372
+ return cparams;
373
  }
374
  };
375
 
 
377
  std::vector<cmd_params_instance> instances;
378
 
379
  for (const auto & m : params.model)
 
 
380
  for (const auto & nl : params.n_gpu_layers)
381
  for (const auto & mg : params.main_gpu)
 
 
382
  for (const auto & ts : params.tensor_split)
383
+ for (const auto & nb : params.n_batch)
384
+ for (const auto & fk : params.f32_kv)
385
+ for (const auto & mmq : params.mul_mat_q)
386
  for (const auto & nt : params.n_threads) {
387
  cmd_params_instance instance = {
388
  /* .model = */ m,
 
394
  /* .n_gpu_layers = */ nl,
395
  /* .main_gpu = */ mg,
396
  /* .mul_mat_q = */ mmq,
 
397
  /* .tensor_split = */ ts,
398
  };
399
  instances.push_back(instance);
 
404
  static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_params & params) {
405
  std::vector<cmd_params_instance> instances;
406
 
407
+ #if 1
408
+ // this ordering minimizes the number of times that each model needs to be reloaded
409
+ for (const auto & m : params.model)
410
+ for (const auto & nl : params.n_gpu_layers)
411
+ for (const auto & mg : params.main_gpu)
412
+ for (const auto & ts : params.tensor_split)
413
+ for (const auto & nb : params.n_batch)
414
+ for (const auto & fk : params.f32_kv)
415
+ for (const auto & mmq : params.mul_mat_q)
416
+ for (const auto & nt : params.n_threads) {
417
+ for (const auto & n_prompt : params.n_prompt) {
418
+ if (n_prompt == 0) {
419
+ continue;
420
+ }
421
+ cmd_params_instance instance = {
422
+ /* .model = */ m,
423
+ /* .n_prompt = */ n_prompt,
424
+ /* .n_gen = */ 0,
425
+ /* .n_batch = */ nb,
426
+ /* .f32_kv = */ fk,
427
+ /* .n_threads = */ nt,
428
+ /* .n_gpu_layers = */ nl,
429
+ /* .main_gpu = */ mg,
430
+ /* .mul_mat_q = */ mmq,
431
+ /* .tensor_split = */ ts,
432
+ };
433
+ instances.push_back(instance);
434
+ }
435
+
436
+ for (const auto & n_gen : params.n_gen) {
437
+ if (n_gen == 0) {
438
+ continue;
439
+ }
440
+ cmd_params_instance instance = {
441
+ /* .model = */ m,
442
+ /* .n_prompt = */ 0,
443
+ /* .n_gen = */ n_gen,
444
+ /* .n_batch = */ nb,
445
+ /* .f32_kv = */ fk,
446
+ /* .n_threads = */ nt,
447
+ /* .n_gpu_layers = */ nl,
448
+ /* .main_gpu = */ mg,
449
+ /* .mul_mat_q = */ mmq,
450
+ /* .tensor_split = */ ts,
451
+ };
452
+ instances.push_back(instance);
453
+ }
454
+ }
455
+ #else
456
+ // this ordering separates the prompt and generation tests
457
  for (const auto & n_prompt : params.n_prompt) {
458
  if (n_prompt == 0) {
459
  continue;
 
469
  auto instances_gen = get_cmd_params_instances_int(params, n_gen, 0);
470
  instances.insert(instances.end(), instances_gen.begin(), instances_gen.end());
471
  }
472
+ #endif
473
 
474
  return instances;
475
  }
 
494
  int n_gpu_layers;
495
  int main_gpu;
496
  bool mul_mat_q;
 
497
  std::array<float, LLAMA_MAX_DEVICES> tensor_split;
498
  int n_prompt;
499
  int n_gen;
 
513
  n_gpu_layers = inst.n_gpu_layers;
514
  main_gpu = inst.main_gpu;
515
  mul_mat_q = inst.mul_mat_q;
 
516
  tensor_split = inst.tensor_split;
517
  n_prompt = inst.n_prompt;
518
  n_gen = inst.n_gen;
 
573
  "cpu_info", "gpu_info",
574
  "model_filename", "model_type", "model_size", "model_n_params",
575
  "n_batch", "n_threads", "f16_kv",
576
+ "n_gpu_layers", "main_gpu", "mul_mat_q", "tensor_split",
577
  "n_prompt", "n_gen", "test_time",
578
  "avg_ns", "stddev_ns",
579
  "avg_ts", "stddev_ts"
 
592
  return INT;
593
  }
594
  if (field == "cuda" || field == "opencl" || field == "metal" || field == "gpu_blas" || field == "blas" ||
595
+ field == "f16_kv" || field == "mul_mat_q") {
596
  return BOOL;
597
  }
598
  if (field == "avg_ts" || field == "stddev_ts") {
 
623
  cpu_info, gpu_info,
624
  model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
625
  std::to_string(n_batch), std::to_string(n_threads), std::to_string(!f32_kv),
626
+ std::to_string(n_gpu_layers), std::to_string(main_gpu), std::to_string(mul_mat_q), tensor_split_str,
627
  std::to_string(n_prompt), std::to_string(n_gen), test_time,
628
  std::to_string(avg_ns()), std::to_string(stdev_ns()),
629
  std::to_string(avg_ts()), std::to_string(stdev_ts())
 
655
  virtual ~printer() {}
656
 
657
  FILE * fout;
658
+ virtual void print_header(const cmd_params & params) { (void) params; }
659
  virtual void print_test(const test & t) = 0;
660
+ virtual void print_footer() { }
661
  };
662
 
663
  struct csv_printer : public printer {
 
815
  if (params.mul_mat_q.size() > 1 || params.mul_mat_q != cmd_params_defaults.mul_mat_q) {
816
  fields.push_back("mul_mat_q");
817
  }
 
 
 
818
  if (params.tensor_split.size() > 1 || params.tensor_split != cmd_params_defaults.tensor_split) {
819
  fields.push_back("tensor_split");
820
  }
 
935
  static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_batch, int n_threads) {
936
  std::vector<llama_token> tokens(n_batch, llama_token_bos(ctx));
937
  int n_processed = 0;
938
+
939
+ llama_set_n_threads(ctx, n_threads, n_threads);
940
+
941
  while (n_processed < n_prompt) {
942
  int n_tokens = std::min(n_prompt - n_processed, n_batch);
943
+ llama_decode(ctx, llama_batch_get_one(tokens.data(), n_tokens, n_past + n_processed, 0));
944
  n_processed += n_tokens;
945
  }
946
  }
947
 
948
  static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads) {
949
  llama_token token = llama_token_bos(ctx);
950
+
951
+ llama_set_n_threads(ctx, n_threads, n_threads);
952
+
953
  for (int i = 0; i < n_gen; i++) {
954
+ llama_decode(ctx, llama_batch_get_one(&token, 1, n_past + i, 0));
955
  }
956
  }
957
 
958
+ static void llama_null_log_callback(enum ggml_log_level level, const char * text, void * user_data) {
959
  (void) level;
960
  (void) text;
961
  (void) user_data;
 
1010
 
1011
  std::vector<cmd_params_instance> params_instances = get_cmd_params_instances(params);
1012
 
1013
+ llama_model * lmodel = nullptr;
1014
+ const cmd_params_instance * prev_inst = nullptr;
1015
+
1016
  for (const auto & inst : params_instances) {
1017
+ // keep the same model between tests when possible
1018
+ if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) {
1019
+ if (lmodel) {
1020
+ llama_free_model(lmodel);
1021
+ }
1022
 
1023
+ lmodel = llama_load_model_from_file(inst.model.c_str(), inst.to_llama_mparams());
1024
+ if (lmodel == NULL) {
1025
+ fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, inst.model.c_str());
1026
+ return 1;
1027
+ }
1028
+ prev_inst = &inst;
1029
  }
1030
 
1031
+ llama_context * ctx = llama_new_context_with_model(lmodel, inst.to_llama_cparams());
1032
  if (ctx == NULL) {
1033
  fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, inst.model.c_str());
1034
  llama_free_model(lmodel);
 
1037
 
1038
  test t(inst, lmodel, ctx);
1039
 
1040
+ llama_kv_cache_tokens_rm(ctx, -1, -1);
1041
+
1042
  // warmup run
1043
  if (t.n_prompt > 0) {
1044
  test_prompt(ctx, std::min(2, t.n_batch), 0, t.n_batch, t.n_threads);
 
1048
  }
1049
 
1050
  for (int i = 0; i < params.reps; i++) {
1051
+ llama_kv_cache_tokens_rm(ctx, -1, -1);
1052
+
1053
  uint64_t t_start = get_time_ns();
1054
  if (t.n_prompt > 0) {
1055
  test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
 
1066
  llama_print_timings(ctx);
1067
 
1068
  llama_free(ctx);
 
1069
  }
1070
 
1071
+ llama_free_model(lmodel);
1072
+
1073
  p->print_footer();
1074
 
1075
  llama_backend_free();
examples/main/README.md CHANGED
@@ -262,7 +262,8 @@ These options help improve the performance and memory usage of the LLaMA models.
262
 
263
  ### Number of Threads
264
 
265
- - `-t N, --threads N`: Set the number of threads to use during computation. For optimal performance, it is recommended to set this value to the number of physical CPU cores your system has (as opposed to the logical number of cores). Using the correct number of threads can greatly improve performance.
 
266
 
267
  ### Mlock
268
 
@@ -305,6 +306,5 @@ These options provide extra functionality and customization when running the LLa
305
  - `-ngl N, --n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
306
  - `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
307
  - `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
308
- - `-lv, --low-vram`: Do not allocate a VRAM scratch buffer for holding temporary results. Reduces VRAM usage at the cost of performance, particularly prompt processing speed. Requires cuBLAS.
309
  - `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
310
  - `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
 
262
 
263
  ### Number of Threads
264
 
265
+ - `-t N, --threads N`: Set the number of threads to use during generation. For optimal performance, it is recommended to set this value to the number of physical CPU cores your system has (as opposed to the logical number of cores). Using the correct number of threads can greatly improve performance.
266
+ - `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. On some systems it is beneficial to use a higher number of threads during batch processing than during generation (see the example below). If not specified, the number of threads used for batch processing will be the same as the number of threads used for generation.
267
 
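For example, to use 8 threads for generation and 16 threads for prompt processing (thread counts and model path are illustrative; choose values that match your CPU):

```sh
./main -m models/7B/ggml-model-q4_0.gguf -t 8 -tb 16 -p "Once upon a time"
```
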
268
  ### Mlock
269
 
 
306
  - `-ngl N, --n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
307
  - `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
308
  - `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
 
309
  - `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
310
  - `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
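
As a combined illustration (a hypothetical two-GPU cuBLAS setup; adjust the layer count and split ratio to your hardware), the GPU-related options above might be used together like this:

```sh
./main -m models/7B/ggml-model-q4_0.gguf -ngl 35 -mg 0 -ts 3,2 -p "Once upon a time"
```

Here `-ngl 35` offloads 35 layers, `-mg 0` keeps small tensors on GPU 0, and `-ts 3,2` splits large tensors roughly 60/40 between the two GPUs.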
examples/main/main.cpp CHANGED
@@ -124,7 +124,7 @@ int main(int argc, char ** argv) {
124
  console::init(params.simple_io, params.use_color);
125
  atexit([]() { console::cleanup(); });
126
 
127
- if (params.perplexity) {
128
  printf("\n************\n");
129
  printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
130
  printf("************\n\n");
@@ -140,12 +140,17 @@ int main(int argc, char ** argv) {
140
  return 0;
141
  }
142
 
143
- if (params.rope_freq_base != 10000.0) {
144
- LOG_TEE("%s: warning: changing RoPE frequency base to %g (default 10000.0)\n", __func__, params.rope_freq_base);
 
 
 
 
 
145
  }
146
 
147
- if (params.rope_freq_scale != 1.0) {
148
- LOG_TEE("%s: warning: scaling RoPE frequency by %g (default 1.0)\n", __func__, params.rope_freq_scale);
149
  }
150
 
151
  LOG_TEE("%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
@@ -184,29 +189,19 @@ int main(int argc, char ** argv) {
184
  return 1;
185
  }
186
 
187
- const int n_ctx_train = llama_n_ctx_train(ctx);
188
- if (params.n_ctx > n_ctx_train) {
 
 
 
189
  LOG_TEE("%s: warning: model was trained on only %d context tokens (%d specified)\n",
190
- __func__, n_ctx_train, params.n_ctx);
191
- } else if (params.n_ctx < 8) {
192
- LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
193
- params.n_ctx = 8;
194
  }
195
 
196
  // print system information
197
  {
198
  LOG_TEE("\n");
199
- LOG_TEE("system_info: n_threads = %d / %d | %s\n",
200
- params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
201
- }
202
-
203
- // export the cgraph and exit
204
- if (params.export_cgraph) {
205
- llama_eval_export(ctx, "llama.ggml");
206
- llama_free(ctx);
207
- llama_free_model(model);
208
-
209
- return 0;
210
  }
211
 
212
  std::string path_session = params.path_prompt_cache;
@@ -220,7 +215,7 @@ int main(int argc, char ** argv) {
220
  if (fp != NULL) {
221
  std::fclose(fp);
222
 
223
- session_tokens.resize(params.n_ctx);
224
  size_t n_token_count_out = 0;
225
  if (!llama_load_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) {
226
  LOG_TEE("%s: error: failed to load session file '%s'\n", __func__, path_session.c_str());
@@ -235,7 +230,7 @@ int main(int argc, char ** argv) {
235
  }
236
  }
237
 
238
- const bool add_bos = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
239
  LOG("add_bos: %d\n", add_bos);
240
 
241
  std::vector<llama_token> embd_inp;
@@ -276,9 +271,6 @@ int main(int argc, char ** argv) {
276
  LOG("guidance_offset: %s", log_tostr(guidance_offset));
277
  }
278
 
279
- const int n_ctx = llama_n_ctx(ctx);
280
- LOG("n_ctx: %d\n", n_ctx);
281
-
282
  if ((int) embd_inp.size() > n_ctx - 4) {
283
  LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
284
  return 1;
@@ -475,7 +467,7 @@ int main(int argc, char ** argv) {
475
  std::vector<llama_token> embd;
476
  std::vector<llama_token> embd_guidance;
477
 
478
- const int n_vocab = llama_n_vocab(ctx);
479
 
480
  std::vector<llama_token_data> candidates;
481
  candidates.reserve(n_vocab);
@@ -508,17 +500,22 @@ int main(int argc, char ** argv) {
508
  break;
509
  }
510
 
511
- const int n_left = n_past - params.n_keep;
512
- LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d\n", n_past, n_left, n_ctx, params.n_keep);
513
 
514
- // always keep the first token - BOS
515
- n_past = std::max(1, params.n_keep);
516
- n_past_guidance = std::max(1, params.n_keep + guidance_offset);
517
 
518
- LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance);
 
 
 
519
 
520
- // insert n_left/2 tokens at the start of embd from last_tokens
521
- embd.insert(embd.begin(), last_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_tokens.end() - embd.size());
 
 
 
522
 
523
  LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd));
524
 
@@ -580,7 +577,7 @@ int main(int argc, char ** argv) {
580
 
581
  for (int i = 0; i < input_size; i += params.n_batch) {
582
  int n_eval = std::min(input_size - i, params.n_batch);
583
- if (llama_eval(ctx_guidance, input_buf + i, n_eval, n_past_guidance, params.n_threads)) {
584
  LOG_TEE("%s : failed to eval\n", __func__);
585
  return 1;
586
  }
@@ -597,7 +594,7 @@ int main(int argc, char ** argv) {
597
 
598
  LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd));
599
 
600
- if (llama_eval(ctx, &embd[i], n_eval, n_past, params.n_threads)) {
601
  LOG_TEE("%s : failed to eval\n", __func__);
602
  return 1;
603
  }
@@ -855,7 +852,7 @@ int main(int argc, char ** argv) {
855
  llama_backend_free();
856
 
857
  #ifndef LOG_DISABLE_LOGS
858
- LOG_TEE("Log end\n")
859
  #endif // LOG_DISABLE_LOGS
860
 
861
  return 0;
 
124
  console::init(params.simple_io, params.use_color);
125
  atexit([]() { console::cleanup(); });
126
 
127
+ if (params.logits_all) {
128
  printf("\n************\n");
129
  printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
130
  printf("************\n\n");
 
140
  return 0;
141
  }
142
 
143
+ if (params.n_ctx != 0 && params.n_ctx < 8) {
144
+ LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
145
+ params.n_ctx = 8;
146
+ }
147
+
148
+ if (params.rope_freq_base != 0.0) {
149
+ LOG_TEE("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
150
  }
151
 
152
+ if (params.rope_freq_scale != 0.0) {
153
+ LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
154
  }
155
 
156
  LOG_TEE("%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
 
189
  return 1;
190
  }
191
 
192
+ const int n_ctx_train = llama_n_ctx_train(model);
193
+ const int n_ctx = llama_n_ctx(ctx);
194
+ LOG("n_ctx: %d\n", n_ctx);
195
+
196
+ if (n_ctx > n_ctx_train) {
197
  LOG_TEE("%s: warning: model was trained on only %d context tokens (%d specified)\n",
198
+ __func__, n_ctx_train, n_ctx);
 
 
 
199
  }
200
 
201
  // print system information
202
  {
203
  LOG_TEE("\n");
204
+ LOG_TEE("%s\n", get_system_info(params).c_str());
 
 
 
 
 
 
 
 
 
 
205
  }
206
 
207
  std::string path_session = params.path_prompt_cache;
 
215
  if (fp != NULL) {
216
  std::fclose(fp);
217
 
218
+ session_tokens.resize(n_ctx);
219
  size_t n_token_count_out = 0;
220
  if (!llama_load_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) {
221
  LOG_TEE("%s: error: failed to load session file '%s'\n", __func__, path_session.c_str());
 
230
  }
231
  }
232
 
233
+ const bool add_bos = llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM;
234
  LOG("add_bos: %d\n", add_bos);
235
 
236
  std::vector<llama_token> embd_inp;
 
271
  LOG("guidance_offset: %s", log_tostr(guidance_offset));
272
  }
273
 
 
 
 
274
  if ((int) embd_inp.size() > n_ctx - 4) {
275
  LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
276
  return 1;
 
467
  std::vector<llama_token> embd;
468
  std::vector<llama_token> embd_guidance;
469
 
470
+ const int n_vocab = llama_n_vocab(model);
471
 
472
  std::vector<llama_token_data> candidates;
473
  candidates.reserve(n_vocab);
 
500
  break;
501
  }
502
 
503
+ const int n_left = n_past - params.n_keep - 1;
504
+ const int n_discard = n_left/2;
505
 
506
+ LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
507
+ n_past, n_left, n_ctx, params.n_keep, n_discard);
 
508
 
509
+ llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
510
+ llama_kv_cache_seq_shift(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
511
+
512
+ n_past -= n_discard;
513
 
514
+ if (ctx_guidance) {
515
+ n_past_guidance -= n_discard;
516
+ }
517
+
518
+ LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance);
519
 
520
  LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd));
521
 
 
577
 
578
  for (int i = 0; i < input_size; i += params.n_batch) {
579
  int n_eval = std::min(input_size - i, params.n_batch);
580
+ if (llama_decode(ctx_guidance, llama_batch_get_one(input_buf + i, n_eval, n_past_guidance, 0))) {
581
  LOG_TEE("%s : failed to eval\n", __func__);
582
  return 1;
583
  }
 
594
 
595
  LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd));
596
 
597
+ if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) {
598
  LOG_TEE("%s : failed to eval\n", __func__);
599
  return 1;
600
  }
 
852
  llama_backend_free();
853
 
854
  #ifndef LOG_DISABLE_LOGS
855
+ LOG_TEE("Log end\n");
856
  #endif // LOG_DISABLE_LOGS
857
 
858
  return 0;
examples/make-ggml.py CHANGED
@@ -1,22 +1,25 @@
1
  #!/usr/bin/env python3
2
  """
3
- This script converts Hugging Face llama models to GGML and quantizes them.
4
 
5
  Usage:
6
- python make-ggml.py --model {model_dir_or_hf_repo_name} [--outname {output_name} (Optional)] [--outdir {output_directory} (Optional)] [--quants {quant_types} (Optional)] [--keep_fp16 (Optional)]
7
 
8
  Arguments:
9
- - --model: (Required) The directory of the downloaded Hugging Face model or the name of the Hugging Face model repository. If the model directory does not exist, it will be downloaded from the Hugging Face model hub.
 
10
  - --outname: (Optional) The name of the output model. If not specified, the last part of the model directory path or the Hugging Face model repo name will be used.
11
  - --outdir: (Optional) The directory where the output model(s) will be stored. If not specified, '../models/{outname}' will be used.
12
  - --quants: (Optional) The types of quantization to apply. This should be a space-separated list. The default is 'Q4_K_M Q5_K_S'.
13
  - --keep_fp16: (Optional) If specified, the FP16 model will not be deleted after the quantized models are created.
14
 
15
- Quant types:
16
  - Q4_0: small, very high quality loss - legacy, prefer using Q3_K_M
17
  - Q4_1: small, substantial quality loss - legacy, prefer using Q3_K_L
18
  - Q5_0: medium, balanced quality - legacy, prefer using Q4_K_M
19
  - Q5_1: medium, low quality loss - legacy, prefer using Q5_K_M
 
 
20
  - Q2_K: smallest, extreme quality loss - not recommended
21
  - Q3_K: alias for Q3_K_M
22
  - Q3_K_S: very small, very high quality loss
@@ -40,9 +43,7 @@ import argparse
40
  import os
41
  from huggingface_hub import snapshot_download
42
 
43
- def main(model, outname, outdir, quants, keep_fp16):
44
- ggml_version = "v3"
45
-
46
  if not os.path.isdir(model):
47
  print(f"Model not found at {model}. Downloading...")
48
  try:
@@ -63,17 +64,20 @@ def main(model, outname, outdir, quants, keep_fp16):
63
  print("Building llama.cpp")
64
  subprocess.run(f"cd .. && make quantize", shell=True, check=True)
65
 
66
- fp16 = f"{outdir}/{outname}.ggml{ggml_version}.fp16.bin"
67
 
68
- print(f"Making unquantised GGML at {fp16}")
69
  if not os.path.isfile(fp16):
70
- subprocess.run(f"python3 ../convert.py {model} --outtype f16 --outfile {fp16}", shell=True, check=True)
 
 
 
71
  else:
72
  print(f"Unquantised GGML already exists at: {fp16}")
73
 
74
  print("Making quants")
75
  for type in quants:
76
- outfile = f"{outdir}/{outname}.ggml{ggml_version}.{type}.bin"
77
  print(f"Making {type} : {outfile}")
78
  subprocess.run(f"../quantize {fp16} {outfile} {type}", shell=True, check=True)
79
 
@@ -81,8 +85,9 @@ def main(model, outname, outdir, quants, keep_fp16):
81
  os.remove(fp16)
82
 
83
  if __name__ == "__main__":
84
- parser = argparse.ArgumentParser(description='Convert/Quantize HF to GGML. If you have the HF model downloaded already, pass the path to the model dir. Otherwise, pass the Hugging Face model repo name. You need to be in the /examples folder for it to work.')
85
- parser.add_argument('--model', required=True, help='Downloaded model dir or Hugging Face model repo name')
 
86
  parser.add_argument('--outname', default=None, help='Output model(s) name')
87
  parser.add_argument('--outdir', default=None, help='Output directory')
88
  parser.add_argument('--quants', nargs='*', default=["Q4_K_M", "Q5_K_S"], help='Quant types')
@@ -90,4 +95,4 @@ if __name__ == "__main__":
90
 
91
  args = parser.parse_args()
92
 
93
- main(args.model, args.outname, args.outdir, args.quants, args.keep_fp16)
 
1
  #!/usr/bin/env python3
2
  """
3
+ This script converts Hugging Face Llama, StarCoder, Falcon, Baichuan, and GPT-NeoX models to GGUF and quantizes them.
4
 
5
  Usage:
6
+ python make-ggml.py {model_dir_or_hf_repo_name} --model_type {model_type} [--outname {output_name} (Optional)] [--outdir {output_directory} (Optional)] [--quants {quant_types} (Optional)] [--keep_fp16 (Optional)]
7
 
8
  Arguments:
9
+ - model: (Required) The directory of the downloaded Hugging Face model or the name of the Hugging Face model repository. If the model directory does not exist, it will be downloaded from the Hugging Face model hub.
10
+ - --model_type: (Required) The type of the model to be converted. Choose from llama, starcoder, falcon, baichuan, or gptneox.
11
  - --outname: (Optional) The name of the output model. If not specified, the last part of the model directory path or the Hugging Face model repo name will be used.
12
  - --outdir: (Optional) The directory where the output model(s) will be stored. If not specified, '../models/{outname}' will be used.
13
  - --quants: (Optional) The types of quantization to apply. This should be a space-separated list. The default is 'Q4_K_M Q5_K_S'.
14
  - --keep_fp16: (Optional) If specified, the FP16 model will not be deleted after the quantized models are created.
15
 
16
+ Old quant types (some base model types require these):
17
  - Q4_0: small, very high quality loss - legacy, prefer using Q3_K_M
18
  - Q4_1: small, substantial quality loss - legacy, prefer using Q3_K_L
19
  - Q5_0: medium, balanced quality - legacy, prefer using Q4_K_M
20
  - Q5_1: medium, low quality loss - legacy, prefer using Q5_K_M
21
+
22
+ New quant types (recommended):
23
  - Q2_K: smallest, extreme quality loss - not recommended
24
  - Q3_K: alias for Q3_K_M
25
  - Q3_K_S: very small, very high quality loss
 
43
  import os
44
  from huggingface_hub import snapshot_download
45
 
46
+ def main(model, model_type, outname, outdir, quants, keep_fp16):
 
 
47
  if not os.path.isdir(model):
48
  print(f"Model not found at {model}. Downloading...")
49
  try:
 
64
  print("Building llama.cpp")
65
  subprocess.run(f"cd .. && make quantize", shell=True, check=True)
66
 
67
+ fp16 = f"{outdir}/{outname}.gguf.fp16.bin"
68
 
69
+ print(f"Making unquantised GGUF at {fp16}")
70
  if not os.path.isfile(fp16):
71
+ if model_type != "llama":
72
+ subprocess.run(f"python3 ../convert-{model_type}-hf-to-gguf.py {model} 1 --outfile {fp16}", shell=True, check=True)
73
+ else:
74
+ subprocess.run(f"python3 ../convert.py {model} --outtype f16 --outfile {fp16}", shell=True, check=True)
75
  else:
76
  print(f"Unquantised GGML already exists at: {fp16}")
77
 
78
  print("Making quants")
79
  for type in quants:
80
+ outfile = f"{outdir}/{outname}.gguf.{type}.bin"
81
  print(f"Making {type} : {outfile}")
82
  subprocess.run(f"../quantize {fp16} {outfile} {type}", shell=True, check=True)
83
 
 
85
  os.remove(fp16)
86
 
87
  if __name__ == "__main__":
88
+ parser = argparse.ArgumentParser(description='Convert/Quantize HF models to GGUF. If you have the HF model downloaded already, pass the path to the model dir. Otherwise, pass the Hugging Face model repo name. You need to be in the /examples folder for it to work.')
89
+ parser.add_argument('model', help='Downloaded model dir or Hugging Face model repo name')
90
+ parser.add_argument('--model_type', required=True, choices=['llama', 'starcoder', 'falcon', 'baichuan', 'gptneox'], help='Type of the model to be converted. Choose from llama, starcoder, falcon, baichuan, or gptneox.')
91
  parser.add_argument('--outname', default=None, help='Output model(s) name')
92
  parser.add_argument('--outdir', default=None, help='Output directory')
93
  parser.add_argument('--quants', nargs='*', default=["Q4_K_M", "Q5_K_S"], help='Quant types')
 
95
 
96
  args = parser.parse_args()
97
 
98
+ main(args.model, args.model_type, args.outname, args.outdir, args.quants, args.keep_fp16)
examples/parallel/CMakeLists.txt ADDED
@@ -0,0 +1,8 @@
1
+ set(TARGET parallel)
2
+ add_executable(${TARGET} parallel.cpp)
3
+ install(TARGETS ${TARGET} RUNTIME)
4
+ target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
5
+ target_compile_features(${TARGET} PRIVATE cxx_std_11)
6
+ if(TARGET BUILD_INFO)
7
+ add_dependencies(${TARGET} BUILD_INFO)
8
+ endif()
examples/parallel/README.md ADDED
@@ -0,0 +1,3 @@
1
+ # llama.cpp/example/parallel
2
+
3
+ Simplified simulation of serving incoming requests in parallel
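The implementation that follows drives every client through a single `llama_batch`, tagging each token with the client's sequence id and sharing the system prompt's KV cache across all sequences. A condensed sketch of that setup, using only the calls that appear in `parallel.cpp` below (`llama_batch_init`, `llama_decode`, `llama_kv_cache_seq_cp`):

```cpp
// Sketch only: evaluate a shared system prompt once and reuse its KV cache for all clients,
// as parallel.cpp does below. ctx and tokens_system are assumed to be set up by the caller.
#include "llama.h"
#include <vector>

static void prime_system_prompt(llama_context * ctx, const std::vector<llama_token> & tokens_system, int32_t n_clients, int32_t n_ctx) {
    llama_batch batch = llama_batch_init(n_ctx, 0);

    batch.n_tokens = (int32_t) tokens_system.size();
    for (int32_t i = 0; i < batch.n_tokens; ++i) {
        batch.token [i] = tokens_system[i];
        batch.pos   [i] = i;
        batch.seq_id[i] = 0;      // the system prompt lives in sequence 0
        batch.logits[i] = false;  // no logits needed while priming the cache
    }

    llama_decode(ctx, batch);

    // copy the cached system prompt to every client sequence so it is evaluated only once
    for (int32_t s = 1; s < n_clients; ++s) {
        llama_kv_cache_seq_cp(ctx, 0, s, 0, (int) tokens_system.size());
    }

    llama_batch_free(batch);
}
```

After this, each client only pays for its own prompt and generated tokens in the per-step batch.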
examples/parallel/parallel.cpp ADDED
@@ -0,0 +1,380 @@
1
+ // A basic application simulating a server with multiple clients.
2
+ // The clients submit requests to the server and they are processed in parallel.
3
+
4
+ #include "build-info.h"
5
+
6
+ #include "common.h"
7
+ #include "llama.h"
8
+
9
+ #include <cmath>
10
+ #include <cstdio>
11
+ #include <string>
12
+ #include <vector>
13
+
14
+ // trim whitespace from the beginning and end of a string
15
+ static std::string trim(const std::string & str) {
16
+ size_t start = 0;
17
+ size_t end = str.size();
18
+
19
+ while (start < end && isspace(str[start])) {
20
+ start += 1;
21
+ }
22
+
23
+ while (end > start && isspace(str[end - 1])) {
24
+ end -= 1;
25
+ }
26
+
27
+ return str.substr(start, end - start);
28
+ }
29
+
30
+ static std::string k_system =
31
+ R"(Transcript of a never ending dialog, where the User interacts with an Assistant.
32
+ The Assistant is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.
33
+
34
+ User: Recommend a nice restaurant in the area.
35
+ Assistant: I recommend the restaurant "The Golden Duck". It is a 5 star restaurant with a great view of the city. The food is delicious and the service is excellent. The prices are reasonable and the portions are generous. The restaurant is located at 123 Main Street, New York, NY 10001. The phone number is (212) 555-1234. The hours are Monday through Friday from 11:00 am to 10:00 pm. The restaurant is closed on Saturdays and Sundays.
36
+ User: Who is Richard Feynman?
37
+ Assistant: Richard Feynman was an American physicist who is best known for his work in quantum mechanics and particle physics. He was awarded the Nobel Prize in Physics in 1965 for his contributions to the development of quantum electrodynamics. He was a popular lecturer and author, and he wrote several books, including "Surely You're Joking, Mr. Feynman!" and "What Do You Care What Other People Think?".
38
+ User:)";
39
+
40
+ static std::vector<std::string> k_prompts = {
41
+ "What is the meaning of life?",
42
+ "Tell me an interesting fact about llamas.",
43
+ "What is the best way to cook a steak?",
44
+ "Are you familiar with the Special Theory of Relativity and can you explain it to me?",
45
+ "Recommend some interesting books to read.",
46
+ "What is the best way to learn a new language?",
47
+ "How to get a job at Google?",
48
+ "If you could have any superpower, what would it be?",
49
+ "I want to learn how to play the piano.",
50
+ };
51
+
52
+ struct client {
53
+ int32_t id = 0;
54
+
55
+ llama_seq_id seq_id = -1;
56
+
57
+ llama_token sampled;
58
+
59
+ int64_t t_start_prompt;
60
+ int64_t t_start_gen;
61
+
62
+ int32_t n_prompt = 0;
63
+ int32_t n_decoded = 0;
64
+ int32_t i_batch = -1;
65
+
66
+ std::string input;
67
+ std::string prompt;
68
+ std::string response;
69
+
70
+ std::vector<llama_token> tokens_prev;
71
+ };
72
+
73
+ int main(int argc, char ** argv) {
74
+ srand(1234);
75
+
76
+ gpt_params params;
77
+
78
+ if (gpt_params_parse(argc, argv, params) == false) {
79
+ return 1;
80
+ }
81
+
82
+ // number of simultaneous "clients" to simulate
83
+ const int32_t n_clients = params.n_parallel;
84
+
85
+ // requests to simulate
86
+ const int32_t n_seq = params.n_sequences;
87
+
88
+ // insert new requests as soon as the previous one is done
89
+ const bool cont_batching = params.cont_batching;
90
+
91
+ #ifndef LOG_DISABLE_LOGS
92
+ log_set_target(log_filename_generator("parallel", "log"));
93
+ LOG_TEE("Log start\n");
94
+ log_dump_cmdline(argc, argv);
95
+ #endif // LOG_DISABLE_LOGS
96
+
97
+ // init llama.cpp
98
+ llama_backend_init(params.numa);
99
+
100
+ llama_model * model = NULL;
101
+ llama_context * ctx = NULL;
102
+
103
+ // load the target model
104
+ params.logits_all = true;
105
+ std::tie(model, ctx) = llama_init_from_gpt_params(params);
106
+
107
+ fprintf(stderr, "\n\n");
108
+ fflush(stderr);
109
+
110
+ const int n_ctx = llama_n_ctx(ctx);
111
+ const int n_vocab = llama_n_vocab(model);
112
+
113
+ std::vector<client> clients(n_clients);
114
+ for (size_t i = 0; i < clients.size(); ++i) {
115
+ auto & client = clients[i];
116
+ client.id = i;
117
+ client.tokens_prev.resize(std::max(256, params.n_predict));
118
+ std::fill(client.tokens_prev.begin(), client.tokens_prev.end(), 0);
119
+ }
120
+
121
+ std::vector<llama_token_data> candidates;
122
+ candidates.reserve(n_vocab);
123
+
124
+ std::vector<llama_token> tokens_system;
125
+ tokens_system = ::llama_tokenize(ctx, k_system, true);
126
+ const int32_t n_tokens_system = tokens_system.size();
127
+
128
+ llama_seq_id g_seq_id = 0;
129
+
130
+ // the max batch size is as large as the context to handle cases where we get very long input prompts from multiple
131
+ // users. Regardless of the size, the main loop will chunk the batch into a maximum of params.n_batch tokens at a time
132
+ llama_batch batch = llama_batch_init(params.n_ctx, 0);
133
+
134
+ int32_t n_total_prompt = 0;
135
+ int32_t n_total_gen = 0;
136
+ int32_t n_cache_miss = 0;
137
+
138
+ const auto t_main_start = ggml_time_us();
139
+
140
+ LOG_TEE("%s: Simulating parallel requests from clients:\n", __func__);
141
+ LOG_TEE("%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system);
142
+ LOG_TEE("\n");
143
+
144
+ {
145
+ LOG_TEE("%s: Evaluating the system prompt ...\n", __func__);
146
+
147
+ batch.n_tokens = n_tokens_system;
148
+
149
+ for (int32_t i = 0; i < batch.n_tokens; ++i) {
150
+ batch.token[i] = tokens_system[i];
151
+ batch.pos[i] = i;
152
+ batch.seq_id[i] = 0;
153
+ batch.logits[i] = false;
154
+ }
155
+
156
+ if (llama_decode(ctx, batch) != 0) {
157
+ LOG_TEE("%s: llama_decode() failed\n", __func__);
158
+ return 1;
159
+ }
160
+
161
+ // assign the system KV cache to all parallel sequences
162
+ for (int32_t i = 1; i < n_clients; ++i) {
163
+ llama_kv_cache_seq_cp(ctx, 0, i, 0, n_tokens_system);
164
+ }
165
+
166
+ LOG_TEE("\n");
167
+ }
168
+
169
+ LOG_TEE("Processing requests ...\n\n");
170
+
171
+ while (true) {
172
+ batch.n_tokens = 0;
173
+
174
+ // decode any currently ongoing sequences
175
+ for (auto & client : clients) {
176
+ if (client.seq_id == -1) {
177
+ continue;
178
+ }
179
+
180
+ batch.token [batch.n_tokens] = client.sampled;
181
+ batch.pos [batch.n_tokens] = n_tokens_system + client.n_prompt + client.n_decoded;
182
+ batch.seq_id[batch.n_tokens] = client.id;
183
+ batch.logits[batch.n_tokens] = true;
184
+
185
+ client.n_decoded += 1;
186
+ client.i_batch = batch.n_tokens;
187
+
188
+ batch.n_tokens += 1;
189
+ }
190
+
191
+ if (batch.n_tokens == 0) {
192
+ // all sequences have ended - clear the entire KV cache
193
+ for (int i = 0; i < n_clients; ++i) {
194
+ llama_kv_cache_seq_rm(ctx, i, n_tokens_system, -1);
195
+ }
196
+
197
+ LOG_TEE("%s: clearing the KV cache\n", __func__);
198
+ }
199
+
200
+ // insert new sequences for decoding
201
+ if (cont_batching || batch.n_tokens == 0) {
202
+ for (auto & client : clients) {
203
+ if (client.seq_id == -1 && g_seq_id < n_seq) {
204
+ client.seq_id = g_seq_id;
205
+
206
+ client.t_start_prompt = ggml_time_us();
207
+ client.t_start_gen = 0;
208
+
209
+ client.input = k_prompts[rand() % k_prompts.size()];
210
+ client.prompt = client.input + "\nAssistant:";
211
+ client.response = "";
212
+
213
+ std::fill(client.tokens_prev.begin(), client.tokens_prev.end(), 0);
214
+
215
+ // do not prepend BOS because we have a system prompt!
216
+ std::vector<llama_token> tokens_prompt;
217
+ tokens_prompt = ::llama_tokenize(ctx, client.prompt, false);
218
+
219
+ for (size_t i = 0; i < tokens_prompt.size(); ++i) {
220
+ batch.token [batch.n_tokens] = tokens_prompt[i];
221
+ batch.pos [batch.n_tokens] = i + n_tokens_system;
222
+ batch.seq_id[batch.n_tokens] = client.id;
223
+ batch.logits[batch.n_tokens] = false;
224
+ batch.n_tokens += 1;
225
+ }
226
+
227
+ // extract the logits only for the last token
228
+ if (batch.n_tokens > 0) {
229
+ batch.logits[batch.n_tokens - 1] = true;
230
+ }
231
+
232
+ client.n_prompt = tokens_prompt.size();
233
+ client.n_decoded = 0;
234
+ client.i_batch = batch.n_tokens - 1;
235
+
236
+ LOG_TEE("\033[1mClient %3d, seq %4d, started decoding ...\033[0m\n", client.id, client.seq_id);
237
+
238
+ g_seq_id += 1;
239
+
240
+ // insert new requests one-by-one
241
+ //if (cont_batching) {
242
+ // break;
243
+ //}
244
+ }
245
+ }
246
+ }
247
+
248
+ if (batch.n_tokens == 0) {
249
+ break;
250
+ }
251
+
252
+ // process in chunks of params.n_batch
253
+ int32_t n_batch = params.n_batch;
254
+
255
+ for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
256
+ // experiment: process in powers of 2
257
+ //if (i + n_batch > (int32_t) batch.n_tokens && n_batch > 32) {
258
+ // n_batch /= 2;
259
+ // i -= n_batch;
260
+ // continue;
261
+ //}
262
+
263
+ const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
264
+
265
+ llama_batch batch_view = {
266
+ n_tokens,
267
+ batch.token + i,
268
+ nullptr,
269
+ batch.pos + i,
270
+ batch.seq_id + i,
271
+ batch.logits + i,
272
+ 0, 0, 0, // unused
273
+ };
274
+
275
+ const int ret = llama_decode(ctx, batch_view);
276
+ if (ret != 0) {
277
+ if (n_batch == 1 || ret < 0) {
278
+ // if you get here, it means the KV cache is full - try increasing it via the context size
279
+ LOG_TEE("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret);
280
+ return 1;
281
+ }
282
+
283
+ LOG("%s : failed to decode the batch, retrying with n_batch = %d\n", __func__, n_batch / 2);
284
+
285
+ n_cache_miss += 1;
286
+
287
+ // retry with half the batch size to try to find a free slot in the KV cache
288
+ n_batch /= 2;
289
+ i -= n_batch;
290
+
291
+ continue;
292
+ }
293
+
294
+ LOG("%s : decoded batch of %d tokens\n", __func__, n_tokens);
295
+
296
+ for (auto & client : clients) {
297
+ if (client.i_batch < (int) i || client.i_batch >= (int) (i + n_tokens)) {
298
+ continue;
299
+ }
300
+
301
+ //printf("client %d, seq %d, token %d, pos %d, batch %d\n",
302
+ // client.id, client.seq_id, client.sampled, client.n_decoded, client.i_batch);
303
+
304
+ const llama_token id = llama_sample_token(ctx, NULL, NULL, params, client.tokens_prev, candidates, client.i_batch - i);
305
+
306
+ if (client.n_decoded == 1) {
307
+ // start measuring generation time after the first token to make sure all concurrent clients
308
+ // have their prompt already processed
309
+ client.t_start_gen = ggml_time_us();
310
+ }
311
+
312
+ // remember which tokens were sampled - used for repetition penalties during sampling
313
+ client.tokens_prev.erase(client.tokens_prev.begin());
314
+ client.tokens_prev.push_back(id);
315
+
316
+ const std::string token_str = llama_token_to_piece(ctx, id);
317
+ client.response += token_str;
318
+ client.sampled = id;
319
+
320
+ //printf("client %d, seq %d, token %d, pos %d, batch %d: %s\n",
321
+ // client.id, client.seq_id, id, client.n_decoded, client.i_batch, token_str.c_str());
322
+
323
+ if (client.n_decoded > 2 &&
324
+ (id == llama_token_eos(ctx) ||
325
+ (params.n_predict > 0 && client.n_decoded + client.n_prompt >= params.n_predict) ||
326
+ client.response.find("User:") != std::string::npos ||
327
+ client.response.find('\n') != std::string::npos)) {
328
+ // basic reverse prompt
329
+ const size_t pos = client.response.find("User:");
330
+ if (pos != std::string::npos) {
331
+ client.response = client.response.substr(0, pos);
332
+ }
333
+
334
+ // delete only the generated part of the sequence, i.e. keep the system prompt in the cache
335
+ llama_kv_cache_seq_rm(ctx, client.id, n_tokens_system, n_ctx);
336
+
337
+ const auto t_main_end = ggml_time_us();
338
+
339
+ LOG_TEE("\033[1mClient %3d, seq %4d, prompt %4d t, response %4d t, time %5.2f s, speed %5.2f t/s, cache miss %d \033[0m \n\nInput: %s\nResponse: %s\n\n",
340
+ client.id, client.seq_id, client.n_prompt, client.n_decoded,
341
+ (t_main_end - client.t_start_prompt) / 1e6,
342
+ (double) (client.n_prompt + client.n_decoded) / (t_main_end - client.t_start_prompt) * 1e6,
343
+ n_cache_miss,
344
+ ::trim(client.input).c_str(),
345
+ ::trim(client.response).c_str());
346
+
347
+ n_total_prompt += client.n_prompt;
348
+ n_total_gen += client.n_decoded;
349
+
350
+ client.seq_id = -1;
351
+ }
352
+
353
+ client.i_batch = -1;
354
+ }
355
+ }
356
+ }
357
+
358
+ const auto t_main_end = ggml_time_us();
359
+
360
+ LOG_TEE("\n\n");
361
+ LOG_TEE("Total prompt tokens: %6d, speed: %5.2f t/s\n", n_total_prompt, (double) (n_total_prompt ) / (t_main_end - t_main_start) * 1e6);
362
+ LOG_TEE("Total gen tokens: %6d, speed: %5.2f t/s\n", n_total_gen, (double) (n_total_gen ) / (t_main_end - t_main_start) * 1e6);
363
+ LOG_TEE("Total speed (AVG): %6s speed: %5.2f t/s\n", "", (double) (n_total_prompt + n_total_gen) / (t_main_end - t_main_start) * 1e6);
364
+ LOG_TEE("Cache misses: %6d\n", n_cache_miss);
365
+
366
+ LOG_TEE("\n\n");
367
+
368
+ llama_print_timings(ctx);
369
+
370
+ llama_batch_free(batch);
371
+
372
+ llama_free(ctx);
373
+ llama_free_model(model);
374
+
375
+ llama_backend_free();
376
+
377
+ fprintf(stderr, "\n\n");
378
+
379
+ return 0;
380
+ }
examples/perplexity/README.md CHANGED
@@ -1,3 +1,21 @@
1
  # perplexity
2
 
3
  TODO
1
  # perplexity
2
 
3
  TODO
4
+
5
+ ## Llama 2 70B Scorechart
6
+ Quantization | Model size (GiB) | Perplexity | Delta to fp16
7
+ -- | -- | -- | --
8
+ Q4_0 | 36.20 | 3.5550 | 3.61%
9
+ Q4_1 | 40.20 | 3.5125 | 2.37%
10
+ Q5_0 | 44.20 | 3.4744 | 1.26%
11
+ Q2_K | 27.27 | 3.7339 | 8.82%
12
+ Q3_K_S | 27.86 | 3.7019 | 7.89%
13
+ Q3_K_M | 30.83 | 3.5932 | 4.72%
14
+ Q3_K_L | 33.67 | 3.5617 | 3.80%
15
+ Q4_K_S | 36.39 | 3.4852 | 1.57%
16
+ Q4_K_M | 38.54 | 3.4725 | 1.20%
17
+ Q5_K_S | 44.20 | 3.4483 | 0.50%
18
+ Q5_K_M | 45.41 | 3.4451 | 0.40%
19
+ Q6_K | 52.70 | 3.4367 | 0.16%
20
+ fp16 | 128.5 | 3.4313 | -
21
+
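For reference, the perplexity reported in the table (and accumulated chunk by chunk in `perplexity.cpp` below) is the exponential of the mean negative log-likelihood over the scored tokens:

$$\mathrm{PPL} = \exp\left(-\frac{1}{N}\sum_{i=1}^{N}\log p(x_i \mid x_{<i})\right)$$

The "Delta to fp16" column is the relative increase over the fp16 baseline, e.g. (3.5550 - 3.4313) / 3.4313 ≈ 3.61% for Q4_0.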
examples/perplexity/perplexity.cpp CHANGED
@@ -80,7 +80,9 @@ static void write_logfile(
80
  static std::vector<float> softmax(const std::vector<float>& logits) {
81
  std::vector<float> probs(logits.size());
82
  float max_logit = logits[0];
83
- for (float v : logits) max_logit = std::max(max_logit, v);
 
 
84
  double sum_exp = 0.0;
85
  for (size_t i = 0; i < logits.size(); i++) {
86
  // Subtract the maximum logit value from the current logit value for numerical stability
@@ -89,15 +91,21 @@ static std::vector<float> softmax(const std::vector<float>& logits) {
89
  sum_exp += exp_logit;
90
  probs[i] = exp_logit;
91
  }
92
- for (size_t i = 0; i < probs.size(); i++) probs[i] /= sum_exp;
 
 
93
  return probs;
94
  }
95
 
96
  static results_log_softmax log_softmax(int n_vocab, const float * logits, int tok) {
97
  float max_logit = logits[0];
98
- for (int i = 1; i < n_vocab; ++i) max_logit = std::max(max_logit, logits[i]);
 
 
99
  double sum_exp = 0.0;
100
- for (int i = 0; i < n_vocab; ++i) sum_exp += expf(logits[i] - max_logit);
 
 
101
  return {logits[tok] - max_logit - log(sum_exp), logits[tok], expf(logits[tok] - max_logit) / (float) sum_exp};
102
  }
103
 
@@ -108,7 +116,8 @@ static void process_logits(
108
  std::mutex mutex;
109
  int counter = 0;
110
  auto compute = [&mutex, &counter, &nll, &nll2, logit_history, prob_history, n_vocab, logits, tokens, n_token] () {
111
- double local_nll = 0, local_nll2 = 0;
 
112
  while (true) {
113
  std::unique_lock<std::mutex> lock(mutex);
114
  int i = counter++;
@@ -126,10 +135,13 @@ static void process_logits(
126
  prob_history[i] = results.prob;
127
  }
128
  };
129
- for (auto & w : workers) w = std::thread(compute);
 
 
130
  compute();
131
- for (auto & w : workers) w.join();
132
-
 
133
  }
134
 
135
  static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & params) {
@@ -138,22 +150,24 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
138
  // Output: `perplexity: 13.5106 [114/114]`
139
  // BOS tokens will be added for each chunk before eval
140
 
141
- const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
142
  const bool add_bos = is_spm;
143
 
144
  fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
145
 
146
  std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
147
 
148
- if (int(tokens.size()) < 2*params.n_ctx) {
149
- fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*params.n_ctx,
150
- params.n_ctx);
 
 
151
  fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
152
  return {std::move(tokens), 0., {}, {}};
153
  }
154
 
155
- std::vector<float> logit_history;
156
- std::vector<float> prob_history;
157
 
158
  logit_history.resize(tokens.size());
159
  prob_history.resize(tokens.size());
@@ -163,20 +177,20 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
163
  return {tokens, -1, logit_history, prob_history};
164
  }
165
 
166
- const int calc_chunk = params.n_ctx;
167
 
168
  fprintf(stderr, "%s: have %zu tokens. Calculation chunk = %d\n", __func__, tokens.size(), calc_chunk);
169
 
170
  if (int(tokens.size()) <= calc_chunk) {
171
  fprintf(stderr, "%s: there are only %zu tokens, this is not enough for a context size of %d and stride %d\n",__func__,
172
- tokens.size(), params.n_ctx, params.ppl_stride);
173
  return {tokens, -1, logit_history, prob_history};
174
  }
175
 
176
  const int n_chunk_max = (tokens.size() - calc_chunk + params.ppl_stride - 1) / params.ppl_stride;
177
 
178
  const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
179
- const int n_vocab = llama_n_vocab(ctx);
180
  const int n_batch = params.n_batch;
181
 
182
  int count = 0;
@@ -195,12 +209,15 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
195
 
196
  const auto t_start = std::chrono::high_resolution_clock::now();
197
 
 
 
 
198
  for (int j = 0; j < num_batches; ++j) {
199
  const int batch_start = start + j * n_batch;
200
  const int batch_size = std::min(end - batch_start, n_batch);
201
 
202
  //fprintf(stderr, " Batch %d: starts at %d, size is %d, n_past is %d\n",j,batch_start,batch_size,j * n_batch);
203
- if (llama_eval(ctx, tokens.data() + batch_start, batch_size, j * n_batch, params.n_threads)) {
204
  //fprintf(stderr, "%s : failed to eval\n", __func__);
205
  return {tokens, -1, logit_history, prob_history};
206
  }
@@ -235,7 +252,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
235
  }
236
 
237
  //fprintf(stderr, "%s: using tokens %d...%d\n",__func__,params.n_ctx - params.ppl_stride + start, params.n_ctx + start);
238
- for (int j = params.n_ctx - params.ppl_stride - 1; j < params.n_ctx - 1; ++j) {
239
 
240
  // Calculate probability of next token, given the previous ones.
241
  const std::vector<float> tok_logits(
@@ -272,8 +289,9 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
272
  // Output: `perplexity: 13.5106 [114/114]`
273
  // BOS tokens will be added for each chunk before eval
274
 
275
- const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
276
  const bool add_bos = is_spm;
 
277
 
278
  auto tim1 = std::chrono::high_resolution_clock::now();
279
  fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
@@ -283,9 +301,9 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
283
  auto tim2 = std::chrono::high_resolution_clock::now();
284
  fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
285
 
286
- if (int(tokens.size()) < 2*params.n_ctx) {
287
- fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*params.n_ctx,
288
- params.n_ctx);
289
  fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
290
  return {std::move(tokens), 0., {}, {}};
291
  }
@@ -296,10 +314,10 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
296
  std::vector<float> prob_history;
297
  prob_history.resize(tokens.size());
298
 
299
- const int n_chunk_max = tokens.size() / params.n_ctx;
300
 
301
  const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
302
- const int n_vocab = llama_n_vocab(ctx);
303
  const int n_batch = params.n_batch;
304
 
305
  int count = 0;
@@ -311,15 +329,18 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
311
  std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
312
 
313
  for (int i = 0; i < n_chunk; ++i) {
314
- const int start = i * params.n_ctx;
315
- const int end = start + params.n_ctx;
316
 
317
- const int num_batches = (params.n_ctx + n_batch - 1) / n_batch;
318
 
319
  std::vector<float> logits;
320
 
321
  const auto t_start = std::chrono::high_resolution_clock::now();
322
 
 
 
 
323
  for (int j = 0; j < num_batches; ++j) {
324
  const int batch_start = start + j * n_batch;
325
  const int batch_size = std::min(end - batch_start, n_batch);
@@ -332,7 +353,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
332
  tokens[batch_start] = llama_token_bos(ctx);
333
  }
334
 
335
- if (llama_eval(ctx, tokens.data() + batch_start, batch_size, j * n_batch, params.n_threads)) {
336
  fprintf(stderr, "%s : failed to eval\n", __func__);
337
  return {tokens, -1, logit_history, prob_history};
338
  }
@@ -340,7 +361,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
340
  // restore the original token in case it was set to BOS
341
  tokens[batch_start] = token_org;
342
 
343
- const auto batch_logits = llama_get_logits(ctx);
344
  logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
345
  }
346
 
@@ -369,10 +390,10 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
369
  // Example, we have a context window of 512, we will compute perplexity for each of the
370
  // last 256 tokens. Then, we split the input up into context window size chunks to
371
  // process the entire prompt.
372
- const int first = params.n_ctx/2;
373
- process_logits(n_vocab, logits.data() + first*n_vocab, tokens.data() + start + first, params.n_ctx - 1 - first,
374
  workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
375
- count += params.n_ctx - first - 1;
376
 
377
  // perplexity is e^(average negative log-likelihood)
378
  if (params.ppl_output_type == 0) {
@@ -381,7 +402,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
381
  double av = nll/count;
382
  double av2 = nll2/count - av*av;
383
  if (av2 > 0) av2 = sqrt(av2/(count-1));
384
- printf("%8d %.4lf %4lf %4lf\n", i*params.n_ctx, std::exp(nll / count), av, av2);
385
  }
386
  fflush(stdout);
387
  }
@@ -402,7 +423,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
402
  }
403
 
404
  static std::vector<float> hellaswag_evaluate_tokens(
405
- llama_context * ctx, const std::vector<int>& tokens, int n_past, int n_batch, int n_vocab, int n_thread
406
  ) {
407
  std::vector<float> result;
408
  result.reserve(tokens.size() * n_vocab);
@@ -410,7 +431,7 @@ static std::vector<float> hellaswag_evaluate_tokens(
410
  for (size_t i_chunk = 0; i_chunk < n_chunk; ++i_chunk) {
411
  size_t n_tokens = tokens.size() - i_chunk * n_batch;
412
  n_tokens = std::min(n_tokens, size_t(n_batch));
413
- if (llama_eval(ctx, tokens.data() + i_chunk * n_batch, n_tokens, n_past, n_thread)) {
414
  fprintf(stderr, "%s : failed to eval\n", __func__);
415
  return {};
416
  }
@@ -457,7 +478,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
457
  size_t hs_task_count = prompt_lines.size()/6;
458
  fprintf(stderr, "%s : loaded %zu tasks from prompt.\n", __func__, hs_task_count);
459
 
460
- const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
461
  fprintf(stderr, "================================= is_spm = %d\n", is_spm);
462
 
463
  // This is needed as usual for LLaMA models
@@ -512,7 +533,8 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
512
  printf("\ntask\tacc_norm\n");
513
 
514
  double acc = 0.0f;
515
- const int n_vocab = llama_n_vocab(ctx);
 
516
 
517
  std::vector<std::vector<int>> ending_tokens(4);
518
 
@@ -540,7 +562,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
540
  auto query_size = query_embd.size();
541
 
542
  // Stop if query wont fit the ctx window
543
- if (query_size > (size_t)params.n_ctx) {
544
  fprintf(stderr, "%s : number of tokens in query %zu > n_ctxl\n", __func__, query_size);
545
  return;
546
  }
@@ -550,7 +572,10 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
550
  query_embd.resize(32);
551
  }
552
 
553
- auto logits = hellaswag_evaluate_tokens(ctx, query_embd, 0, params.n_batch, n_vocab, params.n_threads);
 
 
 
554
  if (logits.empty()) {
555
  fprintf(stderr, "%s : failed to eval\n", __func__);
556
  return;
@@ -587,7 +612,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
587
  query_size = query_embd.size();
588
 
589
  // Stop if query wont fit the ctx window
590
- if (context_size + query_size > (size_t)params.n_ctx) {
591
  fprintf(stderr, "%s : number of tokens in query %zu > n_ctxl\n", __func__, query_size);
592
  return;
593
  }
@@ -599,7 +624,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
599
  //}
600
 
601
  // Evaluate the query
602
- logits = hellaswag_evaluate_tokens(ctx, query_embd, context_size, params.n_batch, n_vocab, params.n_threads);
603
  if (logits.empty()) {
604
  fprintf(stderr, "%s : failed to eval\n", __func__);
605
  return;
@@ -661,7 +686,7 @@ int main(int argc, char ** argv) {
661
  return 1;
662
  }
663
 
664
- params.perplexity = true;
665
  params.n_batch = std::min(params.n_batch, params.n_ctx);
666
 
667
  if (params.ppl_stride > 0) {
@@ -695,7 +720,7 @@ int main(int argc, char ** argv) {
695
  return 1;
696
  }
697
 
698
- const int n_ctx_train = llama_n_ctx_train(ctx);
699
  if (params.n_ctx > n_ctx_train) {
700
  fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
701
  __func__, n_ctx_train, params.n_ctx);
@@ -704,8 +729,7 @@ int main(int argc, char ** argv) {
704
  // print system information
705
  {
706
  fprintf(stderr, "\n");
707
- fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
708
- params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
709
  }
710
 
711
  struct results_perplexity results;
 
80
  static std::vector<float> softmax(const std::vector<float>& logits) {
81
  std::vector<float> probs(logits.size());
82
  float max_logit = logits[0];
83
+ for (float v : logits) {
84
+ max_logit = std::max(max_logit, v);
85
+ }
86
  double sum_exp = 0.0;
87
  for (size_t i = 0; i < logits.size(); i++) {
88
  // Subtract the maximum logit value from the current logit value for numerical stability
 
91
  sum_exp += exp_logit;
92
  probs[i] = exp_logit;
93
  }
94
+ for (size_t i = 0; i < probs.size(); i++) {
95
+ probs[i] /= sum_exp;
96
+ }
97
  return probs;
98
  }
99
 
100
  static results_log_softmax log_softmax(int n_vocab, const float * logits, int tok) {
101
  float max_logit = logits[0];
102
+ for (int i = 1; i < n_vocab; ++i) {
103
+ max_logit = std::max(max_logit, logits[i]);
104
+ }
105
  double sum_exp = 0.0;
106
+ for (int i = 0; i < n_vocab; ++i) {
107
+ sum_exp += expf(logits[i] - max_logit);
108
+ }
109
  return {logits[tok] - max_logit - log(sum_exp), logits[tok], expf(logits[tok] - max_logit) / (float) sum_exp};
110
  }
111
 
 
116
  std::mutex mutex;
117
  int counter = 0;
118
  auto compute = [&mutex, &counter, &nll, &nll2, logit_history, prob_history, n_vocab, logits, tokens, n_token] () {
119
+ double local_nll = 0;
120
+ double local_nll2 = 0;
121
  while (true) {
122
  std::unique_lock<std::mutex> lock(mutex);
123
  int i = counter++;
 
135
  prob_history[i] = results.prob;
136
  }
137
  };
138
+ for (auto & w : workers) {
139
+ w = std::thread(compute);
140
+ }
141
  compute();
142
+ for (auto & w : workers) {
143
+ w.join();
144
+ }
145
  }
146
 
147
  static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & params) {
 
150
  // Output: `perplexity: 13.5106 [114/114]`
151
  // BOS tokens will be added for each chunk before eval
152
 
153
+ const bool is_spm = llama_vocab_type(llama_get_model(ctx)) == LLAMA_VOCAB_TYPE_SPM;
154
  const bool add_bos = is_spm;
155
 
156
  fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
157
 
158
  std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
159
 
160
+ const int n_ctx = llama_n_ctx(ctx);
161
+
162
+ if (int(tokens.size()) < 2*n_ctx) {
163
+ fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx,
164
+ n_ctx);
165
  fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
166
  return {std::move(tokens), 0., {}, {}};
167
  }
168
 
169
+ std::vector<float> logit_history;
170
+ std::vector<float> prob_history;
171
 
172
  logit_history.resize(tokens.size());
173
  prob_history.resize(tokens.size());
 
177
  return {tokens, -1, logit_history, prob_history};
178
  }
179
 
180
+ const int calc_chunk = n_ctx;
181
 
182
  fprintf(stderr, "%s: have %zu tokens. Calculation chunk = %d\n", __func__, tokens.size(), calc_chunk);
183
 
184
  if (int(tokens.size()) <= calc_chunk) {
185
  fprintf(stderr, "%s: there are only %zu tokens, this is not enough for a context size of %d and stride %d\n",__func__,
186
+ tokens.size(), n_ctx, params.ppl_stride);
187
  return {tokens, -1, logit_history, prob_history};
188
  }
189
 
190
  const int n_chunk_max = (tokens.size() - calc_chunk + params.ppl_stride - 1) / params.ppl_stride;
191
 
192
  const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
193
+ const int n_vocab = llama_n_vocab(llama_get_model(ctx));
194
  const int n_batch = params.n_batch;
195
 
196
  int count = 0;
 
209
 
210
  const auto t_start = std::chrono::high_resolution_clock::now();
211
 
212
+ // clear the KV cache
213
+ llama_kv_cache_tokens_rm(ctx, -1, -1);
214
+
215
  for (int j = 0; j < num_batches; ++j) {
216
  const int batch_start = start + j * n_batch;
217
  const int batch_size = std::min(end - batch_start, n_batch);
218
 
219
  //fprintf(stderr, " Batch %d: starts at %d, size is %d, n_past is %d\n",j,batch_start,batch_size,j * n_batch);
220
+ if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
221
  //fprintf(stderr, "%s : failed to eval\n", __func__);
222
  return {tokens, -1, logit_history, prob_history};
223
  }
 
252
  }
253
 
254
  //fprintf(stderr, "%s: using tokens %d...%d\n",__func__,params.n_ctx - params.ppl_stride + start, params.n_ctx + start);
255
+ for (int j = n_ctx - params.ppl_stride - 1; j < n_ctx - 1; ++j) {
256
 
257
  // Calculate probability of next token, given the previous ones.
258
  const std::vector<float> tok_logits(
 
289
  // Output: `perplexity: 13.5106 [114/114]`
290
  // BOS tokens will be added for each chunk before eval
291
 
292
+ const bool is_spm = llama_vocab_type(llama_get_model(ctx)) == LLAMA_VOCAB_TYPE_SPM;
293
  const bool add_bos = is_spm;
294
+ const int n_ctx = llama_n_ctx(ctx);
295
 
296
  auto tim1 = std::chrono::high_resolution_clock::now();
297
  fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
 
301
  auto tim2 = std::chrono::high_resolution_clock::now();
302
  fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
303
 
304
+ if (int(tokens.size()) < 2*n_ctx) {
305
+ fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx,
306
+ n_ctx);
307
  fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
308
  return {std::move(tokens), 0., {}, {}};
309
  }
 
314
  std::vector<float> prob_history;
315
  prob_history.resize(tokens.size());
316
 
317
+ const int n_chunk_max = tokens.size() / n_ctx;
318
 
319
  const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
320
+ const int n_vocab = llama_n_vocab(llama_get_model(ctx));
321
  const int n_batch = params.n_batch;
322
 
323
  int count = 0;
 
329
  std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
330
 
331
  for (int i = 0; i < n_chunk; ++i) {
332
+ const int start = i * n_ctx;
333
+ const int end = start + n_ctx;
334
 
335
+ const int num_batches = (n_ctx + n_batch - 1) / n_batch;
336
 
337
  std::vector<float> logits;
338
 
339
  const auto t_start = std::chrono::high_resolution_clock::now();
340
 
341
+ // clear the KV cache
342
+ llama_kv_cache_tokens_rm(ctx, -1, -1);
343
+
344
  for (int j = 0; j < num_batches; ++j) {
345
  const int batch_start = start + j * n_batch;
346
  const int batch_size = std::min(end - batch_start, n_batch);
 
353
  tokens[batch_start] = llama_token_bos(ctx);
354
  }
355
 
356
+ if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
357
  fprintf(stderr, "%s : failed to eval\n", __func__);
358
  return {tokens, -1, logit_history, prob_history};
359
  }
 
361
  // restore the original token in case it was set to BOS
362
  tokens[batch_start] = token_org;
363
 
364
+ const auto * batch_logits = llama_get_logits(ctx);
365
  logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
366
  }
367
 
 
390
  // Example, we have a context window of 512, we will compute perplexity for each of the
391
  // last 256 tokens. Then, we split the input up into context window size chunks to
392
  // process the entire prompt.
393
+ const int first = n_ctx/2;
394
+ process_logits(n_vocab, logits.data() + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
395
  workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
396
+ count += n_ctx - first - 1;
397
 
398
  // perplexity is e^(average negative log-likelihood)
399
  if (params.ppl_output_type == 0) {
 
402
  double av = nll/count;
403
  double av2 = nll2/count - av*av;
404
  if (av2 > 0) av2 = sqrt(av2/(count-1));
405
+ printf("%8d %.4lf %4lf %4lf\n", i*n_ctx, std::exp(nll / count), av, av2);
406
  }
407
  fflush(stdout);
408
  }
 
423
  }
424
 
425
  static std::vector<float> hellaswag_evaluate_tokens(
426
+ llama_context * ctx, std::vector<int> & tokens, int n_past, int n_batch, int n_vocab
427
  ) {
428
  std::vector<float> result;
429
  result.reserve(tokens.size() * n_vocab);
 
431
  for (size_t i_chunk = 0; i_chunk < n_chunk; ++i_chunk) {
432
  size_t n_tokens = tokens.size() - i_chunk * n_batch;
433
  n_tokens = std::min(n_tokens, size_t(n_batch));
434
+ if (llama_decode(ctx, llama_batch_get_one(tokens.data() + i_chunk * n_batch, n_tokens, n_past, 0))) {
435
  fprintf(stderr, "%s : failed to eval\n", __func__);
436
  return {};
437
  }
 
478
  size_t hs_task_count = prompt_lines.size()/6;
479
  fprintf(stderr, "%s : loaded %zu tasks from prompt.\n", __func__, hs_task_count);
480
 
481
+ const bool is_spm = llama_vocab_type(llama_get_model(ctx)) == LLAMA_VOCAB_TYPE_SPM;
482
  fprintf(stderr, "================================= is_spm = %d\n", is_spm);
483
 
484
  // This is needed as usual for LLaMA models
 
533
  printf("\ntask\tacc_norm\n");
534
 
535
  double acc = 0.0f;
536
+ const int n_vocab = llama_n_vocab(llama_get_model(ctx));
537
+ const int n_ctx = llama_n_ctx(ctx);
538
 
539
  std::vector<std::vector<int>> ending_tokens(4);
540
 
 
562
  auto query_size = query_embd.size();
563
 
564
  // Stop if query wont fit the ctx window
565
+ if (query_size > (size_t)n_ctx) {
566
  fprintf(stderr, "%s : number of tokens in query %zu > n_ctxl\n", __func__, query_size);
567
  return;
568
  }
 
572
  query_embd.resize(32);
573
  }
574
 
575
+ // clear the KV cache
576
+ llama_kv_cache_tokens_rm(ctx, -1, -1);
577
+
578
+ auto logits = hellaswag_evaluate_tokens(ctx, query_embd, 0, params.n_batch, n_vocab);
579
  if (logits.empty()) {
580
  fprintf(stderr, "%s : failed to eval\n", __func__);
581
  return;
 
612
  query_size = query_embd.size();
613
 
614
  // Stop if query wont fit the ctx window
615
+ if (context_size + query_size > (size_t)n_ctx) {
616
  fprintf(stderr, "%s : number of tokens in query %zu > n_ctxl\n", __func__, query_size);
617
  return;
618
  }
 
624
  //}
625
 
626
  // Evaluate the query
627
+ logits = hellaswag_evaluate_tokens(ctx, query_embd, context_size, params.n_batch, n_vocab);
628
  if (logits.empty()) {
629
  fprintf(stderr, "%s : failed to eval\n", __func__);
630
  return;
 
686
  return 1;
687
  }
688
 
689
+ params.logits_all = true;
690
  params.n_batch = std::min(params.n_batch, params.n_ctx);
691
 
692
  if (params.ppl_stride > 0) {
 
720
  return 1;
721
  }
722
 
723
+ const int n_ctx_train = llama_n_ctx_train(model);
724
  if (params.n_ctx > n_ctx_train) {
725
  fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
726
  __func__, n_ctx_train, params.n_ctx);
 
729
  // print system information
730
  {
731
  fprintf(stderr, "\n");
732
+ fprintf(stderr, "%s\n", get_system_info(params).c_str());
 
733
  }
734
 
735
  struct results_perplexity results;
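The recurring change in this file is the move from `llama_eval` to `llama_decode` with `llama_batch_get_one`, together with an explicit KV-cache reset before each independent chunk. A minimal sketch of that evaluation loop, assuming the same API the hunks above use and a context created with `logits_all` enabled:

```cpp
// Sketch only: score one independent chunk of tokens and collect per-token logits,
// following the pattern used by perplexity.cpp above.
#include "llama.h"
#include <algorithm>
#include <vector>

static std::vector<float> eval_chunk(llama_context * ctx, std::vector<llama_token> & tokens, int n_batch, int n_vocab) {
    std::vector<float> logits;
    logits.reserve(tokens.size() * n_vocab);

    // each chunk is scored from scratch, so previous cache entries must not leak in
    llama_kv_cache_tokens_rm(ctx, -1, -1);

    for (int i = 0; i < (int) tokens.size(); i += n_batch) {
        const int n_eval = std::min((int) tokens.size() - i, n_batch);

        if (llama_decode(ctx, llama_batch_get_one(tokens.data() + i, n_eval, i, 0))) {
            return {}; // decode failed (e.g. the KV cache is full)
        }

        const float * batch_logits = llama_get_logits(ctx);
        logits.insert(logits.end(), batch_logits, batch_logits + (size_t) n_eval * n_vocab);
    }

    return logits;
}
```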
examples/quantize-stats/quantize-stats.cpp CHANGED
@@ -309,21 +309,22 @@ int main(int argc, char ** argv) {
309
  llama_context * ctx;
310
 
311
  {
312
- auto lparams = llama_context_default_params();
 
313
 
314
- lparams.n_ctx = 256;
315
- lparams.seed = 1;
316
- lparams.f16_kv = false;
317
- lparams.use_mlock = false;
318
-
319
- model = llama_load_model_from_file(params.model.c_str(), lparams);
320
 
321
  if (model == NULL) {
322
  fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
323
  return 1;
324
  }
325
 
326
- ctx = llama_new_context_with_model(model, lparams);
 
 
 
 
 
327
 
328
  if (ctx == NULL) {
329
  fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
 
309
  llama_context * ctx;
310
 
311
  {
312
+ auto mparams = llama_model_default_params();
313
+ mparams.use_mlock = false;
314
 
315
+ model = llama_load_model_from_file(params.model.c_str(), mparams);
 
 
 
 
 
316
 
317
  if (model == NULL) {
318
  fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
319
  return 1;
320
  }
321
 
322
+ auto cparams = llama_context_default_params();
323
+ cparams.n_ctx = 256;
324
+ cparams.seed = 1;
325
+ cparams.f16_kv = false;
326
+
327
+ ctx = llama_new_context_with_model(model, cparams);
328
 
329
  if (ctx == NULL) {
330
  fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
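The hunk above reflects the split of the old all-in-one `llama_context_default_params` into separate model parameters and context parameters. A minimal sketch of the new two-step initialization, using only calls that appear in this diff; the model path is a placeholder:

```cpp
// Sketch only: the split model/context initialization used by quantize-stats above.
// "ggml-model.gguf" is a placeholder path, not a file shipped with the repo.
#include "llama.h"
#include <cstdio>

int main() {
    llama_backend_init(false);

    auto mparams = llama_model_default_params();
    mparams.use_mlock = false;                  // model-level options go here now

    llama_model * model = llama_load_model_from_file("ggml-model.gguf", mparams);
    if (model == NULL) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    auto cparams = llama_context_default_params();
    cparams.n_ctx  = 256;                       // context-level options stay here
    cparams.seed   = 1;
    cparams.f16_kv = false;

    llama_context * ctx = llama_new_context_with_model(model, cparams);
    if (ctx == NULL) {
        fprintf(stderr, "failed to create context\n");
        llama_free_model(model);
        return 1;
    }

    // ... run whatever needs the context ...

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}
```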
examples/quantize/README.md CHANGED
@@ -1,3 +1,44 @@
1
  # quantize
2
 
3
  TODO
1
  # quantize
2
 
3
  TODO
4
+
5
+ ## Llama 2 7B
6
+
7
+ Quantization | Bits per Weight (BPW)
8
+ -- | --
9
+ Q2_K | 3.35
10
+ Q3_K_S | 3.50
11
+ Q3_K_M | 3.91
12
+ Q3_K_L | 4.27
13
+ Q4_K_S | 4.58
14
+ Q4_K_M | 4.84
15
+ Q5_K_S | 5.52
16
+ Q5_K_M | 5.68
17
+ Q6_K | 6.56
18
+
19
+ ## Llama 2 13B
20
+ Quantization | Bits per Weight (BPW)
21
+ -- | --
22
+ Q2_K | 3.34
23
+ Q3_K_S | 3.48
24
+ Q3_K_M | 3.89
25
+ Q3_K_L | 4.26
26
+ Q4_K_S | 4.56
27
+ Q4_K_M | 4.83
28
+ Q5_K_S | 5.51
29
+ Q5_K_M | 5.67
30
+ Q6_K | 6.56
31
+
32
+ ## Llama 2 70B
33
+
34
+ Quantization | Bits per Weight (BPW)
35
+ -- | --
36
+ Q2_K | 3.40
37
+ Q3_K_S | 3.47
38
+ Q3_K_M | 3.85
39
+ Q3_K_L | 4.19
40
+ Q4_K_S | 4.53
41
+ Q4_K_M | 4.80
42
+ Q5_K_S | 5.50
43
+ Q5_K_M | 5.65
44
+ Q6_K | 6.56
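Bits per weight (BPW) is the on-disk storage cost per model parameter, so the table can be sanity-checked against file sizes:

    BPW ≈ 8 × (model file size in bytes) / (number of parameters)

As a rough worked example, taking ~6.74 billion parameters for the 7B model as an assumption, Q4_K_M at 4.84 BPW works out to about 4.84 × 6.74×10⁹ / 8 ≈ 4.1 GB on disk.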
examples/quantize/quantize.cpp CHANGED
@@ -72,6 +72,7 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
72
  // usage:
73
  // ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
74
  //
 
75
  static void usage(const char * executable) {
76
  printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
77
  printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
 
72
  // usage:
73
  // ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
74
  //
75
+ [[noreturn]]
76
  static void usage(const char * executable) {
77
  printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
78
  printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
examples/save-load-state/save-load-state.cpp CHANGED
@@ -23,23 +23,17 @@ int main(int argc, char ** argv) {
23
  params.n_predict = 16;
24
  }
25
 
26
- auto lparams = llama_context_default_params();
27
-
28
- lparams.n_ctx = params.n_ctx;
29
- lparams.seed = params.seed;
30
- lparams.f16_kv = params.memory_f16;
31
- lparams.use_mmap = params.use_mmap;
32
- lparams.use_mlock = params.use_mlock;
33
-
34
  auto n_past = 0;
35
  auto last_n_tokens_data = std::vector<llama_token>(params.repeat_last_n, 0);
36
 
37
  // init
38
- auto model = llama_load_model_from_file(params.model.c_str(), lparams);
 
 
 
39
  if (model == nullptr) {
40
  return 1;
41
  }
42
- auto ctx = llama_new_context_with_model(model, lparams);
43
  if (ctx == nullptr) {
44
  llama_free_model(model);
45
  return 1;
@@ -54,7 +48,7 @@ int main(int argc, char ** argv) {
54
  }
55
 
56
  // evaluate prompt
57
- llama_eval(ctx, tokens.data(), n_prompt_tokens, n_past, params.n_threads);
58
 
59
  last_n_tokens_data.insert(last_n_tokens_data.end(), tokens.data(), tokens.data() + n_prompt_tokens);
60
  n_past += n_prompt_tokens;
@@ -78,8 +72,8 @@ int main(int argc, char ** argv) {
78
  printf("\n%s", params.prompt.c_str());
79
 
80
  for (auto i = 0; i < params.n_predict; i++) {
81
- auto logits = llama_get_logits(ctx);
82
- auto n_vocab = llama_n_vocab(ctx);
83
  std::vector<llama_token_data> candidates;
84
  candidates.reserve(n_vocab);
85
  for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
@@ -91,7 +85,7 @@ int main(int argc, char ** argv) {
91
  last_n_tokens_data.push_back(next_token);
92
 
93
  printf("%s", next_token_str.c_str());
94
- if (llama_eval(ctx, &next_token, 1, n_past, params.n_threads)) {
95
  fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
96
  llama_free(ctx);
97
  llama_free_model(model);
@@ -106,7 +100,7 @@ int main(int argc, char ** argv) {
106
  llama_free(ctx);
107
 
108
  // make new context
109
- auto ctx2 = llama_new_context_with_model(model, lparams);
110
 
111
  // Load state (rng, logits, embedding and kv_cache) from file
112
  {
@@ -138,8 +132,8 @@ int main(int argc, char ** argv) {
138
 
139
  // second run
140
  for (auto i = 0; i < params.n_predict; i++) {
141
- auto logits = llama_get_logits(ctx2);
142
- auto n_vocab = llama_n_vocab(ctx2);
143
  std::vector<llama_token_data> candidates;
144
  candidates.reserve(n_vocab);
145
  for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
@@ -151,7 +145,7 @@ int main(int argc, char ** argv) {
151
  last_n_tokens_data.push_back(next_token);
152
 
153
  printf("%s", next_token_str.c_str());
154
- if (llama_eval(ctx2, &next_token, 1, n_past, params.n_threads)) {
155
  fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
156
  llama_free(ctx2);
157
  llama_free_model(model);
 
23
  params.n_predict = 16;
24
  }
25
 
 
 
 
 
 
 
 
 
26
  auto n_past = 0;
27
  auto last_n_tokens_data = std::vector<llama_token>(params.repeat_last_n, 0);
28
 
29
  // init
30
+ llama_model * model;
31
+ llama_context * ctx;
32
+
33
+ std::tie(model, ctx) = llama_init_from_gpt_params( params );
34
  if (model == nullptr) {
35
  return 1;
36
  }
 
37
  if (ctx == nullptr) {
38
  llama_free_model(model);
39
  return 1;
 
48
  }
49
 
50
  // evaluate prompt
51
+ llama_decode(ctx, llama_batch_get_one(tokens.data(), n_prompt_tokens, n_past, 0));
52
 
53
  last_n_tokens_data.insert(last_n_tokens_data.end(), tokens.data(), tokens.data() + n_prompt_tokens);
54
  n_past += n_prompt_tokens;
 
72
  printf("\n%s", params.prompt.c_str());
73
 
74
  for (auto i = 0; i < params.n_predict; i++) {
75
+ auto * logits = llama_get_logits(ctx);
76
+ auto n_vocab = llama_n_vocab(model);
77
  std::vector<llama_token_data> candidates;
78
  candidates.reserve(n_vocab);
79
  for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
 
85
  last_n_tokens_data.push_back(next_token);
86
 
87
  printf("%s", next_token_str.c_str());
88
+ if (llama_decode(ctx, llama_batch_get_one(&next_token, 1, n_past, 0))) {
89
  fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
90
  llama_free(ctx);
91
  llama_free_model(model);
 
100
  llama_free(ctx);
101
 
102
  // make new context
103
+ auto * ctx2 = llama_new_context_with_model(model, llama_context_params_from_gpt_params(params));
104
 
105
  // Load state (rng, logits, embedding and kv_cache) from file
106
  {
 
132
 
133
  // second run
134
  for (auto i = 0; i < params.n_predict; i++) {
135
+ auto * logits = llama_get_logits(ctx2);
136
+ auto n_vocab = llama_n_vocab(model);
137
  std::vector<llama_token_data> candidates;
138
  candidates.reserve(n_vocab);
139
  for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
 
145
  last_n_tokens_data.push_back(next_token);
146
 
147
  printf("%s", next_token_str.c_str());
148
+ if (llama_decode(ctx2, llama_batch_get_one(&next_token, 1, n_past, 0))) {
149
  fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
150
  llama_free(ctx2);
151
  llama_free_model(model);
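The diff above only touches decoding; the actual state round-trip mentioned in the "Load state (rng, logits, embedding and kv_cache) from file" comment goes through llama.cpp's state-serialization calls. A minimal sketch of that round-trip, assuming `llama_get_state_size`, `llama_copy_state_data` and `llama_set_state_data` behave as in this revision of the library (the file-I/O step is omitted):

```cpp
// Sketch only: serialize one context's state and restore it into a second context
// created from the same model, as save-load-state.cpp does via a temporary file.
#include "llama.h"
#include <cstddef>
#include <cstdint>
#include <vector>

static bool copy_state(llama_context * src, llama_context * dst) {
    // size of the serialized state: rng, logits, embeddings and the KV cache
    const size_t n_state = llama_get_state_size(src);

    std::vector<uint8_t> buf(n_state);
    llama_copy_state_data(src, buf.data());                       // save out of the first context

    const size_t n_read = llama_set_state_data(dst, buf.data());  // restore into the second
    return n_read <= n_state;
}
```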