update for quantization
imatrix
record of change
q5_k_m
bf16
iq2_xs
iq3_m
iq3_s
iq3_xs
iq3_xxs
iq4_nl
iq4_xs
q3_k_l
q3_k_m
q3_k_s
q4_k_m
q4_k_s
q5_k_s
q6_k
q8_0
removing safetensors
Temporarily remove LFS tracking for salamandra_header.png
Remove problematic salamandra_header.png from the repository
Fully remove salamandra_header.png from cache and LFS tracking
Track large files with Git LFS
Remove salamandra_header.png from LFS tracking
Add salamandra_header.png to LFS
update git attributes
Ensure all LFS-tracked model files are added
removing safetensors
removing duplicate data
removing duplicate data
lfs the imatrix
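
The logs below record each conversion but not the commands that produced them. Here is a minimal sketch of the workflow they imply, assuming llama.cpp build 3906 with its llama-quantize tool on PATH (the binary name and flag spelling are assumptions; only the logs themselves are part of this commit):

```bash
# Quantize the bf16 GGUF into each variant listed above, reusing the
# committed importance matrix; one log file per quantization type.
for TYPE in IQ2_XS IQ3_XXS IQ3_XS IQ3_S IQ3_M IQ4_NL IQ4_XS \
            Q3_K_S Q3_K_M Q3_K_L Q4_K_S Q4_K_M Q5_K_S Q5_K_M Q6_K Q8_0; do
  ./llama-quantize --imatrix imatrix/oscar/imatrix.dat \
    salamandra-2b_bf16.gguf "salamandra-2b_${TYPE}.gguf" "$TYPE" \
    > "${TYPE}_log.txt" 2>&1
done
```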
Note: this view is limited to 50 files because the commit contains too many changes, so the file list below is partial.
- .gitattributes +24 -1
- .gitignore +191 -0
- IQ2_XS_log.txt +339 -0
- IQ3_M_log.txt +339 -0
- IQ3_S_log.txt +339 -0
- IQ3_XS_log.txt +339 -0
- IQ3_XXS_log.txt +339 -0
- IQ4_NL_log.txt +266 -0
- IQ4_XS_log.txt +339 -0
- Q3_K_L_log.txt +339 -0
- Q3_K_M_log.txt +339 -0
- Q3_K_S_log.txt +339 -0
- Q4_K_M_log.txt +339 -0
- Q4_K_S_log.txt +339 -0
- Q5_K_M_log.txt +339 -0
- Q5_K_S_log.txt +339 -0
- Q6_K_log.txt +339 -0
- Q8_0_log.txt +266 -0
- README.md +54 -1
- bf16_log.txt +245 -0
- git_snapshot.txt +3 -0
- model.safetensors → imatrix/oscar/imatrix-dataset.txt +2 -2
- tokenizer.model → imatrix/oscar/imatrix.dat +2 -2
- imatrix/oscar/langs/bg.txt +3 -0
- imatrix/oscar/langs/ca.txt +3 -0
- imatrix/oscar/langs/cs.txt +3 -0
- imatrix/oscar/langs/cy.txt +3 -0
- imatrix/oscar/langs/da.txt +3 -0
- imatrix/oscar/langs/de.txt +3 -0
- imatrix/oscar/langs/el.txt +3 -0
- imatrix/oscar/langs/en.txt +3 -0
- imatrix/oscar/langs/es.txt +3 -0
- imatrix/oscar/langs/et.txt +3 -0
- imatrix/oscar/langs/eu.txt +3 -0
- imatrix/oscar/langs/fi.txt +3 -0
- imatrix/oscar/langs/fr.txt +3 -0
- imatrix/oscar/langs/ga.txt +3 -0
- imatrix/oscar/langs/gl.txt +3 -0
- imatrix/oscar/langs/hr.txt +3 -0
- imatrix/oscar/langs/hu.txt +3 -0
- imatrix/oscar/langs/it.txt +3 -0
- imatrix/oscar/langs/lt.txt +3 -0
- imatrix/oscar/langs/lv.txt +3 -0
- imatrix/oscar/langs/mt.txt +3 -0
- imatrix/oscar/langs/nl.txt +3 -0
- imatrix/oscar/langs/nn.txt +3 -0
- imatrix/oscar/langs/no.txt +3 -0
- imatrix/oscar/langs/oc.txt +3 -0
- imatrix/oscar/langs/pl.txt +3 -0
- imatrix/oscar/langs/pt.txt +3 -0
.gitattributes
CHANGED
@@ -33,5 +33,28 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+salamandra-2b_IQ4_XS.gguf filter=lfs diff=lfs merge=lfs -text
+salamandra-2b_Q3_K_M.gguf filter=lfs diff=lfs merge=lfs -text
+salamandra-2b_Q4_K_S.gguf filter=lfs diff=lfs merge=lfs -text
+salamandra-2b_Q5_K_S.gguf filter=lfs diff=lfs merge=lfs -text
+salamandra-2b_IQ3_S.gguf filter=lfs diff=lfs merge=lfs -text
+salamandra-2b_Q3_K_L.gguf filter=lfs diff=lfs merge=lfs -text
+salamandra-2b_Q3_K_S.gguf filter=lfs diff=lfs merge=lfs -text
+salamandra-2b_Q4_K_M.gguf filter=lfs diff=lfs merge=lfs -text
+salamandra-2b_Q5_K_M.gguf filter=lfs diff=lfs merge=lfs -text
+salamandra-2b_Q6_K.gguf filter=lfs diff=lfs merge=lfs -text
+salamandra-2b_IQ3_M.gguf filter=lfs diff=lfs merge=lfs -text
+salamandra-2b_IQ3_XS.gguf filter=lfs diff=lfs merge=lfs -text
+salamandra-2b_Q8_0.gguf filter=lfs diff=lfs merge=lfs -text
+salamandra-2b_bf16.gguf filter=lfs diff=lfs merge=lfs -text
+salamandra-2b_IQ2_XS.gguf filter=lfs diff=lfs merge=lfs -text
+salamandra-2b_IQ3_XXS.gguf filter=lfs diff=lfs merge=lfs -text
+salamandra-2b_IQ4_NL.gguf filter=lfs diff=lfs merge=lfs -text
+imatrix/oscar/imatrix-dataset.txt filter=lfs diff=lfs merge=lfs -text
+imatrix/oscar/langs/bg.txt filter=lfs diff=lfs merge=lfs -text
+imatrix/oscar/langs/el.txt filter=lfs diff=lfs merge=lfs -text
+imatrix/oscar/langs/ru.txt filter=lfs diff=lfs merge=lfs -text
+imatrix/oscar/langs/uk.txt filter=lfs diff=lfs merge=lfs -text
+imatrix/oscar/langs/*.txt filter=lfs diff=lfs merge=lfs -text
 images/salamandra_header.png filter=lfs diff=lfs merge=lfs -text
-
+imatrix/oscar/imatrix.dat filter=lfs diff=lfs merge=lfs -text
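The per-file patterns added above are the lines `git lfs track` appends to .gitattributes. A sketch of the tracking commands that would produce them, assuming the standard git-lfs CLI (the actual invocations are not recorded in the commit; the one-pattern-per-file lines suggest one call per .gguf):

```bash
# Each invocation appends one pattern line to .gitattributes.
for f in salamandra-2b_*.gguf; do git lfs track "$f"; done
git lfs track "imatrix/oscar/imatrix-dataset.txt"
git lfs track "imatrix/oscar/langs/*.txt"
git lfs track "imatrix/oscar/imatrix.dat"
git add .gitattributes
```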
.gitignore
ADDED
@@ -0,0 +1,191 @@
+.aider*
+
+# General
+.DS_Store
+.AppleDouble
+.LSOverride
+
+# Icon must end with two \r
+Icon
+
+# Thumbnails
+._*
+
+# Files that might appear in the root of a volume
+.DocumentRevisions-V100
+.fseventsd
+.Spotlight-V100
+.TemporaryItems
+.Trashes
+.VolumeIcon.icns
+.com.apple.timemachine.donotpresent
+
+# Directories potentially created on remote AFP share
+.AppleDB
+.AppleDesktop
+Network Trash Folder
+Temporary Items
+.apdisk
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+.pdm.toml
+.pdm-python
+.pdm-build/
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
IQ2_XS_log.txt
ADDED
@@ -0,0 +1,339 @@
+main: build = 3906 (7eee341b)
+main: built with Apple clang version 15.0.0 (clang-1500.3.9.4) for arm64-apple-darwin23.6.0
+main: quantizing 'salamandra-2b_bf16.gguf' to './salamandra-2b_IQ2_XS.gguf' as IQ2_XS
+llama_model_loader: loaded meta data with 29 key-value pairs and 219 tensors from salamandra-2b_bf16.gguf (version GGUF V3 (latest))
+llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+llama_model_loader: - kv 0: general.architecture str = llama
+llama_model_loader: - kv 1: general.type str = model
+llama_model_loader: - kv 2: general.size_label str = 2.3B
+llama_model_loader: - kv 3: general.license str = apache-2.0
+llama_model_loader: - kv 4: general.tags arr[str,1] = ["text-generation"]
+llama_model_loader: - kv 5: general.languages arr[str,36] = ["bg", "ca", "code", "cs", "cy", "da"...
+llama_model_loader: - kv 6: llama.block_count u32 = 24
+llama_model_loader: - kv 7: llama.context_length u32 = 8192
+llama_model_loader: - kv 8: llama.embedding_length u32 = 2048
+llama_model_loader: - kv 9: llama.feed_forward_length u32 = 5440
+llama_model_loader: - kv 10: llama.attention.head_count u32 = 16
+llama_model_loader: - kv 11: llama.attention.head_count_kv u32 = 16
+llama_model_loader: - kv 12: llama.rope.freq_base f32 = 10000.000000
+llama_model_loader: - kv 13: llama.attention.layer_norm_rms_epsilon f32 = 0.000010
+llama_model_loader: - kv 14: general.file_type u32 = 32
+llama_model_loader: - kv 15: llama.vocab_size u32 = 256000
+llama_model_loader: - kv 16: llama.rope.dimension_count u32 = 128
+llama_model_loader: - kv 17: tokenizer.ggml.add_space_prefix bool = true
+llama_model_loader: - kv 18: tokenizer.ggml.model str = llama
+llama_model_loader: - kv 19: tokenizer.ggml.pre str = default
+llama_model_loader: - kv 20: tokenizer.ggml.tokens arr[str,256000] = ["<unk>", "<s>", "</s>", "<pad>", "<|...
+llama_model_loader: - kv 21: tokenizer.ggml.scores arr[f32,256000] = [-1000.000000, -1000.000000, -1000.00...
+llama_model_loader: - kv 22: tokenizer.ggml.token_type arr[i32,256000] = [3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...
+llama_model_loader: - kv 23: tokenizer.ggml.bos_token_id u32 = 1
+llama_model_loader: - kv 24: tokenizer.ggml.eos_token_id u32 = 2
+llama_model_loader: - kv 25: tokenizer.ggml.unknown_token_id u32 = 0
+llama_model_loader: - kv 26: tokenizer.ggml.add_bos_token bool = true
+llama_model_loader: - kv 27: tokenizer.ggml.add_eos_token bool = false
+llama_model_loader: - kv 28: general.quantization_version u32 = 2
+llama_model_loader: - type f32: 49 tensors
+llama_model_loader: - type bf16: 170 tensors
+================================ Have weights data with 168 entries
+[ 1/ 219] output.weight - [ 2048, 256000, 1, 1], type = bf16, size = 1000.000 MB
+[ 2/ 219] token_embd.weight - [ 2048, 256000, 1, 1], type = bf16,
+====== llama_model_quantize_internal: did not find weights for token_embd.weight
+converting to q2_K .. load_imatrix: imatrix dataset='./imatrix/oscar/imatrix-dataset.txt'
+load_imatrix: loaded 168 importance matrix entries from imatrix/oscar/imatrix.dat computed on 44176 chunks
+prepare_imatrix: have 168 importance matrix entries
+size = 1000.00 MiB -> 164.06 MiB
+[ 3/ 219] blk.0.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
+[ 4/ 219] blk.0.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
+
+llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q2_K - using fallback quantization iq4_nl
+converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
+[ 5/ 219] blk.0.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq2_xs .. size = 21.25 MiB -> 3.07 MiB
+[ 6/ 219] blk.0.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq2_xs .. size = 21.25 MiB -> 3.07 MiB
+[ 7/ 219] blk.0.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
+[ 8/ 219] blk.0.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_xs .. size = 8.00 MiB -> 1.16 MiB
+[ 9/ 219] blk.0.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_xs .. size = 8.00 MiB -> 1.16 MiB
+[ 10/ 219] blk.0.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_xs .. size = 8.00 MiB -> 1.16 MiB
+[ 11/ 219] blk.0.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB
+[ 12/ 219] blk.1.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
+[ 13/ 219] blk.1.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
+
+llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q2_K - using fallback quantization iq4_nl
+converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
+[ 14/ 219] blk.1.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq2_xs .. size = 21.25 MiB -> 3.07 MiB
+[ 15/ 219] blk.1.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq2_xs .. size = 21.25 MiB -> 3.07 MiB
+[ 16/ 219] blk.1.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
+[ 17/ 219] blk.1.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_xs .. size = 8.00 MiB -> 1.16 MiB
+[ 18/ 219] blk.1.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_xs .. size = 8.00 MiB -> 1.16 MiB
+[ 19/ 219] blk.1.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_xs .. size = 8.00 MiB -> 1.16 MiB
+[ 20/ 219] blk.1.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB
+[ 21/ 219] blk.10.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
+[ 22/ 219] blk.10.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
+
+llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q2_K - using fallback quantization iq4_nl
+converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
+[ 23/ 219] blk.10.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq2_xs .. size = 21.25 MiB -> 3.07 MiB
+[ 24/ 219] blk.10.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq2_xs .. size = 21.25 MiB -> 3.07 MiB
+[ 25/ 219] blk.10.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
+[ 26/ 219] blk.10.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_xs .. size = 8.00 MiB -> 1.16 MiB
+[ 27/ 219] blk.10.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_xs .. size = 8.00 MiB -> 1.16 MiB
+[ 28/ 219] blk.10.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_xs .. size = 8.00 MiB -> 1.16 MiB
+[ 29/ 219] blk.10.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB
+[ 30/ 219] blk.11.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
+[ 31/ 219] blk.11.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
+
+llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq2_xs - using fallback quantization iq4_nl
+converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
+[ 32/ 219] blk.11.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq2_xs .. size = 21.25 MiB -> 3.07 MiB
+[ 33/ 219] blk.11.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq2_xs .. size = 21.25 MiB -> 3.07 MiB
+[ 34/ 219] blk.11.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
+[ 35/ 219] blk.11.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_xs .. size = 8.00 MiB -> 1.16 MiB
+[ 36/ 219] blk.11.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_xs .. size = 8.00 MiB -> 1.16 MiB
+[ 37/ 219] blk.11.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_xs .. size = 8.00 MiB -> 1.16 MiB
+[ 38/ 219] blk.11.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB
+[ 39/ 219] blk.12.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
+[ 40/ 219] blk.12.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
+
+llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq2_xs - using fallback quantization iq4_nl
+converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
+[ 41/ 219] blk.12.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq2_xs .. size = 21.25 MiB -> 3.07 MiB
+[ 42/ 219] blk.12.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq2_xs .. size = 21.25 MiB -> 3.07 MiB
+[ 43/ 219] blk.12.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
+[ 44/ 219] blk.12.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_xs .. size = 8.00 MiB -> 1.16 MiB
+[ 45/ 219] blk.12.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_xs .. size = 8.00 MiB -> 1.16 MiB
+[ 46/ 219] blk.12.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_xs .. size = 8.00 MiB -> 1.16 MiB
+[ 47/ 219] blk.12.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB
+[ 48/ 219] blk.13.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
+[ 49/ 219] blk.13.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
+
+llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq2_xs - using fallback quantization iq4_nl
+converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
+[ 50/ 219] blk.13.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq2_xs .. size = 21.25 MiB -> 3.07 MiB
+[ 51/ 219] blk.13.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq2_xs .. size = 21.25 MiB -> 3.07 MiB
+[ 52/ 219] blk.13.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
+[ 53/ 219] blk.13.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_xs .. size = 8.00 MiB -> 1.16 MiB
+[ 54/ 219] blk.13.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_xs .. size = 8.00 MiB -> 1.16 MiB
+[ 55/ 219] blk.13.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_xs .. size = 8.00 MiB -> 1.16 MiB
+[ 56/ 219] blk.13.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB
+[ 57/ 219] blk.14.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
+[ 58/ 219] blk.14.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
+
+llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq2_xs - using fallback quantization iq4_nl
+converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
+[ 59/ 219] blk.14.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq2_xs .. size = 21.25 MiB -> 3.07 MiB
+[ 60/ 219] blk.14.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq2_xs .. size = 21.25 MiB -> 3.07 MiB
+[ 61/ 219] blk.14.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
+[ 62/ 219] blk.14.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_xs .. size = 8.00 MiB -> 1.16 MiB
+[ 63/ 219] blk.14.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_xs .. size = 8.00 MiB -> 1.16 MiB
+[ 64/ 219] blk.14.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_xs .. size = 8.00 MiB -> 1.16 MiB
+[ 65/ 219] blk.14.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB
+[ 66/ 219] blk.15.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
+[ 67/ 219] blk.15.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
+
+llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq2_xs - using fallback quantization iq4_nl
+converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
+[ 68/ 219] blk.15.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq2_xs .. size = 21.25 MiB -> 3.07 MiB
+[ 69/ 219] blk.15.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq2_xs .. size = 21.25 MiB -> 3.07 MiB
+[ 70/ 219] blk.15.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
+[ 71/ 219] blk.15.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_xs .. size = 8.00 MiB -> 1.16 MiB
+[ 72/ 219] blk.15.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_xs .. size = 8.00 MiB -> 1.16 MiB
+[ 73/ 219] blk.15.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_xs .. size = 8.00 MiB -> 1.16 MiB
+[ 74/ 219] blk.15.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB
+[ 75/ 219] blk.16.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
+[ 76/ 219] blk.16.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
+
+llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq2_xs - using fallback quantization iq4_nl
+converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
+[ 77/ 219] blk.16.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq2_xs .. size = 21.25 MiB -> 3.07 MiB
+[ 78/ 219] blk.16.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq2_xs .. size = 21.25 MiB -> 3.07 MiB
+[ 79/ 219] blk.16.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
+[ 80/ 219] blk.16.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_xs .. size = 8.00 MiB -> 1.16 MiB
+[ 81/ 219] blk.16.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_xs .. size = 8.00 MiB -> 1.16 MiB
+[ 82/ 219] blk.16.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_xs .. size = 8.00 MiB -> 1.16 MiB
+[ 83/ 219] blk.16.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB
+[ 84/ 219] blk.17.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
+[ 85/ 219] blk.17.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
+
+llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq2_xs - using fallback quantization iq4_nl
+converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
+[ 86/ 219] blk.17.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq2_xs .. size = 21.25 MiB -> 3.07 MiB
+[ 87/ 219] blk.17.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq2_xs .. size = 21.25 MiB -> 3.07 MiB
+[ 88/ 219] blk.17.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
+[ 89/ 219] blk.17.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_xs .. size = 8.00 MiB -> 1.16 MiB
+[ 90/ 219] blk.17.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_xs .. size = 8.00 MiB -> 1.16 MiB
+[ 91/ 219] blk.17.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_xs .. size = 8.00 MiB -> 1.16 MiB
+[ 92/ 219] blk.17.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB
+[ 93/ 219] blk.18.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
+[ 94/ 219] blk.18.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
+
+llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq2_xs - using fallback quantization iq4_nl
+converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
+[ 95/ 219] blk.18.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq2_xs .. size = 21.25 MiB -> 3.07 MiB
+[ 96/ 219] blk.18.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq2_xs .. size = 21.25 MiB -> 3.07 MiB
+[ 97/ 219] blk.18.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
+[ 98/ 219] blk.18.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_xs .. size = 8.00 MiB -> 1.16 MiB
+[ 99/ 219] blk.18.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_xs .. size = 8.00 MiB -> 1.16 MiB
+[ 100/ 219] blk.18.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_xs .. size = 8.00 MiB -> 1.16 MiB
+[ 101/ 219] blk.18.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB
+[ 102/ 219] blk.19.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
+[ 103/ 219] blk.19.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
+
+llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq2_xs - using fallback quantization iq4_nl
+converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
+[ 104/ 219] blk.19.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq2_xs .. size = 21.25 MiB -> 3.07 MiB
+[ 105/ 219] blk.19.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq2_xs .. size = 21.25 MiB -> 3.07 MiB
+[ 106/ 219] blk.19.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
+[ 107/ 219] blk.19.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_xs .. size = 8.00 MiB -> 1.16 MiB
+[ 108/ 219] blk.19.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_xs .. size = 8.00 MiB -> 1.16 MiB
+[ 109/ 219] blk.19.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_xs .. size = 8.00 MiB -> 1.16 MiB
+[ 110/ 219] blk.19.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB
+[ 111/ 219] blk.2.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
+[ 112/ 219] blk.2.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
+
+llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq2_xs - using fallback quantization iq4_nl
+converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
+[ 113/ 219] blk.2.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq2_xs .. size = 21.25 MiB -> 3.07 MiB
+[ 114/ 219] blk.2.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq2_xs .. size = 21.25 MiB -> 3.07 MiB
+[ 115/ 219] blk.2.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
+[ 116/ 219] blk.2.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_xs .. size = 8.00 MiB -> 1.16 MiB
+[ 117/ 219] blk.2.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_xs .. size = 8.00 MiB -> 1.16 MiB
+[ 118/ 219] blk.2.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_xs .. size = 8.00 MiB -> 1.16 MiB
+[ 119/ 219] blk.2.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB
+[ 120/ 219] blk.20.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
+[ 121/ 219] blk.20.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
+
+llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq2_xs - using fallback quantization iq4_nl
+converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
+[ 122/ 219] blk.20.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq2_xs .. size = 21.25 MiB -> 3.07 MiB
+[ 123/ 219] blk.20.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq2_xs .. size = 21.25 MiB -> 3.07 MiB
+[ 124/ 219] blk.20.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
+[ 125/ 219] blk.20.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_xs .. size = 8.00 MiB -> 1.16 MiB
+[ 126/ 219] blk.20.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_xs .. size = 8.00 MiB -> 1.16 MiB
+[ 127/ 219] blk.20.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_xs .. size = 8.00 MiB -> 1.16 MiB
+[ 128/ 219] blk.20.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB
+[ 129/ 219] blk.21.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
+[ 130/ 219] blk.21.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
+
+llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq2_xs - using fallback quantization iq4_nl
+converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
+[ 131/ 219] blk.21.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq2_xs .. size = 21.25 MiB -> 3.07 MiB
+[ 132/ 219] blk.21.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq2_xs .. size = 21.25 MiB -> 3.07 MiB
+[ 133/ 219] blk.21.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
+[ 134/ 219] blk.21.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_xs .. size = 8.00 MiB -> 1.16 MiB
+[ 135/ 219] blk.21.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_xs .. size = 8.00 MiB -> 1.16 MiB
+[ 136/ 219] blk.21.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_xs .. size = 8.00 MiB -> 1.16 MiB
+[ 137/ 219] blk.21.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB
+[ 138/ 219] blk.22.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
+[ 139/ 219] blk.22.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
+
+llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq2_xs - using fallback quantization iq4_nl
+converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
+[ 140/ 219] blk.22.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq2_xs .. size = 21.25 MiB -> 3.07 MiB
+[ 141/ 219] blk.22.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq2_xs .. size = 21.25 MiB -> 3.07 MiB
+[ 142/ 219] blk.22.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
+[ 143/ 219] blk.22.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_xs .. size = 8.00 MiB -> 1.16 MiB
+[ 144/ 219] blk.22.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_xs .. size = 8.00 MiB -> 1.16 MiB
+[ 145/ 219] blk.22.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_xs .. size = 8.00 MiB -> 1.16 MiB
+[ 146/ 219] blk.22.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB
+[ 147/ 219] blk.23.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
+[ 148/ 219] blk.23.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
+
+llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq2_xs - using fallback quantization iq4_nl
+converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
+[ 149/ 219] blk.23.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq2_xs .. size = 21.25 MiB -> 3.07 MiB
+[ 150/ 219] blk.23.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq2_xs .. size = 21.25 MiB -> 3.07 MiB
+[ 151/ 219] blk.23.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
+[ 152/ 219] blk.23.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_xs .. size = 8.00 MiB -> 1.16 MiB
+[ 153/ 219] blk.23.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_xs .. size = 8.00 MiB -> 1.16 MiB
+[ 154/ 219] blk.23.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_xs .. size = 8.00 MiB -> 1.16 MiB
+[ 155/ 219] blk.23.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB
+[ 156/ 219] blk.3.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
+[ 157/ 219] blk.3.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
+
+llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq2_xs - using fallback quantization iq4_nl
+converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
+[ 158/ 219] blk.3.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq2_xs .. size = 21.25 MiB -> 3.07 MiB
+[ 159/ 219] blk.3.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq2_xs .. size = 21.25 MiB -> 3.07 MiB
+[ 160/ 219] blk.3.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
+[ 161/ 219] blk.3.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_xs .. size = 8.00 MiB -> 1.16 MiB
+[ 162/ 219] blk.3.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_xs .. size = 8.00 MiB -> 1.16 MiB
+[ 163/ 219] blk.3.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_xs .. size = 8.00 MiB -> 1.16 MiB
+[ 164/ 219] blk.3.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB
+[ 165/ 219] blk.4.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
+[ 166/ 219] blk.4.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
+
+llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq2_xs - using fallback quantization iq4_nl
+converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
+[ 167/ 219] blk.4.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq2_xs .. size = 21.25 MiB -> 3.07 MiB
+[ 168/ 219] blk.4.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq2_xs .. size = 21.25 MiB -> 3.07 MiB
+[ 169/ 219] blk.4.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
+[ 170/ 219] blk.4.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_xs .. size = 8.00 MiB -> 1.16 MiB
+[ 171/ 219] blk.4.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_xs .. size = 8.00 MiB -> 1.16 MiB
+[ 172/ 219] blk.4.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_xs .. size = 8.00 MiB -> 1.16 MiB
+[ 173/ 219] blk.4.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB
+[ 174/ 219] blk.5.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
+[ 175/ 219] blk.5.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
+
+llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq2_xs - using fallback quantization iq4_nl
+converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
+[ 176/ 219] blk.5.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq2_xs .. size = 21.25 MiB -> 3.07 MiB
+[ 177/ 219] blk.5.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq2_xs .. size = 21.25 MiB -> 3.07 MiB
+[ 178/ 219] blk.5.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
+[ 179/ 219] blk.5.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_xs .. size = 8.00 MiB -> 1.16 MiB
+[ 180/ 219] blk.5.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_xs .. size = 8.00 MiB -> 1.16 MiB
+[ 181/ 219] blk.5.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_xs .. size = 8.00 MiB -> 1.16 MiB
+[ 182/ 219] blk.5.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB
+[ 183/ 219] blk.6.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
+[ 184/ 219] blk.6.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
+
+llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq2_xs - using fallback quantization iq4_nl
+converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
+[ 185/ 219] blk.6.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq2_xs .. size = 21.25 MiB -> 3.07 MiB
+[ 186/ 219] blk.6.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq2_xs .. size = 21.25 MiB -> 3.07 MiB
+[ 187/ 219] blk.6.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
+[ 188/ 219] blk.6.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_xs .. size = 8.00 MiB -> 1.16 MiB
+[ 189/ 219] blk.6.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_xs .. size = 8.00 MiB -> 1.16 MiB
+[ 190/ 219] blk.6.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_xs .. size = 8.00 MiB -> 1.16 MiB
+[ 191/ 219] blk.6.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB
+[ 192/ 219] blk.7.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
+[ 193/ 219] blk.7.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
+
+llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq2_xs - using fallback quantization iq4_nl
+converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
+[ 194/ 219] blk.7.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq2_xs .. size = 21.25 MiB -> 3.07 MiB
+[ 195/ 219] blk.7.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq2_xs .. size = 21.25 MiB -> 3.07 MiB
+[ 196/ 219] blk.7.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
+[ 197/ 219] blk.7.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_xs .. size = 8.00 MiB -> 1.16 MiB
+[ 198/ 219] blk.7.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_xs .. size = 8.00 MiB -> 1.16 MiB
+[ 199/ 219] blk.7.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_xs .. size = 8.00 MiB -> 1.16 MiB
+[ 200/ 219] blk.7.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB
+[ 201/ 219] blk.8.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
+[ 202/ 219] blk.8.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
+
+llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq2_xs - using fallback quantization iq4_nl
+converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
+[ 203/ 219] blk.8.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq2_xs .. size = 21.25 MiB -> 3.07 MiB
+[ 204/ 219] blk.8.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq2_xs .. size = 21.25 MiB -> 3.07 MiB
+[ 205/ 219] blk.8.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
+[ 206/ 219] blk.8.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_xs .. size = 8.00 MiB -> 1.16 MiB
+[ 207/ 219] blk.8.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_xs .. size = 8.00 MiB -> 1.16 MiB
+[ 208/ 219] blk.8.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_xs .. size = 8.00 MiB -> 1.16 MiB
+[ 209/ 219] blk.8.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB
+[ 210/ 219] blk.9.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
+[ 211/ 219] blk.9.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
+
+llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq2_xs - using fallback quantization iq4_nl
+converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
+[ 212/ 219] blk.9.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq2_xs .. size = 21.25 MiB -> 3.07 MiB
+[ 213/ 219] blk.9.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq2_xs .. size = 21.25 MiB -> 3.07 MiB
+[ 214/ 219] blk.9.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
+[ 215/ 219] blk.9.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_xs .. size = 8.00 MiB -> 1.16 MiB
+[ 216/ 219] blk.9.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_xs .. size = 8.00 MiB -> 1.16 MiB
+[ 217/ 219] blk.9.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_xs .. size = 8.00 MiB -> 1.16 MiB
+[ 218/ 219] blk.9.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB
+[ 219/ 219] output_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
+llama_model_quantize_internal: model size = 4298.38 MB
+llama_model_quantize_internal: quant size = 1570.05 MB
+llama_model_quantize_internal: WARNING: 24 of 169 tensor(s) required fallback quantization
+
+main: quantize time = 33024.88 ms
+main: total time = 33024.88 ms
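The warning at the end of this log follows from the model's FFN width: 5440 is not a multiple of 256, the super-block size that q2_K and iq2_xs require, so every blk.*.ffn_down.weight tensor (one per layer, 24 in total) falls back to iq4_nl — exactly the "24 of 169" reported. Two quick checks against the log, using only numbers it already contains:

```bash
grep -c 'using fallback quantization' IQ2_XS_log.txt   # 24, matching the WARNING line
awk 'BEGIN { printf "%.2fx\n", 4298.38 / 1570.05 }'    # ~2.74x: bf16 model size vs. IQ2_XS quant size
```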
IQ3_M_log.txt
ADDED
@@ -0,0 +1,339 @@
+main: build = 3906 (7eee341b)
+main: built with Apple clang version 15.0.0 (clang-1500.3.9.4) for arm64-apple-darwin23.6.0
+main: quantizing 'salamandra-2b_bf16.gguf' to './salamandra-2b_IQ3_M.gguf' as IQ3_M
+llama_model_loader: loaded meta data with 29 key-value pairs and 219 tensors from salamandra-2b_bf16.gguf (version GGUF V3 (latest))
+llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+llama_model_loader: - kv 0: general.architecture str = llama
+llama_model_loader: - kv 1: general.type str = model
+llama_model_loader: - kv 2: general.size_label str = 2.3B
+llama_model_loader: - kv 3: general.license str = apache-2.0
+llama_model_loader: - kv 4: general.tags arr[str,1] = ["text-generation"]
+llama_model_loader: - kv 5: general.languages arr[str,36] = ["bg", "ca", "code", "cs", "cy", "da"...
+llama_model_loader: - kv 6: llama.block_count u32 = 24
+llama_model_loader: - kv 7: llama.context_length u32 = 8192
+llama_model_loader: - kv 8: llama.embedding_length u32 = 2048
+llama_model_loader: - kv 9: llama.feed_forward_length u32 = 5440
+llama_model_loader: - kv 10: llama.attention.head_count u32 = 16
+llama_model_loader: - kv 11: llama.attention.head_count_kv u32 = 16
+llama_model_loader: - kv 12: llama.rope.freq_base f32 = 10000.000000
+llama_model_loader: - kv 13: llama.attention.layer_norm_rms_epsilon f32 = 0.000010
+llama_model_loader: - kv 14: general.file_type u32 = 32
+llama_model_loader: - kv 15: llama.vocab_size u32 = 256000
+llama_model_loader: - kv 16: llama.rope.dimension_count u32 = 128
+llama_model_loader: - kv 17: tokenizer.ggml.add_space_prefix bool = true
+llama_model_loader: - kv 18: tokenizer.ggml.model str = llama
+llama_model_loader: - kv 19: tokenizer.ggml.pre str = default
+llama_model_loader: - kv 20: tokenizer.ggml.tokens arr[str,256000] = ["<unk>", "<s>", "</s>", "<pad>", "<|...
+llama_model_loader: - kv 21: tokenizer.ggml.scores arr[f32,256000] = [-1000.000000, -1000.000000, -1000.00...
+llama_model_loader: - kv 22: tokenizer.ggml.token_type arr[i32,256000] = [3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...
+llama_model_loader: - kv 23: tokenizer.ggml.bos_token_id u32 = 1
+llama_model_loader: - kv 24: tokenizer.ggml.eos_token_id u32 = 2
+llama_model_loader: - kv 25: tokenizer.ggml.unknown_token_id u32 = 0
+llama_model_loader: - kv 26: tokenizer.ggml.add_bos_token bool = true
+llama_model_loader: - kv 27: tokenizer.ggml.add_eos_token bool = false
+llama_model_loader: - kv 28: general.quantization_version u32 = 2
+llama_model_loader: - type f32: 49 tensors
+llama_model_loader: - type bf16: 170 tensors
+================================ Have weights data with 168 entries
+[ 1/ 219] output.weight - [ 2048, 256000, 1, 1], type = bf16, size = 1000.000 MB
+[ 2/ 219] token_embd.weight - [ 2048, 256000, 1, 1], type = bf16,
+====== llama_model_quantize_internal: did not find weights for token_embd.weight
+converting to iq3_s .. load_imatrix: imatrix dataset='./imatrix/oscar/imatrix-dataset.txt'
+load_imatrix: loaded 168 importance matrix entries from imatrix/oscar/imatrix.dat computed on 44176 chunks
+prepare_imatrix: have 168 importance matrix entries
+size = 1000.00 MiB -> 214.84 MiB
+[ 3/ 219] blk.0.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
+[ 4/ 219] blk.0.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
+
+llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q4_K - using fallback quantization q5_0
+converting to q5_0 .. size = 21.25 MiB -> 7.30 MiB
+[ 5/ 219] blk.0.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
+[ 6/ 219] blk.0.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
+[ 7/ 219] blk.0.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
+[ 8/ 219] blk.0.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
+[ 9/ 219] blk.0.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
+[ 10/ 219] blk.0.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
+[ 11/ 219] blk.0.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
+[ 12/ 219] blk.1.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
+[ 13/ 219] blk.1.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
+
+llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q4_K - using fallback quantization q5_0
+converting to q5_0 .. size = 21.25 MiB -> 7.30 MiB
+[ 14/ 219] blk.1.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
+[ 15/ 219] blk.1.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
+[ 16/ 219] blk.1.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
+[ 17/ 219] blk.1.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
+[ 18/ 219] blk.1.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
+[ 19/ 219] blk.1.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
+[ 20/ 219] blk.1.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
+[ 21/ 219] blk.10.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
+[ 22/ 219] blk.10.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
+
+llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q4_K - using fallback quantization q5_0
+converting to q5_0 .. size = 21.25 MiB -> 7.30 MiB
+[ 23/ 219] blk.10.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
+[ 24/ 219] blk.10.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
+[ 25/ 219] blk.10.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
+[ 26/ 219] blk.10.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
+[ 27/ 219] blk.10.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
+[ 28/ 219] blk.10.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
+[ 29/ 219] blk.10.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
+[ 30/ 219] blk.11.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
+[ 31/ 219] blk.11.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
+
+llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_s - using fallback quantization iq4_nl
+converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
+[ 32/ 219] blk.11.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
+[ 33/ 219] blk.11.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
+[ 34/ 219] blk.11.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
+[ 35/ 219] blk.11.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
+[ 36/ 219] blk.11.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
+[ 37/ 219] blk.11.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
+[ 38/ 219] blk.11.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
+[ 39/ 219] blk.12.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
+[ 40/ 219] blk.12.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
+
+llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_s - using fallback quantization iq4_nl
+converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
+[ 41/ 219] blk.12.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
+[ 42/ 219] blk.12.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
+[ 43/ 219] blk.12.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
+[ 44/ 219] blk.12.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
+[ 45/ 219] blk.12.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
+[ 46/ 219] blk.12.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
+[ 47/ 219] blk.12.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
+[ 48/ 219] blk.13.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
+[ 49/ 219] blk.13.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
+
+llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_s - using fallback quantization iq4_nl
+converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
+[ 50/ 219] blk.13.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
+[ 51/ 219] blk.13.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
+[ 52/ 219] blk.13.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
+[ 53/ 219] blk.13.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
+[ 54/ 219] blk.13.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
+[ 55/ 219] blk.13.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
+[ 56/ 219] blk.13.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
+[ 57/ 219] blk.14.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
+[ 58/ 219] blk.14.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
+
+llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_s - using fallback quantization iq4_nl
+converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
+[ 59/ 219] blk.14.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
+[ 60/ 219] blk.14.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
+[ 61/ 219] blk.14.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
+[ 62/ 219] blk.14.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
+[ 63/ 219] blk.14.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
+[ 64/ 219] blk.14.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
+[ 65/ 219] blk.14.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
+[ 66/ 219] blk.15.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
+[ 67/ 219] blk.15.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
+
+llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_s - using fallback quantization iq4_nl
+converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
+[ 68/ 219] blk.15.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
|
135 |
+
[ 69/ 219] blk.15.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
|
136 |
+
[ 70/ 219] blk.15.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
137 |
+
[ 71/ 219] blk.15.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
|
138 |
+
[ 72/ 219] blk.15.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
|
139 |
+
[ 73/ 219] blk.15.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
|
140 |
+
[ 74/ 219] blk.15.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
|
141 |
+
[ 75/ 219] blk.16.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
142 |
+
[ 76/ 219] blk.16.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
|
143 |
+
|
144 |
+
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_s - using fallback quantization iq4_nl
|
145 |
+
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
|
146 |
+
[ 77/ 219] blk.16.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
|
147 |
+
[ 78/ 219] blk.16.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
|
148 |
+
[ 79/ 219] blk.16.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
149 |
+
[ 80/ 219] blk.16.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
|
150 |
+
[ 81/ 219] blk.16.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
|
151 |
+
[ 82/ 219] blk.16.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
|
152 |
+
[ 83/ 219] blk.16.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
|
153 |
+
[ 84/ 219] blk.17.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
154 |
+
[ 85/ 219] blk.17.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
|
155 |
+
|
156 |
+
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_s - using fallback quantization iq4_nl
|
157 |
+
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
|
158 |
+
[ 86/ 219] blk.17.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
|
159 |
+
[ 87/ 219] blk.17.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
|
160 |
+
[ 88/ 219] blk.17.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
161 |
+
[ 89/ 219] blk.17.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
|
162 |
+
[ 90/ 219] blk.17.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
|
163 |
+
[ 91/ 219] blk.17.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
|
164 |
+
[ 92/ 219] blk.17.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
|
165 |
+
[ 93/ 219] blk.18.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
166 |
+
[ 94/ 219] blk.18.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
|
167 |
+
|
168 |
+
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_s - using fallback quantization iq4_nl
|
169 |
+
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
|
170 |
+
[ 95/ 219] blk.18.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
|
171 |
+
[ 96/ 219] blk.18.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
|
172 |
+
[ 97/ 219] blk.18.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
173 |
+
[ 98/ 219] blk.18.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
|
174 |
+
[ 99/ 219] blk.18.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
|
175 |
+
[ 100/ 219] blk.18.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
|
176 |
+
[ 101/ 219] blk.18.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
|
177 |
+
[ 102/ 219] blk.19.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
178 |
+
[ 103/ 219] blk.19.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
|
179 |
+
|
180 |
+
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_s - using fallback quantization iq4_nl
|
181 |
+
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
|
182 |
+
[ 104/ 219] blk.19.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
|
183 |
+
[ 105/ 219] blk.19.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
|
184 |
+
[ 106/ 219] blk.19.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
185 |
+
[ 107/ 219] blk.19.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
|
186 |
+
[ 108/ 219] blk.19.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
|
187 |
+
[ 109/ 219] blk.19.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
|
188 |
+
[ 110/ 219] blk.19.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
|
189 |
+
[ 111/ 219] blk.2.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
190 |
+
[ 112/ 219] blk.2.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
|
191 |
+
|
192 |
+
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_s - using fallback quantization iq4_nl
|
193 |
+
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
|
194 |
+
[ 113/ 219] blk.2.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
|
195 |
+
[ 114/ 219] blk.2.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
|
196 |
+
[ 115/ 219] blk.2.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
197 |
+
[ 116/ 219] blk.2.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
|
198 |
+
[ 117/ 219] blk.2.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
|
199 |
+
[ 118/ 219] blk.2.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
|
200 |
+
[ 119/ 219] blk.2.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
|
201 |
+
[ 120/ 219] blk.20.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
202 |
+
[ 121/ 219] blk.20.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
|
203 |
+
|
204 |
+
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_s - using fallback quantization iq4_nl
|
205 |
+
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
|
206 |
+
[ 122/ 219] blk.20.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
|
207 |
+
[ 123/ 219] blk.20.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
|
208 |
+
[ 124/ 219] blk.20.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
209 |
+
[ 125/ 219] blk.20.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
|
210 |
+
[ 126/ 219] blk.20.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
|
211 |
+
[ 127/ 219] blk.20.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
|
212 |
+
[ 128/ 219] blk.20.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
|
213 |
+
[ 129/ 219] blk.21.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
214 |
+
[ 130/ 219] blk.21.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
|
215 |
+
|
216 |
+
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_s - using fallback quantization iq4_nl
|
217 |
+
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
|
218 |
+
[ 131/ 219] blk.21.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
|
219 |
+
[ 132/ 219] blk.21.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
|
220 |
+
[ 133/ 219] blk.21.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
221 |
+
[ 134/ 219] blk.21.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
|
222 |
+
[ 135/ 219] blk.21.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
|
223 |
+
[ 136/ 219] blk.21.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
|
224 |
+
[ 137/ 219] blk.21.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
|
225 |
+
[ 138/ 219] blk.22.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
226 |
+
[ 139/ 219] blk.22.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
|
227 |
+
|
228 |
+
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_s - using fallback quantization iq4_nl
|
229 |
+
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
|
230 |
+
[ 140/ 219] blk.22.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
|
231 |
+
[ 141/ 219] blk.22.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
|
232 |
+
[ 142/ 219] blk.22.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
233 |
+
[ 143/ 219] blk.22.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
|
234 |
+
[ 144/ 219] blk.22.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
|
235 |
+
[ 145/ 219] blk.22.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
|
236 |
+
[ 146/ 219] blk.22.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
|
237 |
+
[ 147/ 219] blk.23.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
238 |
+
[ 148/ 219] blk.23.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
|
239 |
+
|
240 |
+
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_s - using fallback quantization iq4_nl
|
241 |
+
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
|
242 |
+
[ 149/ 219] blk.23.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
|
243 |
+
[ 150/ 219] blk.23.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
|
244 |
+
[ 151/ 219] blk.23.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
245 |
+
[ 152/ 219] blk.23.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
|
246 |
+
[ 153/ 219] blk.23.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
|
247 |
+
[ 154/ 219] blk.23.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
|
248 |
+
[ 155/ 219] blk.23.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
|
249 |
+
[ 156/ 219] blk.3.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
250 |
+
[ 157/ 219] blk.3.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
|
251 |
+
|
252 |
+
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_s - using fallback quantization iq4_nl
|
253 |
+
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
|
254 |
+
[ 158/ 219] blk.3.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
|
255 |
+
[ 159/ 219] blk.3.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
|
256 |
+
[ 160/ 219] blk.3.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
257 |
+
[ 161/ 219] blk.3.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
|
258 |
+
[ 162/ 219] blk.3.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
|
259 |
+
[ 163/ 219] blk.3.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
|
260 |
+
[ 164/ 219] blk.3.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
|
261 |
+
[ 165/ 219] blk.4.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
262 |
+
[ 166/ 219] blk.4.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
|
263 |
+
|
264 |
+
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_s - using fallback quantization iq4_nl
|
265 |
+
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
|
266 |
+
[ 167/ 219] blk.4.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
|
267 |
+
[ 168/ 219] blk.4.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
|
268 |
+
[ 169/ 219] blk.4.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
269 |
+
[ 170/ 219] blk.4.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
|
270 |
+
[ 171/ 219] blk.4.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
|
271 |
+
[ 172/ 219] blk.4.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
|
272 |
+
[ 173/ 219] blk.4.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
|
273 |
+
[ 174/ 219] blk.5.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
274 |
+
[ 175/ 219] blk.5.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
|
275 |
+
|
276 |
+
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_s - using fallback quantization iq4_nl
|
277 |
+
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
|
278 |
+
[ 176/ 219] blk.5.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
|
279 |
+
[ 177/ 219] blk.5.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
|
280 |
+
[ 178/ 219] blk.5.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
281 |
+
[ 179/ 219] blk.5.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
|
282 |
+
[ 180/ 219] blk.5.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
|
283 |
+
[ 181/ 219] blk.5.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
|
284 |
+
[ 182/ 219] blk.5.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
|
285 |
+
[ 183/ 219] blk.6.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
286 |
+
[ 184/ 219] blk.6.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
|
287 |
+
|
288 |
+
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_s - using fallback quantization iq4_nl
|
289 |
+
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
|
290 |
+
[ 185/ 219] blk.6.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
|
291 |
+
[ 186/ 219] blk.6.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
|
292 |
+
[ 187/ 219] blk.6.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
293 |
+
[ 188/ 219] blk.6.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
|
294 |
+
[ 189/ 219] blk.6.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
|
295 |
+
[ 190/ 219] blk.6.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
|
296 |
+
[ 191/ 219] blk.6.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
|
297 |
+
[ 192/ 219] blk.7.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
298 |
+
[ 193/ 219] blk.7.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
|
299 |
+
|
300 |
+
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_s - using fallback quantization iq4_nl
|
301 |
+
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
|
302 |
+
[ 194/ 219] blk.7.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
|
303 |
+
[ 195/ 219] blk.7.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
|
304 |
+
[ 196/ 219] blk.7.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
305 |
+
[ 197/ 219] blk.7.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
|
306 |
+
[ 198/ 219] blk.7.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
|
307 |
+
[ 199/ 219] blk.7.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
|
308 |
+
[ 200/ 219] blk.7.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
|
309 |
+
[ 201/ 219] blk.8.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
310 |
+
[ 202/ 219] blk.8.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
|
311 |
+
|
312 |
+
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_s - using fallback quantization iq4_nl
|
313 |
+
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
|
314 |
+
[ 203/ 219] blk.8.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
|
315 |
+
[ 204/ 219] blk.8.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
|
316 |
+
[ 205/ 219] blk.8.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
317 |
+
[ 206/ 219] blk.8.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
|
318 |
+
[ 207/ 219] blk.8.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
|
319 |
+
[ 208/ 219] blk.8.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
|
320 |
+
[ 209/ 219] blk.8.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
|
321 |
+
[ 210/ 219] blk.9.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
322 |
+
[ 211/ 219] blk.9.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
|
323 |
+
|
324 |
+
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_s - using fallback quantization iq4_nl
|
325 |
+
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
|
326 |
+
[ 212/ 219] blk.9.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
|
327 |
+
[ 213/ 219] blk.9.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
|
328 |
+
[ 214/ 219] blk.9.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
329 |
+
[ 215/ 219] blk.9.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
|
330 |
+
[ 216/ 219] blk.9.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
|
331 |
+
[ 217/ 219] blk.9.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
|
332 |
+
[ 218/ 219] blk.9.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
|
333 |
+
[ 219/ 219] output_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
334 |
+
llama_model_quantize_internal: model size = 4298.38 MB
|
335 |
+
llama_model_quantize_internal: quant size = 1772.29 MB
|
336 |
+
llama_model_quantize_internal: WARNING: 24 of 169 tensor(s) required fallback quantization
|
337 |
+
|
338 |
+
main: quantize time = 20053.13 ms
|
339 |
+
main: total time = 20053.13 ms
|
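A note on the repeated fallback warnings in the log above: llama.cpp's k-quants and i-quants pack weights into super-blocks of 256 elements, so a tensor can only use them when its row length is divisible by 256. Salamandra's feed-forward width of 5440 is not (5440 = 21 × 256 + 64), so every ffn_down tensor, whose rows run along that dimension, drops to a 32-element-block format (q5_0 or iq4_nl in these logs), while ffn_gate/ffn_up rows of length 2048 quantize as requested. A minimal sketch of that selection rule, modelled on the messages above rather than quoting the llama.cpp source:

```python
# Illustrative sketch of the fallback rule behind the "not divisible
# by 256" messages above; the real logic lives in llama.cpp's
# llama_tensor_get_type, this is only a model of the behaviour shown.
QK_K = 256  # super-block size shared by k-quants and i-quants
FALLBACKS = {"iq3_s": "iq4_nl", "q4_K": "q5_0"}  # pairs seen in these logs

def pick_quant(row_len: int, wanted: str) -> str:
    """Return the quant type actually used for rows of length row_len."""
    if row_len % QK_K != 0 and wanted in FALLBACKS:
        return FALLBACKS[wanted]  # 32-element-block types have no 256 requirement
    return wanted

assert pick_quant(2048, "iq3_s") == "iq3_s"   # ffn_gate/ffn_up: 2048 = 8 * 256
assert pick_quant(5440, "iq3_s") == "iq4_nl"  # ffn_down: 5440 = 21 * 256 + 64
```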
IQ3_S_log.txt
ADDED
@@ -0,0 +1,339 @@
main: build = 3906 (7eee341b)
main: built with Apple clang version 15.0.0 (clang-1500.3.9.4) for arm64-apple-darwin23.6.0
main: quantizing 'salamandra-2b_bf16.gguf' to './salamandra-2b_IQ3_S.gguf' as IQ3_S
llama_model_loader: loaded meta data with 29 key-value pairs and 219 tensors from salamandra-2b_bf16.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv 0: general.architecture str = llama
llama_model_loader: - kv 1: general.type str = model
llama_model_loader: - kv 2: general.size_label str = 2.3B
llama_model_loader: - kv 3: general.license str = apache-2.0
llama_model_loader: - kv 4: general.tags arr[str,1] = ["text-generation"]
llama_model_loader: - kv 5: general.languages arr[str,36] = ["bg", "ca", "code", "cs", "cy", "da"...
llama_model_loader: - kv 6: llama.block_count u32 = 24
llama_model_loader: - kv 7: llama.context_length u32 = 8192
llama_model_loader: - kv 8: llama.embedding_length u32 = 2048
llama_model_loader: - kv 9: llama.feed_forward_length u32 = 5440
llama_model_loader: - kv 10: llama.attention.head_count u32 = 16
llama_model_loader: - kv 11: llama.attention.head_count_kv u32 = 16
llama_model_loader: - kv 12: llama.rope.freq_base f32 = 10000.000000
llama_model_loader: - kv 13: llama.attention.layer_norm_rms_epsilon f32 = 0.000010
llama_model_loader: - kv 14: general.file_type u32 = 32
llama_model_loader: - kv 15: llama.vocab_size u32 = 256000
llama_model_loader: - kv 16: llama.rope.dimension_count u32 = 128
llama_model_loader: - kv 17: tokenizer.ggml.add_space_prefix bool = true
llama_model_loader: - kv 18: tokenizer.ggml.model str = llama
llama_model_loader: - kv 19: tokenizer.ggml.pre str = default
llama_model_loader: - kv 20: tokenizer.ggml.tokens arr[str,256000] = ["<unk>", "<s>", "</s>", "<pad>", "<|...
llama_model_loader: - kv 21: tokenizer.ggml.scores arr[f32,256000] = [-1000.000000, -1000.000000, -1000.00...
llama_model_loader: - kv 22: tokenizer.ggml.token_type arr[i32,256000] = [3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...
llama_model_loader: - kv 23: tokenizer.ggml.bos_token_id u32 = 1
llama_model_loader: - kv 24: tokenizer.ggml.eos_token_id u32 = 2
llama_model_loader: - kv 25: tokenizer.ggml.unknown_token_id u32 = 0
llama_model_loader: - kv 26: tokenizer.ggml.add_bos_token bool = true
llama_model_loader: - kv 27: tokenizer.ggml.add_eos_token bool = false
llama_model_loader: - kv 28: general.quantization_version u32 = 2
llama_model_loader: - type f32: 49 tensors
llama_model_loader: - type bf16: 170 tensors
================================ Have weights data with 168 entries
[ 1/ 219] output.weight - [ 2048, 256000, 1, 1], type = bf16, size = 1000.000 MB
[ 2/ 219] token_embd.weight - [ 2048, 256000, 1, 1], type = bf16,
====== llama_model_quantize_internal: did not find weights for token_embd.weight
converting to iq3_s .. load_imatrix: imatrix dataset='./imatrix/oscar/imatrix-dataset.txt'
load_imatrix: loaded 168 importance matrix entries from imatrix/oscar/imatrix.dat computed on 44176 chunks
prepare_imatrix: have 168 importance matrix entries
size = 1000.00 MiB -> 214.84 MiB
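The load_imatrix lines above show the importance matrix at work: 168 per-tensor activation statistics, computed over 44176 chunks of the multilingual OSCAR text shipped under imatrix/oscar/, are loaded before any tensor is converted. Conceptually, the imatrix turns plain round-to-nearest quantization into an importance-weighted fit that concentrates precision on the columns the calibration data actually activates. A toy sketch of that idea, not the actual llama.cpp kernels:

```python
# Toy sketch of importance-weighted quantization, assuming the usual
# weighted-MSE formulation; llama.cpp's real IQ3_S kernels are far more
# elaborate (grid codebooks, per-block scales, sign packing).
import numpy as np

def quantize_block(x: np.ndarray, w: np.ndarray, n_scales: int = 64) -> np.ndarray:
    """Pick the scale that minimizes the importance-weighted squared error."""
    best, best_err = x, np.inf
    for s in np.linspace(0.01, 0.2, n_scales):
        q = np.round(x / s)                        # integer codes at this scale
        err = float(np.sum(w * (x - s * q) ** 2))  # imatrix entries weight the error
        if err < best_err:
            best, best_err = s * q, err
    return best

x = np.random.randn(256).astype(np.float32)  # one 256-element super-block of weights
w = np.random.rand(256).astype(np.float32)   # per-column importance (mean squared activation)
x_hat = quantize_block(x, w)
```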
[ 3/ 219] blk.0.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 4/ 219] blk.0.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_s - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 5/ 219] blk.0.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
[ 6/ 219] blk.0.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
[ 7/ 219] blk.0.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 8/ 219] blk.0.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 9/ 219] blk.0.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 10/ 219] blk.0.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 11/ 219] blk.0.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 12/ 219] blk.1.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 13/ 219] blk.1.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_s - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 14/ 219] blk.1.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
[ 15/ 219] blk.1.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
[ 16/ 219] blk.1.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 17/ 219] blk.1.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 18/ 219] blk.1.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 19/ 219] blk.1.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 20/ 219] blk.1.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 21/ 219] blk.10.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 22/ 219] blk.10.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_s - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 23/ 219] blk.10.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
[ 24/ 219] blk.10.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
[ 25/ 219] blk.10.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 26/ 219] blk.10.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 27/ 219] blk.10.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 28/ 219] blk.10.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 29/ 219] blk.10.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 30/ 219] blk.11.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 31/ 219] blk.11.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_s - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 32/ 219] blk.11.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
[ 33/ 219] blk.11.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
[ 34/ 219] blk.11.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 35/ 219] blk.11.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 36/ 219] blk.11.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 37/ 219] blk.11.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 38/ 219] blk.11.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 39/ 219] blk.12.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 40/ 219] blk.12.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_s - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 41/ 219] blk.12.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
[ 42/ 219] blk.12.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
[ 43/ 219] blk.12.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 44/ 219] blk.12.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 45/ 219] blk.12.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 46/ 219] blk.12.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 47/ 219] blk.12.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 48/ 219] blk.13.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 49/ 219] blk.13.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_s - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 50/ 219] blk.13.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
[ 51/ 219] blk.13.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
[ 52/ 219] blk.13.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 53/ 219] blk.13.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 54/ 219] blk.13.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 55/ 219] blk.13.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 56/ 219] blk.13.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 57/ 219] blk.14.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 58/ 219] blk.14.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_s - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 59/ 219] blk.14.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
[ 60/ 219] blk.14.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
[ 61/ 219] blk.14.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 62/ 219] blk.14.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 63/ 219] blk.14.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 64/ 219] blk.14.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 65/ 219] blk.14.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 66/ 219] blk.15.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 67/ 219] blk.15.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_s - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 68/ 219] blk.15.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
[ 69/ 219] blk.15.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
[ 70/ 219] blk.15.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 71/ 219] blk.15.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 72/ 219] blk.15.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 73/ 219] blk.15.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 74/ 219] blk.15.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 75/ 219] blk.16.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 76/ 219] blk.16.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_s - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 77/ 219] blk.16.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
[ 78/ 219] blk.16.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
[ 79/ 219] blk.16.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 80/ 219] blk.16.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 81/ 219] blk.16.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 82/ 219] blk.16.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 83/ 219] blk.16.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 84/ 219] blk.17.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 85/ 219] blk.17.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_s - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 86/ 219] blk.17.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
[ 87/ 219] blk.17.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
[ 88/ 219] blk.17.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 89/ 219] blk.17.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 90/ 219] blk.17.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 91/ 219] blk.17.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 92/ 219] blk.17.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 93/ 219] blk.18.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 94/ 219] blk.18.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_s - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 95/ 219] blk.18.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
[ 96/ 219] blk.18.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
[ 97/ 219] blk.18.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 98/ 219] blk.18.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 99/ 219] blk.18.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 100/ 219] blk.18.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 101/ 219] blk.18.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 102/ 219] blk.19.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 103/ 219] blk.19.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_s - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 104/ 219] blk.19.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
[ 105/ 219] blk.19.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
[ 106/ 219] blk.19.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 107/ 219] blk.19.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 108/ 219] blk.19.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 109/ 219] blk.19.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 110/ 219] blk.19.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 111/ 219] blk.2.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 112/ 219] blk.2.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_s - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 113/ 219] blk.2.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
[ 114/ 219] blk.2.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
[ 115/ 219] blk.2.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 116/ 219] blk.2.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 117/ 219] blk.2.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 118/ 219] blk.2.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 119/ 219] blk.2.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 120/ 219] blk.20.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 121/ 219] blk.20.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_s - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 122/ 219] blk.20.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
[ 123/ 219] blk.20.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
[ 124/ 219] blk.20.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 125/ 219] blk.20.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 126/ 219] blk.20.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 127/ 219] blk.20.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 128/ 219] blk.20.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 129/ 219] blk.21.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 130/ 219] blk.21.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_s - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 131/ 219] blk.21.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
[ 132/ 219] blk.21.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
[ 133/ 219] blk.21.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 134/ 219] blk.21.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 135/ 219] blk.21.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 136/ 219] blk.21.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 137/ 219] blk.21.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 138/ 219] blk.22.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 139/ 219] blk.22.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_s - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 140/ 219] blk.22.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
[ 141/ 219] blk.22.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
[ 142/ 219] blk.22.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 143/ 219] blk.22.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 144/ 219] blk.22.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 145/ 219] blk.22.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 146/ 219] blk.22.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 147/ 219] blk.23.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 148/ 219] blk.23.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_s - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 149/ 219] blk.23.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
[ 150/ 219] blk.23.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
[ 151/ 219] blk.23.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 152/ 219] blk.23.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 153/ 219] blk.23.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 154/ 219] blk.23.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 155/ 219] blk.23.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 156/ 219] blk.3.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 157/ 219] blk.3.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_s - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 158/ 219] blk.3.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
[ 159/ 219] blk.3.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
[ 160/ 219] blk.3.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 161/ 219] blk.3.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 162/ 219] blk.3.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 163/ 219] blk.3.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 164/ 219] blk.3.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 165/ 219] blk.4.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 166/ 219] blk.4.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_s - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 167/ 219] blk.4.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
[ 168/ 219] blk.4.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
[ 169/ 219] blk.4.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 170/ 219] blk.4.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 171/ 219] blk.4.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 172/ 219] blk.4.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 173/ 219] blk.4.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 174/ 219] blk.5.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 175/ 219] blk.5.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_s - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 176/ 219] blk.5.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
[ 177/ 219] blk.5.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
[ 178/ 219] blk.5.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 179/ 219] blk.5.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 180/ 219] blk.5.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 181/ 219] blk.5.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 182/ 219] blk.5.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 183/ 219] blk.6.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 184/ 219] blk.6.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_s - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 185/ 219] blk.6.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
[ 186/ 219] blk.6.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
[ 187/ 219] blk.6.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 188/ 219] blk.6.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 189/ 219] blk.6.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 190/ 219] blk.6.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 191/ 219] blk.6.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 192/ 219] blk.7.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 193/ 219] blk.7.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_s - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 194/ 219] blk.7.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
[ 195/ 219] blk.7.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
[ 196/ 219] blk.7.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 197/ 219] blk.7.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 198/ 219] blk.7.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 199/ 219] blk.7.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 200/ 219] blk.7.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 201/ 219] blk.8.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 202/ 219] blk.8.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_s - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 203/ 219] blk.8.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
[ 204/ 219] blk.8.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
[ 205/ 219] blk.8.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 206/ 219] blk.8.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 207/ 219] blk.8.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 208/ 219] blk.8.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 209/ 219] blk.8.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 210/ 219] blk.9.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 211/ 219] blk.9.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_s - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 212/ 219] blk.9.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
[ 213/ 219] blk.9.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
[ 214/ 219] blk.9.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 215/ 219] blk.9.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 216/ 219] blk.9.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 217/ 219] blk.9.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 218/ 219] blk.9.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 219/ 219] output_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
llama_model_quantize_internal: model size = 4298.38 MB
llama_model_quantize_internal: quant size = 1742.80 MB
llama_model_quantize_internal: WARNING: 24 of 169 tensor(s) required fallback quantization
|
337 |
+
|
338 |
+
main: quantize time = 21645.04 ms
|
339 |
+
main: total time = 21645.04 ms
|
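A note on the recurring fallback lines in these logs: every blk.*.ffn_down tensor has row length 5440 (llama.feed_forward_length), which is not a multiple of the 256-weight super-block that i-quants such as iq3_s and iq3_xxs pack into, so the quantizer falls back to iq4_nl, whose 32-weight blocks do divide 5440 evenly. A minimal Python sketch of that rule as it shows up in the logs (pick_quant is a hypothetical helper, not llama.cpp code):

# Sketch of the fallback rule reported by llama_tensor_get_type above;
# block widths are assumptions taken from the log messages.
SUPER_BLOCK = 256  # block width required by iq3_s / iq3_xxs
SMALL_BLOCK = 32   # block width of the iq4_nl fallback

def pick_quant(row_len: int, wanted: str = "iq3_s") -> str:
    if row_len % SUPER_BLOCK != 0:         # 5440 % 256 == 64, not divisible
        assert row_len % SMALL_BLOCK == 0  # 5440 % 32 == 0, so iq4_nl fits
        return "iq4_nl"
    return wanted

print(pick_quant(5440))  # iq4_nl -> the ffn_down tensors
print(pick_quant(2048))  # iq3_s  -> attention and ffn_gate/ffn_up tensors

That matches the summary line: the 24 fallback tensors are exactly one ffn_down per each of the model's 24 blocks. The load_imatrix lines also show these runs were guided by the importance matrix at imatrix/oscar/imatrix.dat, computed on 44176 chunks of the multilingual data under imatrix/oscar/.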
IQ3_XS_log.txt
ADDED
@@ -0,0 +1,339 @@
main: build = 3906 (7eee341b)
main: built with Apple clang version 15.0.0 (clang-1500.3.9.4) for arm64-apple-darwin23.6.0
main: quantizing 'salamandra-2b_bf16.gguf' to './salamandra-2b_IQ3_XS.gguf' as IQ3_XS
llama_model_loader: loaded meta data with 29 key-value pairs and 219 tensors from salamandra-2b_bf16.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv 0: general.architecture str = llama
llama_model_loader: - kv 1: general.type str = model
llama_model_loader: - kv 2: general.size_label str = 2.3B
llama_model_loader: - kv 3: general.license str = apache-2.0
llama_model_loader: - kv 4: general.tags arr[str,1] = ["text-generation"]
llama_model_loader: - kv 5: general.languages arr[str,36] = ["bg", "ca", "code", "cs", "cy", "da"...
llama_model_loader: - kv 6: llama.block_count u32 = 24
llama_model_loader: - kv 7: llama.context_length u32 = 8192
llama_model_loader: - kv 8: llama.embedding_length u32 = 2048
llama_model_loader: - kv 9: llama.feed_forward_length u32 = 5440
llama_model_loader: - kv 10: llama.attention.head_count u32 = 16
llama_model_loader: - kv 11: llama.attention.head_count_kv u32 = 16
llama_model_loader: - kv 12: llama.rope.freq_base f32 = 10000.000000
llama_model_loader: - kv 13: llama.attention.layer_norm_rms_epsilon f32 = 0.000010
llama_model_loader: - kv 14: general.file_type u32 = 32
llama_model_loader: - kv 15: llama.vocab_size u32 = 256000
llama_model_loader: - kv 16: llama.rope.dimension_count u32 = 128
llama_model_loader: - kv 17: tokenizer.ggml.add_space_prefix bool = true
llama_model_loader: - kv 18: tokenizer.ggml.model str = llama
llama_model_loader: - kv 19: tokenizer.ggml.pre str = default
llama_model_loader: - kv 20: tokenizer.ggml.tokens arr[str,256000] = ["<unk>", "<s>", "</s>", "<pad>", "<|...
llama_model_loader: - kv 21: tokenizer.ggml.scores arr[f32,256000] = [-1000.000000, -1000.000000, -1000.00...
llama_model_loader: - kv 22: tokenizer.ggml.token_type arr[i32,256000] = [3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...
llama_model_loader: - kv 23: tokenizer.ggml.bos_token_id u32 = 1
llama_model_loader: - kv 24: tokenizer.ggml.eos_token_id u32 = 2
llama_model_loader: - kv 25: tokenizer.ggml.unknown_token_id u32 = 0
llama_model_loader: - kv 26: tokenizer.ggml.add_bos_token bool = true
llama_model_loader: - kv 27: tokenizer.ggml.add_eos_token bool = false
llama_model_loader: - kv 28: general.quantization_version u32 = 2
llama_model_loader: - type f32: 49 tensors
llama_model_loader: - type bf16: 170 tensors
================================ Have weights data with 168 entries
[ 1/ 219] output.weight - [ 2048, 256000, 1, 1], type = bf16, size = 1000.000 MB
[ 2/ 219] token_embd.weight - [ 2048, 256000, 1, 1], type = bf16,
====== llama_model_quantize_internal: did not find weights for token_embd.weight
converting to iq3_s .. load_imatrix: imatrix dataset='./imatrix/oscar/imatrix-dataset.txt'
load_imatrix: loaded 168 importance matrix entries from imatrix/oscar/imatrix.dat computed on 44176 chunks
prepare_imatrix: have 168 importance matrix entries
size = 1000.00 MiB -> 214.84 MiB
[ 3/ 219] blk.0.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 4/ 219] blk.0.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_s - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 5/ 219] blk.0.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
[ 6/ 219] blk.0.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
[ 7/ 219] blk.0.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 8/ 219] blk.0.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
[ 9/ 219] blk.0.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 10/ 219] blk.0.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
[ 11/ 219] blk.0.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 12/ 219] blk.1.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 13/ 219] blk.1.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_s - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 14/ 219] blk.1.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
[ 15/ 219] blk.1.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
[ 16/ 219] blk.1.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 17/ 219] blk.1.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
[ 18/ 219] blk.1.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 19/ 219] blk.1.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
[ 20/ 219] blk.1.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 21/ 219] blk.10.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 22/ 219] blk.10.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_s - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 23/ 219] blk.10.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
[ 24/ 219] blk.10.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
[ 25/ 219] blk.10.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 26/ 219] blk.10.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
[ 27/ 219] blk.10.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 28/ 219] blk.10.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
[ 29/ 219] blk.10.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 30/ 219] blk.11.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 31/ 219] blk.11.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_s - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 32/ 219] blk.11.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
[ 33/ 219] blk.11.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
[ 34/ 219] blk.11.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 35/ 219] blk.11.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
[ 36/ 219] blk.11.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 37/ 219] blk.11.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
[ 38/ 219] blk.11.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 39/ 219] blk.12.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 40/ 219] blk.12.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_s - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 41/ 219] blk.12.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
[ 42/ 219] blk.12.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
[ 43/ 219] blk.12.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 44/ 219] blk.12.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
[ 45/ 219] blk.12.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 46/ 219] blk.12.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
[ 47/ 219] blk.12.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 48/ 219] blk.13.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 49/ 219] blk.13.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_s - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 50/ 219] blk.13.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
[ 51/ 219] blk.13.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
[ 52/ 219] blk.13.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 53/ 219] blk.13.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
[ 54/ 219] blk.13.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 55/ 219] blk.13.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
[ 56/ 219] blk.13.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 57/ 219] blk.14.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 58/ 219] blk.14.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_s - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 59/ 219] blk.14.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
[ 60/ 219] blk.14.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
[ 61/ 219] blk.14.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 62/ 219] blk.14.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
[ 63/ 219] blk.14.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 64/ 219] blk.14.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
[ 65/ 219] blk.14.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 66/ 219] blk.15.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 67/ 219] blk.15.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_s - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 68/ 219] blk.15.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
[ 69/ 219] blk.15.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
[ 70/ 219] blk.15.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 71/ 219] blk.15.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
[ 72/ 219] blk.15.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 73/ 219] blk.15.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
[ 74/ 219] blk.15.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 75/ 219] blk.16.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 76/ 219] blk.16.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_s - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 77/ 219] blk.16.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
[ 78/ 219] blk.16.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
[ 79/ 219] blk.16.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 80/ 219] blk.16.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
[ 81/ 219] blk.16.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 82/ 219] blk.16.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
[ 83/ 219] blk.16.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 84/ 219] blk.17.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 85/ 219] blk.17.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_s - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 86/ 219] blk.17.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
[ 87/ 219] blk.17.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
[ 88/ 219] blk.17.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 89/ 219] blk.17.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
[ 90/ 219] blk.17.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 91/ 219] blk.17.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
[ 92/ 219] blk.17.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 93/ 219] blk.18.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 94/ 219] blk.18.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_s - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 95/ 219] blk.18.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
[ 96/ 219] blk.18.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
[ 97/ 219] blk.18.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 98/ 219] blk.18.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
[ 99/ 219] blk.18.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 100/ 219] blk.18.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
[ 101/ 219] blk.18.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 102/ 219] blk.19.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 103/ 219] blk.19.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_s - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 104/ 219] blk.19.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
[ 105/ 219] blk.19.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
[ 106/ 219] blk.19.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 107/ 219] blk.19.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
[ 108/ 219] blk.19.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 109/ 219] blk.19.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
[ 110/ 219] blk.19.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 111/ 219] blk.2.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 112/ 219] blk.2.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_s - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 113/ 219] blk.2.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
[ 114/ 219] blk.2.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
[ 115/ 219] blk.2.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 116/ 219] blk.2.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
[ 117/ 219] blk.2.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 118/ 219] blk.2.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
[ 119/ 219] blk.2.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 120/ 219] blk.20.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 121/ 219] blk.20.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_s - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 122/ 219] blk.20.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
[ 123/ 219] blk.20.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
[ 124/ 219] blk.20.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 125/ 219] blk.20.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
[ 126/ 219] blk.20.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 127/ 219] blk.20.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
[ 128/ 219] blk.20.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 129/ 219] blk.21.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 130/ 219] blk.21.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_s - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 131/ 219] blk.21.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
[ 132/ 219] blk.21.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
[ 133/ 219] blk.21.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 134/ 219] blk.21.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
[ 135/ 219] blk.21.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 136/ 219] blk.21.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
[ 137/ 219] blk.21.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 138/ 219] blk.22.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 139/ 219] blk.22.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_s - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 140/ 219] blk.22.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
[ 141/ 219] blk.22.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
[ 142/ 219] blk.22.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 143/ 219] blk.22.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
[ 144/ 219] blk.22.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 145/ 219] blk.22.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
[ 146/ 219] blk.22.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 147/ 219] blk.23.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 148/ 219] blk.23.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_s - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 149/ 219] blk.23.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
[ 150/ 219] blk.23.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
[ 151/ 219] blk.23.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 152/ 219] blk.23.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
[ 153/ 219] blk.23.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 154/ 219] blk.23.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
[ 155/ 219] blk.23.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 156/ 219] blk.3.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 157/ 219] blk.3.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_s - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 158/ 219] blk.3.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
[ 159/ 219] blk.3.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
[ 160/ 219] blk.3.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 161/ 219] blk.3.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
[ 162/ 219] blk.3.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 163/ 219] blk.3.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
[ 164/ 219] blk.3.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 165/ 219] blk.4.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 166/ 219] blk.4.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_s - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 167/ 219] blk.4.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
[ 168/ 219] blk.4.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
[ 169/ 219] blk.4.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 170/ 219] blk.4.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
[ 171/ 219] blk.4.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 172/ 219] blk.4.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
[ 173/ 219] blk.4.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 174/ 219] blk.5.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 175/ 219] blk.5.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_s - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 176/ 219] blk.5.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
[ 177/ 219] blk.5.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
[ 178/ 219] blk.5.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 179/ 219] blk.5.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
[ 180/ 219] blk.5.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 181/ 219] blk.5.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
[ 182/ 219] blk.5.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 183/ 219] blk.6.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 184/ 219] blk.6.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_s - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 185/ 219] blk.6.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
[ 186/ 219] blk.6.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
[ 187/ 219] blk.6.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 188/ 219] blk.6.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
[ 189/ 219] blk.6.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 190/ 219] blk.6.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
[ 191/ 219] blk.6.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 192/ 219] blk.7.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 193/ 219] blk.7.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_s - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 194/ 219] blk.7.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
[ 195/ 219] blk.7.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
[ 196/ 219] blk.7.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 197/ 219] blk.7.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
[ 198/ 219] blk.7.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 199/ 219] blk.7.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
[ 200/ 219] blk.7.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 201/ 219] blk.8.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 202/ 219] blk.8.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_s - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 203/ 219] blk.8.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
[ 204/ 219] blk.8.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
[ 205/ 219] blk.8.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 206/ 219] blk.8.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
[ 207/ 219] blk.8.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 208/ 219] blk.8.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
[ 209/ 219] blk.8.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 210/ 219] blk.9.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 211/ 219] blk.9.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_s - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 212/ 219] blk.9.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
[ 213/ 219] blk.9.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_s .. size = 21.25 MiB -> 4.57 MiB
[ 214/ 219] blk.9.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 215/ 219] blk.9.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
[ 216/ 219] blk.9.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 217/ 219] blk.9.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
[ 218/ 219] blk.9.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 219/ 219] output_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
llama_model_quantize_internal: model size = 4298.38 MB
llama_model_quantize_internal: quant size = 1715.88 MB
llama_model_quantize_internal: WARNING: 24 of 169 tensor(s) required fallback quantization
main: quantize time = 25070.00 ms
main: total time = 25070.00 ms
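The sizes these logs report can be sanity-checked by converting MiB back to bits per weight, which recovers each quant type's nominal width. A quick check, assuming llama.cpp's MiB figures mean 2**20 bytes:

# Recover bits-per-weight (bpw) from the tensor sizes reported above.
def bpw(mib: float, n_weights: int) -> float:
    return mib * 2**20 * 8 / n_weights

ffn = 2048 * 5440       # ffn_gate / ffn_up elements
attn = 2048 * 2048      # attention projection elements
print(bpw(21.25, ffn))  # 16.0  -> the bf16 source tensors
print(bpw(4.57, ffn))   # ~3.44 -> iq3_s
print(bpw(4.07, ffn))   # ~3.06 -> iq3_xxs
print(bpw(5.98, ffn))   # ~4.50 -> the iq4_nl fallback
print(bpw(1.53, attn))  # ~3.06 -> iq3_xxs attn_k / attn_q

At the file level, IQ3_XS lands at 1715.88 MB from the 4298.38 MB bf16 model, roughly 40% of the original and slightly below IQ3_S's 1742.80 MB, because IQ3_XS moves attn_k, attn_q and most ffn_gate/ffn_up tensors down from iq3_s to iq3_xxs.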
IQ3_XXS_log.txt
ADDED
@@ -0,0 +1,339 @@
main: build = 3906 (7eee341b)
main: built with Apple clang version 15.0.0 (clang-1500.3.9.4) for arm64-apple-darwin23.6.0
main: quantizing 'salamandra-2b_bf16.gguf' to './salamandra-2b_IQ3_XXS.gguf' as IQ3_XXS
llama_model_loader: loaded meta data with 29 key-value pairs and 219 tensors from salamandra-2b_bf16.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv 0: general.architecture str = llama
llama_model_loader: - kv 1: general.type str = model
llama_model_loader: - kv 2: general.size_label str = 2.3B
llama_model_loader: - kv 3: general.license str = apache-2.0
llama_model_loader: - kv 4: general.tags arr[str,1] = ["text-generation"]
llama_model_loader: - kv 5: general.languages arr[str,36] = ["bg", "ca", "code", "cs", "cy", "da"...
llama_model_loader: - kv 6: llama.block_count u32 = 24
llama_model_loader: - kv 7: llama.context_length u32 = 8192
llama_model_loader: - kv 8: llama.embedding_length u32 = 2048
llama_model_loader: - kv 9: llama.feed_forward_length u32 = 5440
llama_model_loader: - kv 10: llama.attention.head_count u32 = 16
llama_model_loader: - kv 11: llama.attention.head_count_kv u32 = 16
llama_model_loader: - kv 12: llama.rope.freq_base f32 = 10000.000000
llama_model_loader: - kv 13: llama.attention.layer_norm_rms_epsilon f32 = 0.000010
llama_model_loader: - kv 14: general.file_type u32 = 32
llama_model_loader: - kv 15: llama.vocab_size u32 = 256000
llama_model_loader: - kv 16: llama.rope.dimension_count u32 = 128
llama_model_loader: - kv 17: tokenizer.ggml.add_space_prefix bool = true
llama_model_loader: - kv 18: tokenizer.ggml.model str = llama
llama_model_loader: - kv 19: tokenizer.ggml.pre str = default
llama_model_loader: - kv 20: tokenizer.ggml.tokens arr[str,256000] = ["<unk>", "<s>", "</s>", "<pad>", "<|...
llama_model_loader: - kv 21: tokenizer.ggml.scores arr[f32,256000] = [-1000.000000, -1000.000000, -1000.00...
llama_model_loader: - kv 22: tokenizer.ggml.token_type arr[i32,256000] = [3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...
llama_model_loader: - kv 23: tokenizer.ggml.bos_token_id u32 = 1
llama_model_loader: - kv 24: tokenizer.ggml.eos_token_id u32 = 2
llama_model_loader: - kv 25: tokenizer.ggml.unknown_token_id u32 = 0
llama_model_loader: - kv 26: tokenizer.ggml.add_bos_token bool = true
llama_model_loader: - kv 27: tokenizer.ggml.add_eos_token bool = false
llama_model_loader: - kv 28: general.quantization_version u32 = 2
llama_model_loader: - type f32: 49 tensors
llama_model_loader: - type bf16: 170 tensors
================================ Have weights data with 168 entries
[ 1/ 219] output.weight - [ 2048, 256000, 1, 1], type = bf16, size = 1000.000 MB
[ 2/ 219] token_embd.weight - [ 2048, 256000, 1, 1], type = bf16,
====== llama_model_quantize_internal: did not find weights for token_embd.weight
converting to iq3_s .. load_imatrix: imatrix dataset='./imatrix/oscar/imatrix-dataset.txt'
load_imatrix: loaded 168 importance matrix entries from imatrix/oscar/imatrix.dat computed on 44176 chunks
prepare_imatrix: have 168 importance matrix entries
size = 1000.00 MiB -> 214.84 MiB
[ 3/ 219] blk.0.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 4/ 219] blk.0.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_xxs - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 5/ 219] blk.0.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
[ 6/ 219] blk.0.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
[ 7/ 219] blk.0.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 8/ 219] blk.0.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
[ 9/ 219] blk.0.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 10/ 219] blk.0.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
[ 11/ 219] blk.0.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
[ 12/ 219] blk.1.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 13/ 219] blk.1.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_xxs - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 14/ 219] blk.1.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
[ 15/ 219] blk.1.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
[ 16/ 219] blk.1.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 17/ 219] blk.1.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
[ 18/ 219] blk.1.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 19/ 219] blk.1.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
[ 20/ 219] blk.1.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
[ 21/ 219] blk.10.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 22/ 219] blk.10.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_xxs - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 23/ 219] blk.10.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
[ 24/ 219] blk.10.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
[ 25/ 219] blk.10.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 26/ 219] blk.10.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
[ 27/ 219] blk.10.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 28/ 219] blk.10.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
[ 29/ 219] blk.10.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
[ 30/ 219] blk.11.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 31/ 219] blk.11.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_xxs - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 32/ 219] blk.11.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
[ 33/ 219] blk.11.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
[ 34/ 219] blk.11.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 35/ 219] blk.11.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
[ 36/ 219] blk.11.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 37/ 219] blk.11.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
[ 38/ 219] blk.11.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
[ 39/ 219] blk.12.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 40/ 219] blk.12.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_xxs - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 41/ 219] blk.12.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
[ 42/ 219] blk.12.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
[ 43/ 219] blk.12.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 44/ 219] blk.12.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
[ 45/ 219] blk.12.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 46/ 219] blk.12.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
[ 47/ 219] blk.12.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
[ 48/ 219] blk.13.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 49/ 219] blk.13.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_xxs - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 50/ 219] blk.13.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
[ 51/ 219] blk.13.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
[ 52/ 219] blk.13.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 53/ 219] blk.13.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
[ 54/ 219] blk.13.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 55/ 219] blk.13.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
[ 56/ 219] blk.13.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
[ 57/ 219] blk.14.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 58/ 219] blk.14.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_xxs - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 59/ 219] blk.14.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
[ 60/ 219] blk.14.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
[ 61/ 219] blk.14.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 62/ 219] blk.14.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
[ 63/ 219] blk.14.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 64/ 219] blk.14.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
[ 65/ 219] blk.14.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
[ 66/ 219] blk.15.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 67/ 219] blk.15.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_xxs - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 68/ 219] blk.15.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
[ 69/ 219] blk.15.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
[ 70/ 219] blk.15.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 71/ 219] blk.15.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
[ 72/ 219] blk.15.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 73/ 219] blk.15.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
[ 74/ 219] blk.15.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
[ 75/ 219] blk.16.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 76/ 219] blk.16.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_xxs - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 77/ 219] blk.16.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
[ 78/ 219] blk.16.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
[ 79/ 219] blk.16.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 80/ 219] blk.16.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
[ 81/ 219] blk.16.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 82/ 219] blk.16.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
[ 83/ 219] blk.16.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
[ 84/ 219] blk.17.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 85/ 219] blk.17.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_xxs - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 86/ 219] blk.17.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
[ 87/ 219] blk.17.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
[ 88/ 219] blk.17.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 89/ 219] blk.17.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
[ 90/ 219] blk.17.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 91/ 219] blk.17.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
[ 92/ 219] blk.17.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
[ 93/ 219] blk.18.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 94/ 219] blk.18.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_xxs - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 95/ 219] blk.18.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
[ 96/ 219] blk.18.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
[ 97/ 219] blk.18.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 98/ 219] blk.18.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
[ 99/ 219] blk.18.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 100/ 219] blk.18.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
[ 101/ 219] blk.18.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
[ 102/ 219] blk.19.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 103/ 219] blk.19.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_xxs - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 104/ 219] blk.19.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
[ 105/ 219] blk.19.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
[ 106/ 219] blk.19.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 107/ 219] blk.19.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
[ 108/ 219] blk.19.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
[ 109/ 219] blk.19.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
[ 110/ 219] blk.19.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
[ 111/ 219] blk.2.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 112/ 219] blk.2.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_xxs - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 113/ 219] blk.2.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
[ 114/ 219] blk.2.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
[ 115/ 219] blk.2.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 116/ 219] blk.2.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
[ 117/ 219] blk.2.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
|
199 |
+
[ 118/ 219] blk.2.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
|
200 |
+
[ 119/ 219] blk.2.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
|
201 |
+
[ 120/ 219] blk.20.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
202 |
+
[ 121/ 219] blk.20.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
|
203 |
+
|
204 |
+
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_xxs - using fallback quantization iq4_nl
|
205 |
+
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
|
206 |
+
[ 122/ 219] blk.20.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
|
207 |
+
[ 123/ 219] blk.20.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
|
208 |
+
[ 124/ 219] blk.20.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
209 |
+
[ 125/ 219] blk.20.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
|
210 |
+
[ 126/ 219] blk.20.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
|
211 |
+
[ 127/ 219] blk.20.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
|
212 |
+
[ 128/ 219] blk.20.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
|
213 |
+
[ 129/ 219] blk.21.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
214 |
+
[ 130/ 219] blk.21.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
|
215 |
+
|
216 |
+
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_xxs - using fallback quantization iq4_nl
|
217 |
+
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
|
218 |
+
[ 131/ 219] blk.21.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
|
219 |
+
[ 132/ 219] blk.21.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
|
220 |
+
[ 133/ 219] blk.21.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
221 |
+
[ 134/ 219] blk.21.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
|
222 |
+
[ 135/ 219] blk.21.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
|
223 |
+
[ 136/ 219] blk.21.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
|
224 |
+
[ 137/ 219] blk.21.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
|
225 |
+
[ 138/ 219] blk.22.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
226 |
+
[ 139/ 219] blk.22.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
|
227 |
+
|
228 |
+
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_xxs - using fallback quantization iq4_nl
|
229 |
+
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
|
230 |
+
[ 140/ 219] blk.22.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
|
231 |
+
[ 141/ 219] blk.22.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
|
232 |
+
[ 142/ 219] blk.22.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
233 |
+
[ 143/ 219] blk.22.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
|
234 |
+
[ 144/ 219] blk.22.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
|
235 |
+
[ 145/ 219] blk.22.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
|
236 |
+
[ 146/ 219] blk.22.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
|
237 |
+
[ 147/ 219] blk.23.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
238 |
+
[ 148/ 219] blk.23.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
|
239 |
+
|
240 |
+
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_xxs - using fallback quantization iq4_nl
|
241 |
+
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
|
242 |
+
[ 149/ 219] blk.23.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
|
243 |
+
[ 150/ 219] blk.23.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
|
244 |
+
[ 151/ 219] blk.23.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
245 |
+
[ 152/ 219] blk.23.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
|
246 |
+
[ 153/ 219] blk.23.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
|
247 |
+
[ 154/ 219] blk.23.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
|
248 |
+
[ 155/ 219] blk.23.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
|
249 |
+
[ 156/ 219] blk.3.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
250 |
+
[ 157/ 219] blk.3.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
|
251 |
+
|
252 |
+
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_xxs - using fallback quantization iq4_nl
|
253 |
+
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
|
254 |
+
[ 158/ 219] blk.3.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
|
255 |
+
[ 159/ 219] blk.3.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
|
256 |
+
[ 160/ 219] blk.3.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
257 |
+
[ 161/ 219] blk.3.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
|
258 |
+
[ 162/ 219] blk.3.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
|
259 |
+
[ 163/ 219] blk.3.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
|
260 |
+
[ 164/ 219] blk.3.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
|
261 |
+
[ 165/ 219] blk.4.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
262 |
+
[ 166/ 219] blk.4.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
|
263 |
+
|
264 |
+
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_xxs - using fallback quantization iq4_nl
|
265 |
+
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
|
266 |
+
[ 167/ 219] blk.4.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
|
267 |
+
[ 168/ 219] blk.4.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
|
268 |
+
[ 169/ 219] blk.4.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
269 |
+
[ 170/ 219] blk.4.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
|
270 |
+
[ 171/ 219] blk.4.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
|
271 |
+
[ 172/ 219] blk.4.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
|
272 |
+
[ 173/ 219] blk.4.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
|
273 |
+
[ 174/ 219] blk.5.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
274 |
+
[ 175/ 219] blk.5.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
|
275 |
+
|
276 |
+
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_xxs - using fallback quantization iq4_nl
|
277 |
+
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
|
278 |
+
[ 176/ 219] blk.5.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
|
279 |
+
[ 177/ 219] blk.5.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
|
280 |
+
[ 178/ 219] blk.5.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
281 |
+
[ 179/ 219] blk.5.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
|
282 |
+
[ 180/ 219] blk.5.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
|
283 |
+
[ 181/ 219] blk.5.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
|
284 |
+
[ 182/ 219] blk.5.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
|
285 |
+
[ 183/ 219] blk.6.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
286 |
+
[ 184/ 219] blk.6.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
|
287 |
+
|
288 |
+
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_xxs - using fallback quantization iq4_nl
|
289 |
+
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
|
290 |
+
[ 185/ 219] blk.6.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
|
291 |
+
[ 186/ 219] blk.6.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
|
292 |
+
[ 187/ 219] blk.6.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
293 |
+
[ 188/ 219] blk.6.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
|
294 |
+
[ 189/ 219] blk.6.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
|
295 |
+
[ 190/ 219] blk.6.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
|
296 |
+
[ 191/ 219] blk.6.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
|
297 |
+
[ 192/ 219] blk.7.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
298 |
+
[ 193/ 219] blk.7.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
|
299 |
+
|
300 |
+
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_xxs - using fallback quantization iq4_nl
|
301 |
+
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
|
302 |
+
[ 194/ 219] blk.7.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
|
303 |
+
[ 195/ 219] blk.7.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
|
304 |
+
[ 196/ 219] blk.7.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
305 |
+
[ 197/ 219] blk.7.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
|
306 |
+
[ 198/ 219] blk.7.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
|
307 |
+
[ 199/ 219] blk.7.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
|
308 |
+
[ 200/ 219] blk.7.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
|
309 |
+
[ 201/ 219] blk.8.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
310 |
+
[ 202/ 219] blk.8.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
|
311 |
+
|
312 |
+
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_xxs - using fallback quantization iq4_nl
|
313 |
+
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
|
314 |
+
[ 203/ 219] blk.8.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
|
315 |
+
[ 204/ 219] blk.8.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
|
316 |
+
[ 205/ 219] blk.8.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
317 |
+
[ 206/ 219] blk.8.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
|
318 |
+
[ 207/ 219] blk.8.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
|
319 |
+
[ 208/ 219] blk.8.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
|
320 |
+
[ 209/ 219] blk.8.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
|
321 |
+
[ 210/ 219] blk.9.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
322 |
+
[ 211/ 219] blk.9.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
|
323 |
+
|
324 |
+
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_xxs - using fallback quantization iq4_nl
|
325 |
+
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
|
326 |
+
[ 212/ 219] blk.9.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
|
327 |
+
[ 213/ 219] blk.9.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
|
328 |
+
[ 214/ 219] blk.9.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
329 |
+
[ 215/ 219] blk.9.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
|
330 |
+
[ 216/ 219] blk.9.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
|
331 |
+
[ 217/ 219] blk.9.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
|
332 |
+
[ 218/ 219] blk.9.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
|
333 |
+
[ 219/ 219] output_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
334 |
+
llama_model_quantize_internal: model size = 4298.38 MB
|
335 |
+
llama_model_quantize_internal: quant size = 1693.40 MB
|
336 |
+
llama_model_quantize_internal: WARNING: 24 of 169 tensor(s) required fallback quantization
|
337 |
+
|
338 |
+
main: quantize time = 28893.81 ms
|
339 |
+
main: total time = 28893.81 ms
|
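A note on the log above: the repeated fallback is purely a block-size constraint. iq3_xxs packs weights into 256-wide super-blocks, so the 5440-wide rows of each ffn_down tensor (5440 / 256 = 21.25) cannot use it and are re-routed to iq4_nl, whose 32-wide blocks divide 5440 evenly; the "24 of 169" in the summary lines up with the 24 per-layer ffn_down tensors. The Python sketch below reproduces the divisibility rule and the per-tensor sizes, assuming the usual GGUF bits-per-weight figures (iq3_xxs at 3.0625 bpw, iq4_nl at 4.5 bpw); the helper names are illustrative, not llama.cpp API.

```python
# Illustrative sketch (not llama.cpp source): the divisibility rule behind the
# "using fallback quantization" warnings, plus the per-tensor MiB figures.
# Assumed packing: iq3_xxs = 256-wide super-blocks at 3.0625 bits/weight,
# iq4_nl = 32-wide blocks at 4.5 bits/weight.

BPW = {"bf16": 16.0, "iq3_xxs": 3.0625, "iq4_nl": 4.5}
BLOCK = {"iq3_xxs": 256, "iq4_nl": 32}

def pick_type(row_width: int, wanted: str, fallback: str = "iq4_nl") -> str:
    """Fall back when the tensor's row width doesn't fill whole blocks."""
    return wanted if row_width % BLOCK[wanted] == 0 else fallback

def mib(rows: int, cols: int, qtype: str) -> float:
    """Tensor size in MiB at the given quantization type."""
    return rows * cols * BPW[qtype] / 8 / 2**20

# ffn_down is [5440, 2048]; 5440 % 256 == 64, hence the fallback to iq4_nl.
qtype = pick_type(5440, "iq3_xxs")
print(qtype, f"{mib(5440, 2048, 'bf16'):.2f} MiB -> {mib(5440, 2048, qtype):.2f} MiB")
# iq4_nl 21.25 MiB -> 5.98 MiB, matching the log.

# ffn_gate/ffn_up rows are 2048 wide (2048 % 256 == 0), so iq3_xxs sticks:
print(f"{mib(2048, 5440, 'iq3_xxs'):.2f} MiB")  # 4.07 MiB
```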
IQ4_NL_log.txt
ADDED
@@ -0,0 +1,266 @@
main: build = 3906 (7eee341b)
main: built with Apple clang version 15.0.0 (clang-1500.3.9.4) for arm64-apple-darwin23.6.0
main: quantizing 'salamandra-2b_bf16.gguf' to './salamandra-2b_IQ4_NL.gguf' as IQ4_NL
llama_model_loader: loaded meta data with 29 key-value pairs and 219 tensors from salamandra-2b_bf16.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv 0: general.architecture str = llama
llama_model_loader: - kv 1: general.type str = model
llama_model_loader: - kv 2: general.size_label str = 2.3B
llama_model_loader: - kv 3: general.license str = apache-2.0
llama_model_loader: - kv 4: general.tags arr[str,1] = ["text-generation"]
llama_model_loader: - kv 5: general.languages arr[str,36] = ["bg", "ca", "code", "cs", "cy", "da"...
llama_model_loader: - kv 6: llama.block_count u32 = 24
llama_model_loader: - kv 7: llama.context_length u32 = 8192
llama_model_loader: - kv 8: llama.embedding_length u32 = 2048
llama_model_loader: - kv 9: llama.feed_forward_length u32 = 5440
llama_model_loader: - kv 10: llama.attention.head_count u32 = 16
llama_model_loader: - kv 11: llama.attention.head_count_kv u32 = 16
llama_model_loader: - kv 12: llama.rope.freq_base f32 = 10000.000000
llama_model_loader: - kv 13: llama.attention.layer_norm_rms_epsilon f32 = 0.000010
llama_model_loader: - kv 14: general.file_type u32 = 32
llama_model_loader: - kv 15: llama.vocab_size u32 = 256000
llama_model_loader: - kv 16: llama.rope.dimension_count u32 = 128
llama_model_loader: - kv 17: tokenizer.ggml.add_space_prefix bool = true
llama_model_loader: - kv 18: tokenizer.ggml.model str = llama
llama_model_loader: - kv 19: tokenizer.ggml.pre str = default
llama_model_loader: - kv 20: tokenizer.ggml.tokens arr[str,256000] = ["<unk>", "<s>", "</s>", "<pad>", "<|...
llama_model_loader: - kv 21: tokenizer.ggml.scores arr[f32,256000] = [-1000.000000, -1000.000000, -1000.00...
llama_model_loader: - kv 22: tokenizer.ggml.token_type arr[i32,256000] = [3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...
llama_model_loader: - kv 23: tokenizer.ggml.bos_token_id u32 = 1
llama_model_loader: - kv 24: tokenizer.ggml.eos_token_id u32 = 2
llama_model_loader: - kv 25: tokenizer.ggml.unknown_token_id u32 = 0
llama_model_loader: - kv 26: tokenizer.ggml.add_bos_token bool = true
llama_model_loader: - kv 27: tokenizer.ggml.add_eos_token bool = false
llama_model_loader: - kv 28: general.quantization_version u32 = 2
llama_model_loader: - type f32: 49 tensors
llama_model_loader: - type bf16: 170 tensors
================================ Have weights data with 168 entries
[ 1/ 219] output.weight - [ 2048, 256000, 1, 1], type = bf16, size = 1000.000 MB
[ 2/ 219] token_embd.weight - [ 2048, 256000, 1, 1], type = bf16,
====== llama_model_quantize_internal: did not find weights for token_embd.weight
converting to iq4_nl .. load_imatrix: imatrix dataset='./imatrix/oscar/imatrix-dataset.txt'
load_imatrix: loaded 168 importance matrix entries from imatrix/oscar/imatrix.dat computed on 44176 chunks
prepare_imatrix: have 168 importance matrix entries
size = 1000.00 MiB -> 281.25 MiB
[ 3/ 219] blk.0.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 4/ 219] blk.0.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 5/ 219] blk.0.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 6/ 219] blk.0.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 7/ 219] blk.0.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 8/ 219] blk.0.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 9/ 219] blk.0.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 10/ 219] blk.0.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 11/ 219] blk.0.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 12/ 219] blk.1.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 13/ 219] blk.1.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 14/ 219] blk.1.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 15/ 219] blk.1.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 16/ 219] blk.1.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 17/ 219] blk.1.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 18/ 219] blk.1.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 19/ 219] blk.1.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 20/ 219] blk.1.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 21/ 219] blk.10.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 22/ 219] blk.10.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 23/ 219] blk.10.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 24/ 219] blk.10.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 25/ 219] blk.10.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 26/ 219] blk.10.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 27/ 219] blk.10.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 28/ 219] blk.10.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 29/ 219] blk.10.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 30/ 219] blk.11.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 31/ 219] blk.11.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 32/ 219] blk.11.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 33/ 219] blk.11.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 34/ 219] blk.11.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 35/ 219] blk.11.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 36/ 219] blk.11.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 37/ 219] blk.11.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 38/ 219] blk.11.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 39/ 219] blk.12.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 40/ 219] blk.12.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 41/ 219] blk.12.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 42/ 219] blk.12.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 43/ 219] blk.12.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 44/ 219] blk.12.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 45/ 219] blk.12.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 46/ 219] blk.12.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 47/ 219] blk.12.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 48/ 219] blk.13.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 49/ 219] blk.13.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 50/ 219] blk.13.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 51/ 219] blk.13.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 52/ 219] blk.13.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 53/ 219] blk.13.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 54/ 219] blk.13.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 55/ 219] blk.13.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 56/ 219] blk.13.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 57/ 219] blk.14.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 58/ 219] blk.14.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 59/ 219] blk.14.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 60/ 219] blk.14.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 61/ 219] blk.14.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 62/ 219] blk.14.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 63/ 219] blk.14.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 64/ 219] blk.14.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 65/ 219] blk.14.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 66/ 219] blk.15.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 67/ 219] blk.15.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 68/ 219] blk.15.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 69/ 219] blk.15.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 70/ 219] blk.15.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 71/ 219] blk.15.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 72/ 219] blk.15.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 73/ 219] blk.15.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 74/ 219] blk.15.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 75/ 219] blk.16.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 76/ 219] blk.16.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 77/ 219] blk.16.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 78/ 219] blk.16.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 79/ 219] blk.16.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 80/ 219] blk.16.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 81/ 219] blk.16.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 82/ 219] blk.16.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 83/ 219] blk.16.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 84/ 219] blk.17.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 85/ 219] blk.17.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 86/ 219] blk.17.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 87/ 219] blk.17.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 88/ 219] blk.17.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 89/ 219] blk.17.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 90/ 219] blk.17.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 91/ 219] blk.17.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 92/ 219] blk.17.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 93/ 219] blk.18.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 94/ 219] blk.18.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 95/ 219] blk.18.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 96/ 219] blk.18.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 97/ 219] blk.18.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 98/ 219] blk.18.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 99/ 219] blk.18.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 100/ 219] blk.18.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 101/ 219] blk.18.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 102/ 219] blk.19.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 103/ 219] blk.19.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 104/ 219] blk.19.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 105/ 219] blk.19.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 106/ 219] blk.19.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 107/ 219] blk.19.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 108/ 219] blk.19.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 109/ 219] blk.19.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 110/ 219] blk.19.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 111/ 219] blk.2.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 112/ 219] blk.2.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 113/ 219] blk.2.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 114/ 219] blk.2.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 115/ 219] blk.2.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 116/ 219] blk.2.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 117/ 219] blk.2.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 118/ 219] blk.2.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 119/ 219] blk.2.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 120/ 219] blk.20.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 121/ 219] blk.20.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 122/ 219] blk.20.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 123/ 219] blk.20.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 124/ 219] blk.20.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 125/ 219] blk.20.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 126/ 219] blk.20.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 127/ 219] blk.20.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 128/ 219] blk.20.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 129/ 219] blk.21.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 130/ 219] blk.21.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 131/ 219] blk.21.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 132/ 219] blk.21.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 133/ 219] blk.21.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 134/ 219] blk.21.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 135/ 219] blk.21.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 136/ 219] blk.21.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 137/ 219] blk.21.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 138/ 219] blk.22.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 139/ 219] blk.22.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 140/ 219] blk.22.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 141/ 219] blk.22.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 142/ 219] blk.22.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 143/ 219] blk.22.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 144/ 219] blk.22.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 145/ 219] blk.22.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 146/ 219] blk.22.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 147/ 219] blk.23.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 148/ 219] blk.23.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 149/ 219] blk.23.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 150/ 219] blk.23.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 151/ 219] blk.23.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 152/ 219] blk.23.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 153/ 219] blk.23.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 154/ 219] blk.23.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 155/ 219] blk.23.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 156/ 219] blk.3.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 157/ 219] blk.3.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 158/ 219] blk.3.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 159/ 219] blk.3.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 160/ 219] blk.3.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 161/ 219] blk.3.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 162/ 219] blk.3.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 163/ 219] blk.3.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 164/ 219] blk.3.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 165/ 219] blk.4.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 166/ 219] blk.4.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 167/ 219] blk.4.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 168/ 219] blk.4.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 169/ 219] blk.4.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 170/ 219] blk.4.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 171/ 219] blk.4.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 172/ 219] blk.4.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 173/ 219] blk.4.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 174/ 219] blk.5.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 175/ 219] blk.5.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 176/ 219] blk.5.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 177/ 219] blk.5.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 178/ 219] blk.5.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 179/ 219] blk.5.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 180/ 219] blk.5.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 181/ 219] blk.5.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 182/ 219] blk.5.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 183/ 219] blk.6.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 184/ 219] blk.6.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 185/ 219] blk.6.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 186/ 219] blk.6.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 187/ 219] blk.6.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 188/ 219] blk.6.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 189/ 219] blk.6.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 190/ 219] blk.6.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 191/ 219] blk.6.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 192/ 219] blk.7.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 193/ 219] blk.7.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 194/ 219] blk.7.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 195/ 219] blk.7.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 196/ 219] blk.7.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 197/ 219] blk.7.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 198/ 219] blk.7.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 199/ 219] blk.7.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 200/ 219] blk.7.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 201/ 219] blk.8.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 202/ 219] blk.8.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 203/ 219] blk.8.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 204/ 219] blk.8.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 205/ 219] blk.8.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 206/ 219] blk.8.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 207/ 219] blk.8.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 208/ 219] blk.8.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 209/ 219] blk.8.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 210/ 219] blk.9.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 211/ 219] blk.9.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 212/ 219] blk.9.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 213/ 219] blk.9.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 214/ 219] blk.9.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 215/ 219] blk.9.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 216/ 219] blk.9.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 217/ 219] blk.9.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 218/ 219] blk.9.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_nl .. size = 8.00 MiB -> 2.25 MiB
[ 219/ 219] output_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
llama_model_quantize_internal: model size = 4298.38 MB
llama_model_quantize_internal: quant size = 1927.95 MB

main: quantize time = 18024.91 ms
main: total time = 18024.91 ms
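Worth noting: unlike the IQ3 and IQ4_XS passes, this IQ4_NL run emits no fallback warnings, because iq4_nl's 32-wide blocks divide the 5440-wide ffn_down rows evenly (5440 % 32 == 0), so every tensor keeps the requested type and the model shrinks from 4298.38 MB to 1927.95 MB. For reference, the sketch below shows how an imatrix-guided pass like this is typically launched with the stock llama-quantize tool from a llama.cpp build; the paths are taken from the log, but the exact command used for this commit is not recorded, so treat this as an assumption.

```python
# Minimal sketch of launching an imatrix-guided quantization pass like the one
# logged above. Assumes the stock llama.cpp "llama-quantize" binary; the exact
# invocation used for this repo is not recorded in the log.
import subprocess

subprocess.run(
    [
        "./llama-quantize",
        "--imatrix", "imatrix/oscar/imatrix.dat",  # 168 entries over 44176 chunks, per the log
        "salamandra-2b_bf16.gguf",                 # bf16 source model
        "./salamandra-2b_IQ4_NL.gguf",             # quantized output
        "IQ4_NL",                                  # target quantization type
    ],
    check=True,
)
```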
IQ4_XS_log.txt
ADDED
@@ -0,0 +1,339 @@
1 |
+
main: build = 3906 (7eee341b)
|
2 |
+
main: built with Apple clang version 15.0.0 (clang-1500.3.9.4) for arm64-apple-darwin23.6.0
|
3 |
+
main: quantizing 'salamandra-2b_bf16.gguf' to './salamandra-2b_IQ4_XS.gguf' as IQ4_XS
|
4 |
+
llama_model_loader: loaded meta data with 29 key-value pairs and 219 tensors from salamandra-2b_bf16.gguf (version GGUF V3 (latest))
|
5 |
+
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
|
6 |
+
llama_model_loader: - kv 0: general.architecture str = llama
|
7 |
+
llama_model_loader: - kv 1: general.type str = model
|
8 |
+
llama_model_loader: - kv 2: general.size_label str = 2.3B
|
9 |
+
llama_model_loader: - kv 3: general.license str = apache-2.0
|
10 |
+
llama_model_loader: - kv 4: general.tags arr[str,1] = ["text-generation"]
|
11 |
+
llama_model_loader: - kv 5: general.languages arr[str,36] = ["bg", "ca", "code", "cs", "cy", "da"...
|
12 |
+
llama_model_loader: - kv 6: llama.block_count u32 = 24
|
13 |
+
llama_model_loader: - kv 7: llama.context_length u32 = 8192
|
14 |
+
llama_model_loader: - kv 8: llama.embedding_length u32 = 2048
|
15 |
+
llama_model_loader: - kv 9: llama.feed_forward_length u32 = 5440
|
16 |
+
llama_model_loader: - kv 10: llama.attention.head_count u32 = 16
|
17 |
+
llama_model_loader: - kv 11: llama.attention.head_count_kv u32 = 16
|
18 |
+
llama_model_loader: - kv 12: llama.rope.freq_base f32 = 10000.000000
|
19 |
+
llama_model_loader: - kv 13: llama.attention.layer_norm_rms_epsilon f32 = 0.000010
|
20 |
+
llama_model_loader: - kv 14: general.file_type u32 = 32
|
21 |
+
llama_model_loader: - kv 15: llama.vocab_size u32 = 256000
|
22 |
+
llama_model_loader: - kv 16: llama.rope.dimension_count u32 = 128
|
23 |
+
llama_model_loader: - kv 17: tokenizer.ggml.add_space_prefix bool = true
|
24 |
+
llama_model_loader: - kv 18: tokenizer.ggml.model str = llama
|
25 |
+
llama_model_loader: - kv 19: tokenizer.ggml.pre str = default
|
26 |
+
llama_model_loader: - kv 20: tokenizer.ggml.tokens arr[str,256000] = ["<unk>", "<s>", "</s>", "<pad>", "<|...
|
27 |
+
llama_model_loader: - kv 21: tokenizer.ggml.scores arr[f32,256000] = [-1000.000000, -1000.000000, -1000.00...
|
28 |
+
llama_model_loader: - kv 22: tokenizer.ggml.token_type arr[i32,256000] = [3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...
|
29 |
+
llama_model_loader: - kv 23: tokenizer.ggml.bos_token_id u32 = 1
|
30 |
+
llama_model_loader: - kv 24: tokenizer.ggml.eos_token_id u32 = 2
|
31 |
+
llama_model_loader: - kv 25: tokenizer.ggml.unknown_token_id u32 = 0
|
32 |
+
llama_model_loader: - kv 26: tokenizer.ggml.add_bos_token bool = true
|
33 |
+
llama_model_loader: - kv 27: tokenizer.ggml.add_eos_token bool = false
|
34 |
+
llama_model_loader: - kv 28: general.quantization_version u32 = 2
|
35 |
+
llama_model_loader: - type f32: 49 tensors
|
36 |
+
llama_model_loader: - type bf16: 170 tensors
|
37 |
+
================================ Have weights data with 168 entries
|
38 |
+
[ 1/ 219] output.weight - [ 2048, 256000, 1, 1], type = bf16, size = 1000.000 MB
|
39 |
+
[ 2/ 219] token_embd.weight - [ 2048, 256000, 1, 1], type = bf16,
|
40 |
+
====== llama_model_quantize_internal: did not find weights for token_embd.weight
|
41 |
+
converting to iq4_xs .. load_imatrix: imatrix dataset='./imatrix/oscar/imatrix-dataset.txt'
|
42 |
+
load_imatrix: loaded 168 importance matrix entries from imatrix/oscar/imatrix.dat computed on 44176 chunks
|
43 |
+
prepare_imatrix: have 168 importance matrix entries
|
44 |
+
size = 1000.00 MiB -> 265.62 MiB
|
45 |
+
[ 3/ 219] blk.0.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
46 |
+
[ 4/ 219] blk.0.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
|
47 |
+
|
48 |
+
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq4_xs - using fallback quantization iq4_nl
|
49 |
+
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
|
50 |
+
[ 5/ 219] blk.0.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_xs .. size = 21.25 MiB -> 5.64 MiB
|
51 |
+
[ 6/ 219] blk.0.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_xs .. size = 21.25 MiB -> 5.64 MiB
|
52 |
+
[ 7/ 219] blk.0.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
53 |
+
[ 8/ 219] blk.0.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
|
54 |
+
[ 9/ 219] blk.0.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
|
55 |
+
[ 10/ 219] blk.0.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
|
56 |
+
[ 11/ 219] blk.0.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 12/ 219] blk.1.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 13/ 219] blk.1.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq4_xs - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 14/ 219] blk.1.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_xs .. size = 21.25 MiB -> 5.64 MiB
[ 15/ 219] blk.1.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_xs .. size = 21.25 MiB -> 5.64 MiB
[ 16/ 219] blk.1.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 17/ 219] blk.1.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 18/ 219] blk.1.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 19/ 219] blk.1.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 20/ 219] blk.1.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 21/ 219] blk.10.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 22/ 219] blk.10.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq4_xs - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 23/ 219] blk.10.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_xs .. size = 21.25 MiB -> 5.64 MiB
[ 24/ 219] blk.10.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_xs .. size = 21.25 MiB -> 5.64 MiB
[ 25/ 219] blk.10.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 26/ 219] blk.10.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 27/ 219] blk.10.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 28/ 219] blk.10.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 29/ 219] blk.10.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 30/ 219] blk.11.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 31/ 219] blk.11.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq4_xs - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 32/ 219] blk.11.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_xs .. size = 21.25 MiB -> 5.64 MiB
[ 33/ 219] blk.11.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_xs .. size = 21.25 MiB -> 5.64 MiB
[ 34/ 219] blk.11.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 35/ 219] blk.11.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 36/ 219] blk.11.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 37/ 219] blk.11.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 38/ 219] blk.11.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 39/ 219] blk.12.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 40/ 219] blk.12.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq4_xs - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 41/ 219] blk.12.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_xs .. size = 21.25 MiB -> 5.64 MiB
[ 42/ 219] blk.12.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_xs .. size = 21.25 MiB -> 5.64 MiB
[ 43/ 219] blk.12.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 44/ 219] blk.12.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 45/ 219] blk.12.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 46/ 219] blk.12.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 47/ 219] blk.12.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 48/ 219] blk.13.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 49/ 219] blk.13.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq4_xs - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 50/ 219] blk.13.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_xs .. size = 21.25 MiB -> 5.64 MiB
[ 51/ 219] blk.13.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_xs .. size = 21.25 MiB -> 5.64 MiB
[ 52/ 219] blk.13.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 53/ 219] blk.13.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 54/ 219] blk.13.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 55/ 219] blk.13.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 56/ 219] blk.13.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 57/ 219] blk.14.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 58/ 219] blk.14.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq4_xs - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 59/ 219] blk.14.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_xs .. size = 21.25 MiB -> 5.64 MiB
[ 60/ 219] blk.14.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_xs .. size = 21.25 MiB -> 5.64 MiB
[ 61/ 219] blk.14.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 62/ 219] blk.14.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 63/ 219] blk.14.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 64/ 219] blk.14.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 65/ 219] blk.14.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 66/ 219] blk.15.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 67/ 219] blk.15.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq4_xs - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 68/ 219] blk.15.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_xs .. size = 21.25 MiB -> 5.64 MiB
[ 69/ 219] blk.15.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_xs .. size = 21.25 MiB -> 5.64 MiB
[ 70/ 219] blk.15.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 71/ 219] blk.15.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 72/ 219] blk.15.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 73/ 219] blk.15.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 74/ 219] blk.15.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 75/ 219] blk.16.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 76/ 219] blk.16.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq4_xs - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 77/ 219] blk.16.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_xs .. size = 21.25 MiB -> 5.64 MiB
[ 78/ 219] blk.16.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_xs .. size = 21.25 MiB -> 5.64 MiB
[ 79/ 219] blk.16.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 80/ 219] blk.16.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 81/ 219] blk.16.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 82/ 219] blk.16.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 83/ 219] blk.16.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 84/ 219] blk.17.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 85/ 219] blk.17.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq4_xs - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 86/ 219] blk.17.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_xs .. size = 21.25 MiB -> 5.64 MiB
[ 87/ 219] blk.17.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_xs .. size = 21.25 MiB -> 5.64 MiB
[ 88/ 219] blk.17.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 89/ 219] blk.17.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 90/ 219] blk.17.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 91/ 219] blk.17.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 92/ 219] blk.17.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 93/ 219] blk.18.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 94/ 219] blk.18.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq4_xs - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 95/ 219] blk.18.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_xs .. size = 21.25 MiB -> 5.64 MiB
[ 96/ 219] blk.18.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_xs .. size = 21.25 MiB -> 5.64 MiB
[ 97/ 219] blk.18.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 98/ 219] blk.18.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 99/ 219] blk.18.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 100/ 219] blk.18.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 101/ 219] blk.18.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 102/ 219] blk.19.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 103/ 219] blk.19.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq4_xs - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 104/ 219] blk.19.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_xs .. size = 21.25 MiB -> 5.64 MiB
[ 105/ 219] blk.19.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_xs .. size = 21.25 MiB -> 5.64 MiB
[ 106/ 219] blk.19.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 107/ 219] blk.19.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 108/ 219] blk.19.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 109/ 219] blk.19.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 110/ 219] blk.19.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 111/ 219] blk.2.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 112/ 219] blk.2.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq4_xs - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 113/ 219] blk.2.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_xs .. size = 21.25 MiB -> 5.64 MiB
[ 114/ 219] blk.2.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_xs .. size = 21.25 MiB -> 5.64 MiB
[ 115/ 219] blk.2.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 116/ 219] blk.2.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 117/ 219] blk.2.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 118/ 219] blk.2.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 119/ 219] blk.2.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 120/ 219] blk.20.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 121/ 219] blk.20.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq4_xs - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 122/ 219] blk.20.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_xs .. size = 21.25 MiB -> 5.64 MiB
[ 123/ 219] blk.20.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_xs .. size = 21.25 MiB -> 5.64 MiB
[ 124/ 219] blk.20.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 125/ 219] blk.20.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 126/ 219] blk.20.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 127/ 219] blk.20.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 128/ 219] blk.20.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 129/ 219] blk.21.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 130/ 219] blk.21.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq4_xs - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 131/ 219] blk.21.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_xs .. size = 21.25 MiB -> 5.64 MiB
[ 132/ 219] blk.21.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_xs .. size = 21.25 MiB -> 5.64 MiB
[ 133/ 219] blk.21.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 134/ 219] blk.21.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 135/ 219] blk.21.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 136/ 219] blk.21.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 137/ 219] blk.21.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 138/ 219] blk.22.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 139/ 219] blk.22.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq4_xs - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 140/ 219] blk.22.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_xs .. size = 21.25 MiB -> 5.64 MiB
[ 141/ 219] blk.22.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_xs .. size = 21.25 MiB -> 5.64 MiB
[ 142/ 219] blk.22.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 143/ 219] blk.22.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 144/ 219] blk.22.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 145/ 219] blk.22.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 146/ 219] blk.22.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 147/ 219] blk.23.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 148/ 219] blk.23.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq4_xs - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 149/ 219] blk.23.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_xs .. size = 21.25 MiB -> 5.64 MiB
[ 150/ 219] blk.23.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_xs .. size = 21.25 MiB -> 5.64 MiB
[ 151/ 219] blk.23.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 152/ 219] blk.23.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 153/ 219] blk.23.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 154/ 219] blk.23.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 155/ 219] blk.23.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 156/ 219] blk.3.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 157/ 219] blk.3.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq4_xs - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 158/ 219] blk.3.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_xs .. size = 21.25 MiB -> 5.64 MiB
[ 159/ 219] blk.3.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_xs .. size = 21.25 MiB -> 5.64 MiB
[ 160/ 219] blk.3.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 161/ 219] blk.3.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 162/ 219] blk.3.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 163/ 219] blk.3.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 164/ 219] blk.3.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 165/ 219] blk.4.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 166/ 219] blk.4.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq4_xs - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 167/ 219] blk.4.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_xs .. size = 21.25 MiB -> 5.64 MiB
[ 168/ 219] blk.4.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_xs .. size = 21.25 MiB -> 5.64 MiB
[ 169/ 219] blk.4.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 170/ 219] blk.4.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 171/ 219] blk.4.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 172/ 219] blk.4.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 173/ 219] blk.4.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 174/ 219] blk.5.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 175/ 219] blk.5.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq4_xs - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 176/ 219] blk.5.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_xs .. size = 21.25 MiB -> 5.64 MiB
[ 177/ 219] blk.5.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_xs .. size = 21.25 MiB -> 5.64 MiB
[ 178/ 219] blk.5.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 179/ 219] blk.5.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 180/ 219] blk.5.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 181/ 219] blk.5.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 182/ 219] blk.5.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 183/ 219] blk.6.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 184/ 219] blk.6.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq4_xs - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 185/ 219] blk.6.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_xs .. size = 21.25 MiB -> 5.64 MiB
[ 186/ 219] blk.6.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_xs .. size = 21.25 MiB -> 5.64 MiB
[ 187/ 219] blk.6.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 188/ 219] blk.6.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 189/ 219] blk.6.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 190/ 219] blk.6.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 191/ 219] blk.6.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 192/ 219] blk.7.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 193/ 219] blk.7.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq4_xs - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 194/ 219] blk.7.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_xs .. size = 21.25 MiB -> 5.64 MiB
[ 195/ 219] blk.7.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_xs .. size = 21.25 MiB -> 5.64 MiB
[ 196/ 219] blk.7.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 197/ 219] blk.7.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 198/ 219] blk.7.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 199/ 219] blk.7.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 200/ 219] blk.7.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 201/ 219] blk.8.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 202/ 219] blk.8.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq4_xs - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 203/ 219] blk.8.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_xs .. size = 21.25 MiB -> 5.64 MiB
[ 204/ 219] blk.8.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_xs .. size = 21.25 MiB -> 5.64 MiB
[ 205/ 219] blk.8.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 206/ 219] blk.8.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 207/ 219] blk.8.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 208/ 219] blk.8.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 209/ 219] blk.8.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 210/ 219] blk.9.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 211/ 219] blk.9.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq4_xs - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 212/ 219] blk.9.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_xs .. size = 21.25 MiB -> 5.64 MiB
[ 213/ 219] blk.9.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq4_xs .. size = 21.25 MiB -> 5.64 MiB
[ 214/ 219] blk.9.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 215/ 219] blk.9.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 216/ 219] blk.9.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 217/ 219] blk.9.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 218/ 219] blk.9.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq4_xs .. size = 8.00 MiB -> 2.12 MiB
[ 219/ 219] output_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
llama_model_quantize_internal: model size = 4298.38 MB
llama_model_quantize_internal: quant size = 1884.38 MB
llama_model_quantize_internal: WARNING: 24 of 169 tensor(s) required fallback quantization

main: quantize time = 18604.79 ms
main: total time = 18604.79 ms
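
Aside on the two patterns in the IQ4_XS log above, added for context. iq4_xs packs weights in super-blocks of 256 values, so the 24 ffn_down tensors, whose rows are 5440 wide (5440 % 256 = 64), each fall back to the block-32 format iq4_nl; that accounts exactly for the "24 of 169 tensor(s) required fallback quantization" warning. The printed sizes likewise follow from bits per weight: bf16 = 16 bpw, iq4_xs = 4.25 bpw, iq4_nl = 4.5 bpw. The Python below is a minimal illustrative sketch of both rules under those assumptions, not llama.cpp's actual code; the helper names are invented for this example.

BPW = {"bf16": 16.0, "iq4_xs": 4.25, "iq4_nl": 4.5}  # bits per weight (assumed)

def pick_iq4_type(row_width: int) -> str:
    # iq4_xs needs the row width to be a multiple of 256; otherwise the
    # quantizer falls back to the block-32 format iq4_nl (see log above).
    return "iq4_xs" if row_width % 256 == 0 else "iq4_nl"

def tensor_mib(n_weights: int, qtype: str) -> float:
    # Approximate tensor size in MiB from bits per weight.
    return n_weights * BPW[qtype] / 8 / 2**20

assert pick_iq4_type(5440) == "iq4_nl"  # ffn_down rows: 5440 % 256 == 64
assert pick_iq4_type(2048) == "iq4_xs"  # attention rows: divisible by 256

n = 5440 * 2048                              # weights in one FFN tensor
print(f"{tensor_mib(n, 'bf16'):.2f} MiB")    # 21.25 MiB, as logged
print(f"{tensor_mib(n, 'iq4_xs'):.2f} MiB")  # 5.64 MiB (ffn_gate / ffn_up)
print(f"{tensor_mib(n, 'iq4_nl'):.2f} MiB")  # 5.98 MiB (ffn_down fallback)

The same divisibility rule explains the q5_K -> q5_1 fallback for ffn_down in the Q3_K_L log that follows.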
Q3_K_L_log.txt
ADDED
@@ -0,0 +1,339 @@
main: build = 3906 (7eee341b)
main: built with Apple clang version 15.0.0 (clang-1500.3.9.4) for arm64-apple-darwin23.6.0
main: quantizing 'salamandra-2b_bf16.gguf' to './salamandra-2b_Q3_K_L.gguf' as Q3_K_L
llama_model_loader: loaded meta data with 29 key-value pairs and 219 tensors from salamandra-2b_bf16.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv 0: general.architecture str = llama
llama_model_loader: - kv 1: general.type str = model
llama_model_loader: - kv 2: general.size_label str = 2.3B
llama_model_loader: - kv 3: general.license str = apache-2.0
llama_model_loader: - kv 4: general.tags arr[str,1] = ["text-generation"]
llama_model_loader: - kv 5: general.languages arr[str,36] = ["bg", "ca", "code", "cs", "cy", "da"...
llama_model_loader: - kv 6: llama.block_count u32 = 24
llama_model_loader: - kv 7: llama.context_length u32 = 8192
llama_model_loader: - kv 8: llama.embedding_length u32 = 2048
llama_model_loader: - kv 9: llama.feed_forward_length u32 = 5440
llama_model_loader: - kv 10: llama.attention.head_count u32 = 16
llama_model_loader: - kv 11: llama.attention.head_count_kv u32 = 16
llama_model_loader: - kv 12: llama.rope.freq_base f32 = 10000.000000
llama_model_loader: - kv 13: llama.attention.layer_norm_rms_epsilon f32 = 0.000010
llama_model_loader: - kv 14: general.file_type u32 = 32
llama_model_loader: - kv 15: llama.vocab_size u32 = 256000
llama_model_loader: - kv 16: llama.rope.dimension_count u32 = 128
llama_model_loader: - kv 17: tokenizer.ggml.add_space_prefix bool = true
llama_model_loader: - kv 18: tokenizer.ggml.model str = llama
llama_model_loader: - kv 19: tokenizer.ggml.pre str = default
llama_model_loader: - kv 20: tokenizer.ggml.tokens arr[str,256000] = ["<unk>", "<s>", "</s>", "<pad>", "<|...
llama_model_loader: - kv 21: tokenizer.ggml.scores arr[f32,256000] = [-1000.000000, -1000.000000, -1000.00...
llama_model_loader: - kv 22: tokenizer.ggml.token_type arr[i32,256000] = [3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...
llama_model_loader: - kv 23: tokenizer.ggml.bos_token_id u32 = 1
llama_model_loader: - kv 24: tokenizer.ggml.eos_token_id u32 = 2
llama_model_loader: - kv 25: tokenizer.ggml.unknown_token_id u32 = 0
llama_model_loader: - kv 26: tokenizer.ggml.add_bos_token bool = true
llama_model_loader: - kv 27: tokenizer.ggml.add_eos_token bool = false
llama_model_loader: - kv 28: general.quantization_version u32 = 2
llama_model_loader: - type f32: 49 tensors
llama_model_loader: - type bf16: 170 tensors
================================ Have weights data with 168 entries
[ 1/ 219] output.weight - [ 2048, 256000, 1, 1], type = bf16, size = 1000.000 MB
[ 2/ 219] token_embd.weight - [ 2048, 256000, 1, 1], type = bf16,
====== llama_model_quantize_internal: did not find weights for token_embd.weight
converting to q3_K .. load_imatrix: imatrix dataset='./imatrix/oscar/imatrix-dataset.txt'
load_imatrix: loaded 168 importance matrix entries from imatrix/oscar/imatrix.dat computed on 44176 chunks
prepare_imatrix: have 168 importance matrix entries
size = 1000.00 MiB -> 214.84 MiB
[ 3/ 219] blk.0.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 4/ 219] blk.0.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q5_K - using fallback quantization q5_1
converting to q5_1 .. size = 21.25 MiB -> 7.97 MiB
[ 5/ 219] blk.0.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 6/ 219] blk.0.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 7/ 219] blk.0.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 8/ 219] blk.0.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 9/ 219] blk.0.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 10/ 219] blk.0.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 11/ 219] blk.0.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 12/ 219] blk.1.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 13/ 219] blk.1.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q5_K - using fallback quantization q5_1
converting to q5_1 .. size = 21.25 MiB -> 7.97 MiB
[ 14/ 219] blk.1.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 15/ 219] blk.1.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 16/ 219] blk.1.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 17/ 219] blk.1.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 18/ 219] blk.1.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 19/ 219] blk.1.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 20/ 219] blk.1.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 21/ 219] blk.10.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 22/ 219] blk.10.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q5_K - using fallback quantization q5_1
converting to q5_1 .. size = 21.25 MiB -> 7.97 MiB
[ 23/ 219] blk.10.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 24/ 219] blk.10.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 25/ 219] blk.10.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 26/ 219] blk.10.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 27/ 219] blk.10.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 28/ 219] blk.10.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 29/ 219] blk.10.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 30/ 219] blk.11.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 31/ 219] blk.11.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q5_K - using fallback quantization q5_1
converting to q5_1 .. size = 21.25 MiB -> 7.97 MiB
[ 32/ 219] blk.11.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 33/ 219] blk.11.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 34/ 219] blk.11.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 35/ 219] blk.11.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 36/ 219] blk.11.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 37/ 219] blk.11.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 38/ 219] blk.11.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 39/ 219] blk.12.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 40/ 219] blk.12.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q5_K - using fallback quantization q5_1
converting to q5_1 .. size = 21.25 MiB -> 7.97 MiB
[ 41/ 219] blk.12.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 42/ 219] blk.12.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 43/ 219] blk.12.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 44/ 219] blk.12.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 45/ 219] blk.12.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 46/ 219] blk.12.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 47/ 219] blk.12.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 48/ 219] blk.13.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 49/ 219] blk.13.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q5_K - using fallback quantization q5_1
converting to q5_1 .. size = 21.25 MiB -> 7.97 MiB
[ 50/ 219] blk.13.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 51/ 219] blk.13.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 52/ 219] blk.13.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 53/ 219] blk.13.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 54/ 219] blk.13.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 55/ 219] blk.13.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 56/ 219] blk.13.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 57/ 219] blk.14.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 58/ 219] blk.14.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q5_K - using fallback quantization q5_1
converting to q5_1 .. size = 21.25 MiB -> 7.97 MiB
[ 59/ 219] blk.14.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 60/ 219] blk.14.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 61/ 219] blk.14.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 62/ 219] blk.14.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 63/ 219] blk.14.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 64/ 219] blk.14.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 65/ 219] blk.14.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 66/ 219] blk.15.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 67/ 219] blk.15.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q5_K - using fallback quantization q5_1
converting to q5_1 .. size = 21.25 MiB -> 7.97 MiB
[ 68/ 219] blk.15.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 69/ 219] blk.15.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 70/ 219] blk.15.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 71/ 219] blk.15.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 72/ 219] blk.15.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 73/ 219] blk.15.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 74/ 219] blk.15.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 75/ 219] blk.16.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 76/ 219] blk.16.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q5_K - using fallback quantization q5_1
converting to q5_1 .. size = 21.25 MiB -> 7.97 MiB
[ 77/ 219] blk.16.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 78/ 219] blk.16.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 79/ 219] blk.16.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 80/ 219] blk.16.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 81/ 219] blk.16.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 82/ 219] blk.16.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 83/ 219] blk.16.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 84/ 219] blk.17.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 85/ 219] blk.17.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q5_K - using fallback quantization q5_1
converting to q5_1 .. size = 21.25 MiB -> 7.97 MiB
[ 86/ 219] blk.17.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 87/ 219] blk.17.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 88/ 219] blk.17.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 89/ 219] blk.17.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 90/ 219] blk.17.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 91/ 219] blk.17.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 92/ 219] blk.17.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 93/ 219] blk.18.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 94/ 219] blk.18.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q5_K - using fallback quantization q5_1
converting to q5_1 .. size = 21.25 MiB -> 7.97 MiB
[ 95/ 219] blk.18.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 96/ 219] blk.18.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 97/ 219] blk.18.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 98/ 219] blk.18.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 99/ 219] blk.18.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 100/ 219] blk.18.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 101/ 219] blk.18.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 102/ 219] blk.19.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 103/ 219] blk.19.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q5_K - using fallback quantization q5_1
converting to q5_1 .. size = 21.25 MiB -> 7.97 MiB
[ 104/ 219] blk.19.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 105/ 219] blk.19.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 106/ 219] blk.19.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 107/ 219] blk.19.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 108/ 219] blk.19.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 109/ 219] blk.19.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 110/ 219] blk.19.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 111/ 219] blk.2.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 112/ 219] blk.2.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q5_K - using fallback quantization q5_1
converting to q5_1 .. size = 21.25 MiB -> 7.97 MiB
[ 113/ 219] blk.2.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 114/ 219] blk.2.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 115/ 219] blk.2.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 116/ 219] blk.2.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 117/ 219] blk.2.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 118/ 219] blk.2.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 119/ 219] blk.2.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 120/ 219] blk.20.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 121/ 219] blk.20.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q5_K - using fallback quantization q5_1
converting to q5_1 .. size = 21.25 MiB -> 7.97 MiB
[ 122/ 219] blk.20.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 123/ 219] blk.20.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 124/ 219] blk.20.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 125/ 219] blk.20.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 126/ 219] blk.20.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 127/ 219] blk.20.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 128/ 219] blk.20.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 129/ 219] blk.21.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 130/ 219] blk.21.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q5_K - using fallback quantization q5_1
converting to q5_1 .. size = 21.25 MiB -> 7.97 MiB
[ 131/ 219] blk.21.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 132/ 219] blk.21.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 133/ 219] blk.21.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 134/ 219] blk.21.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 135/ 219] blk.21.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 136/ 219] blk.21.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 137/ 219] blk.21.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 138/ 219] blk.22.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 139/ 219] blk.22.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q5_K - using fallback quantization q5_1
converting to q5_1 .. size = 21.25 MiB -> 7.97 MiB
[ 140/ 219] blk.22.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 141/ 219] blk.22.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 142/ 219] blk.22.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 143/ 219] blk.22.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 144/ 219] blk.22.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 145/ 219] blk.22.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 146/ 219] blk.22.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 147/ 219] blk.23.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 148/ 219] blk.23.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q5_K - using fallback quantization q5_1
converting to q5_1 .. size = 21.25 MiB -> 7.97 MiB
[ 149/ 219] blk.23.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 150/ 219] blk.23.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 151/ 219] blk.23.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 152/ 219] blk.23.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 153/ 219] blk.23.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 154/ 219] blk.23.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 155/ 219] blk.23.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 156/ 219] blk.3.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 157/ 219] blk.3.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q5_K - using fallback quantization q5_1
converting to q5_1 .. size = 21.25 MiB -> 7.97 MiB
[ 158/ 219] blk.3.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 159/ 219] blk.3.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 160/ 219] blk.3.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 161/ 219] blk.3.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 162/ 219] blk.3.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 163/ 219] blk.3.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 164/ 219] blk.3.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 165/ 219] blk.4.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 166/ 219] blk.4.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q5_K - using fallback quantization q5_1
converting to q5_1 .. size = 21.25 MiB -> 7.97 MiB
[ 167/ 219] blk.4.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 168/ 219] blk.4.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 169/ 219] blk.4.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 170/ 219] blk.4.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 171/ 219] blk.4.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 172/ 219] blk.4.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 173/ 219] blk.4.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 174/ 219] blk.5.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 175/ 219] blk.5.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q5_K - using fallback quantization q5_1
converting to q5_1 .. size = 21.25 MiB -> 7.97 MiB
[ 176/ 219] blk.5.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 177/ 219] blk.5.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 178/ 219] blk.5.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 179/ 219] blk.5.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 180/ 219] blk.5.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 181/ 219] blk.5.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 182/ 219] blk.5.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 183/ 219] blk.6.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 184/ 219] blk.6.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q5_K - using fallback quantization q5_1
converting to q5_1 .. size = 21.25 MiB -> 7.97 MiB
[ 185/ 219] blk.6.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 186/ 219] blk.6.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 187/ 219] blk.6.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 188/ 219] blk.6.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 189/ 219] blk.6.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 190/ 219] blk.6.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 191/ 219] blk.6.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 192/ 219] blk.7.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 193/ 219] blk.7.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q5_K - using fallback quantization q5_1
converting to q5_1 .. size = 21.25 MiB -> 7.97 MiB
[ 194/ 219] blk.7.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 195/ 219] blk.7.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 196/ 219] blk.7.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 197/ 219] blk.7.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
|
306 |
+
[ 198/ 219] blk.7.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
|
307 |
+
[ 199/ 219] blk.7.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
|
308 |
+
[ 200/ 219] blk.7.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
|
309 |
+
[ 201/ 219] blk.8.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
310 |
+
[ 202/ 219] blk.8.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
|
311 |
+
|
312 |
+
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q5_K - using fallback quantization q5_1
|
313 |
+
converting to q5_1 .. size = 21.25 MiB -> 7.97 MiB
|
314 |
+
[ 203/ 219] blk.8.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
|
315 |
+
[ 204/ 219] blk.8.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
|
316 |
+
[ 205/ 219] blk.8.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
317 |
+
[ 206/ 219] blk.8.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
|
318 |
+
[ 207/ 219] blk.8.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
|
319 |
+
[ 208/ 219] blk.8.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
|
320 |
+
[ 209/ 219] blk.8.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
|
321 |
+
[ 210/ 219] blk.9.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
322 |
+
[ 211/ 219] blk.9.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
|
323 |
+
|
324 |
+
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q5_K - using fallback quantization q5_1
|
325 |
+
converting to q5_1 .. size = 21.25 MiB -> 7.97 MiB
|
326 |
+
[ 212/ 219] blk.9.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
|
327 |
+
[ 213/ 219] blk.9.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
|
328 |
+
[ 214/ 219] blk.9.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
329 |
+
[ 215/ 219] blk.9.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
|
330 |
+
[ 216/ 219] blk.9.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
|
331 |
+
[ 217/ 219] blk.9.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
|
332 |
+
[ 218/ 219] blk.9.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
|
333 |
+
[ 219/ 219] output_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
334 |
+
llama_model_quantize_internal: model size = 4298.38 MB
|
335 |
+
llama_model_quantize_internal: quant size = 1840.12 MB
|
336 |
+
llama_model_quantize_internal: WARNING: 24 of 169 tensor(s) required fallback quantization
|
337 |
+
|
338 |
+
main: quantize time = 6546.55 ms
|
339 |
+
main: total time = 6546.55 ms
|
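Note on the repeated fallback warnings above: k-quant formats such as q5_K pack weights in 256-element super-blocks, so a tensor's column count must be a multiple of 256. This model's FFN width of 5440 is not (5440 = 21 × 256 + 64), so every blk.*.ffn_down tensor drops to a legacy 32-element-block format instead (q5_1 in this Q5_K_M run), which is why the summary reports 24 fallbacks, one per layer. A minimal sketch of the rule, assuming the standard k-quant super-block size of 256:

```python
# Minimal sketch of the divisibility rule behind the fallback warnings above
# (illustration only, not llama.cpp source code).
QK_K = 256  # k-quant super-block size

def kquant_usable(n_cols: int) -> bool:
    """k-quants require the row length (number of columns) to be a multiple of QK_K."""
    return n_cols % QK_K == 0

print(kquant_usable(2048))  # True  -> 2048-wide attention tensors quantize normally
print(kquant_usable(5440))  # False -> ffn_down (5440 % 256 == 64) falls back to q5_1

# The logged sizes are consistent with q5_1's 6 bits/weight
# (32 weights in 24 bytes): 21.25 MiB * 6/16 ~= 7.97 MiB.
```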
Q3_K_M_log.txt
ADDED
@@ -0,0 +1,339 @@
main: build = 3906 (7eee341b)
main: built with Apple clang version 15.0.0 (clang-1500.3.9.4) for arm64-apple-darwin23.6.0
main: quantizing 'salamandra-2b_bf16.gguf' to './salamandra-2b_Q3_K_M.gguf' as Q3_K_M
llama_model_loader: loaded meta data with 29 key-value pairs and 219 tensors from salamandra-2b_bf16.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv 0: general.architecture str = llama
llama_model_loader: - kv 1: general.type str = model
llama_model_loader: - kv 2: general.size_label str = 2.3B
llama_model_loader: - kv 3: general.license str = apache-2.0
llama_model_loader: - kv 4: general.tags arr[str,1] = ["text-generation"]
llama_model_loader: - kv 5: general.languages arr[str,36] = ["bg", "ca", "code", "cs", "cy", "da"...
llama_model_loader: - kv 6: llama.block_count u32 = 24
llama_model_loader: - kv 7: llama.context_length u32 = 8192
llama_model_loader: - kv 8: llama.embedding_length u32 = 2048
llama_model_loader: - kv 9: llama.feed_forward_length u32 = 5440
llama_model_loader: - kv 10: llama.attention.head_count u32 = 16
llama_model_loader: - kv 11: llama.attention.head_count_kv u32 = 16
llama_model_loader: - kv 12: llama.rope.freq_base f32 = 10000.000000
llama_model_loader: - kv 13: llama.attention.layer_norm_rms_epsilon f32 = 0.000010
llama_model_loader: - kv 14: general.file_type u32 = 32
llama_model_loader: - kv 15: llama.vocab_size u32 = 256000
llama_model_loader: - kv 16: llama.rope.dimension_count u32 = 128
llama_model_loader: - kv 17: tokenizer.ggml.add_space_prefix bool = true
llama_model_loader: - kv 18: tokenizer.ggml.model str = llama
llama_model_loader: - kv 19: tokenizer.ggml.pre str = default
llama_model_loader: - kv 20: tokenizer.ggml.tokens arr[str,256000] = ["<unk>", "<s>", "</s>", "<pad>", "<|...
llama_model_loader: - kv 21: tokenizer.ggml.scores arr[f32,256000] = [-1000.000000, -1000.000000, -1000.00...
llama_model_loader: - kv 22: tokenizer.ggml.token_type arr[i32,256000] = [3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...
llama_model_loader: - kv 23: tokenizer.ggml.bos_token_id u32 = 1
llama_model_loader: - kv 24: tokenizer.ggml.eos_token_id u32 = 2
llama_model_loader: - kv 25: tokenizer.ggml.unknown_token_id u32 = 0
llama_model_loader: - kv 26: tokenizer.ggml.add_bos_token bool = true
llama_model_loader: - kv 27: tokenizer.ggml.add_eos_token bool = false
llama_model_loader: - kv 28: general.quantization_version u32 = 2
llama_model_loader: - type f32: 49 tensors
llama_model_loader: - type bf16: 170 tensors
================================ Have weights data with 168 entries
[ 1/ 219] output.weight - [ 2048, 256000, 1, 1], type = bf16, size = 1000.000 MB
[ 2/ 219] token_embd.weight - [ 2048, 256000, 1, 1], type = bf16,
====== llama_model_quantize_internal: did not find weights for token_embd.weight
converting to q3_K .. load_imatrix: imatrix dataset='./imatrix/oscar/imatrix-dataset.txt'
load_imatrix: loaded 168 importance matrix entries from imatrix/oscar/imatrix.dat computed on 44176 chunks
prepare_imatrix: have 168 importance matrix entries
size = 1000.00 MiB -> 214.84 MiB
[ 3/ 219] blk.0.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 4/ 219] blk.0.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q5_K - using fallback quantization q5_1
converting to q5_1 .. size = 21.25 MiB -> 7.97 MiB
[ 5/ 219] blk.0.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 6/ 219] blk.0.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 7/ 219] blk.0.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 8/ 219] blk.0.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 9/ 219] blk.0.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 10/ 219] blk.0.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 11/ 219] blk.0.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 12/ 219] blk.1.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 13/ 219] blk.1.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q4_K - using fallback quantization q5_0
converting to q5_0 .. size = 21.25 MiB -> 7.30 MiB
[ 14/ 219] blk.1.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 15/ 219] blk.1.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 16/ 219] blk.1.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 17/ 219] blk.1.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 18/ 219] blk.1.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 19/ 219] blk.1.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 20/ 219] blk.1.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 21/ 219] blk.10.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 22/ 219] blk.10.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q4_K - using fallback quantization q5_0
converting to q5_0 .. size = 21.25 MiB -> 7.30 MiB
[ 23/ 219] blk.10.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 24/ 219] blk.10.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 25/ 219] blk.10.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 26/ 219] blk.10.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 27/ 219] blk.10.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 28/ 219] blk.10.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 29/ 219] blk.10.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 30/ 219] blk.11.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 31/ 219] blk.11.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q4_K - using fallback quantization q5_0
converting to q5_0 .. size = 21.25 MiB -> 7.30 MiB
[ 32/ 219] blk.11.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 33/ 219] blk.11.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 34/ 219] blk.11.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 35/ 219] blk.11.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 36/ 219] blk.11.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 37/ 219] blk.11.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 38/ 219] blk.11.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 39/ 219] blk.12.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 40/ 219] blk.12.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q4_K - using fallback quantization q5_0
converting to q5_0 .. size = 21.25 MiB -> 7.30 MiB
[ 41/ 219] blk.12.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 42/ 219] blk.12.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 43/ 219] blk.12.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 44/ 219] blk.12.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 45/ 219] blk.12.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 46/ 219] blk.12.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 47/ 219] blk.12.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 48/ 219] blk.13.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 49/ 219] blk.13.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q4_K - using fallback quantization q5_0
converting to q5_0 .. size = 21.25 MiB -> 7.30 MiB
[ 50/ 219] blk.13.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 51/ 219] blk.13.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 52/ 219] blk.13.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 53/ 219] blk.13.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 54/ 219] blk.13.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 55/ 219] blk.13.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 56/ 219] blk.13.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 57/ 219] blk.14.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 58/ 219] blk.14.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q4_K - using fallback quantization q5_0
converting to q5_0 .. size = 21.25 MiB -> 7.30 MiB
[ 59/ 219] blk.14.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 60/ 219] blk.14.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 61/ 219] blk.14.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 62/ 219] blk.14.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 63/ 219] blk.14.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 64/ 219] blk.14.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 65/ 219] blk.14.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 66/ 219] blk.15.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 67/ 219] blk.15.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q4_K - using fallback quantization q5_0
converting to q5_0 .. size = 21.25 MiB -> 7.30 MiB
[ 68/ 219] blk.15.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 69/ 219] blk.15.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 70/ 219] blk.15.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 71/ 219] blk.15.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 72/ 219] blk.15.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 73/ 219] blk.15.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 74/ 219] blk.15.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 75/ 219] blk.16.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 76/ 219] blk.16.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q4_K - using fallback quantization q5_0
converting to q5_0 .. size = 21.25 MiB -> 7.30 MiB
[ 77/ 219] blk.16.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 78/ 219] blk.16.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 79/ 219] blk.16.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 80/ 219] blk.16.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 81/ 219] blk.16.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 82/ 219] blk.16.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 83/ 219] blk.16.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 84/ 219] blk.17.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 85/ 219] blk.17.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q4_K - using fallback quantization q5_0
converting to q5_0 .. size = 21.25 MiB -> 7.30 MiB
[ 86/ 219] blk.17.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 87/ 219] blk.17.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 88/ 219] blk.17.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 89/ 219] blk.17.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 90/ 219] blk.17.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 91/ 219] blk.17.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 92/ 219] blk.17.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 93/ 219] blk.18.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 94/ 219] blk.18.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q4_K - using fallback quantization q5_0
converting to q5_0 .. size = 21.25 MiB -> 7.30 MiB
[ 95/ 219] blk.18.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 96/ 219] blk.18.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 97/ 219] blk.18.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 98/ 219] blk.18.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 99/ 219] blk.18.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 100/ 219] blk.18.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 101/ 219] blk.18.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 102/ 219] blk.19.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 103/ 219] blk.19.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q4_K - using fallback quantization q5_0
converting to q5_0 .. size = 21.25 MiB -> 7.30 MiB
[ 104/ 219] blk.19.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 105/ 219] blk.19.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 106/ 219] blk.19.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 107/ 219] blk.19.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 108/ 219] blk.19.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 109/ 219] blk.19.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 110/ 219] blk.19.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 111/ 219] blk.2.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 112/ 219] blk.2.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q4_K - using fallback quantization q5_0
converting to q5_0 .. size = 21.25 MiB -> 7.30 MiB
[ 113/ 219] blk.2.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 114/ 219] blk.2.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 115/ 219] blk.2.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 116/ 219] blk.2.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 117/ 219] blk.2.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 118/ 219] blk.2.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 119/ 219] blk.2.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 120/ 219] blk.20.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 121/ 219] blk.20.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q4_K - using fallback quantization q5_0
converting to q5_0 .. size = 21.25 MiB -> 7.30 MiB
[ 122/ 219] blk.20.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 123/ 219] blk.20.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 124/ 219] blk.20.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 125/ 219] blk.20.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 126/ 219] blk.20.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 127/ 219] blk.20.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 128/ 219] blk.20.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 129/ 219] blk.21.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 130/ 219] blk.21.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q4_K - using fallback quantization q5_0
converting to q5_0 .. size = 21.25 MiB -> 7.30 MiB
[ 131/ 219] blk.21.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 132/ 219] blk.21.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 133/ 219] blk.21.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 134/ 219] blk.21.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 135/ 219] blk.21.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 136/ 219] blk.21.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 137/ 219] blk.21.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 138/ 219] blk.22.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 139/ 219] blk.22.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q4_K - using fallback quantization q5_0
converting to q5_0 .. size = 21.25 MiB -> 7.30 MiB
[ 140/ 219] blk.22.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 141/ 219] blk.22.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 142/ 219] blk.22.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 143/ 219] blk.22.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 144/ 219] blk.22.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 145/ 219] blk.22.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 146/ 219] blk.22.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 147/ 219] blk.23.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 148/ 219] blk.23.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q4_K - using fallback quantization q5_0
converting to q5_0 .. size = 21.25 MiB -> 7.30 MiB
[ 149/ 219] blk.23.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 150/ 219] blk.23.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 151/ 219] blk.23.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 152/ 219] blk.23.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 153/ 219] blk.23.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 154/ 219] blk.23.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 155/ 219] blk.23.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 156/ 219] blk.3.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 157/ 219] blk.3.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q4_K - using fallback quantization q5_0
converting to q5_0 .. size = 21.25 MiB -> 7.30 MiB
[ 158/ 219] blk.3.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 159/ 219] blk.3.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 160/ 219] blk.3.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 161/ 219] blk.3.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 162/ 219] blk.3.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 163/ 219] blk.3.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 164/ 219] blk.3.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 165/ 219] blk.4.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 166/ 219] blk.4.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q4_K - using fallback quantization q5_0
converting to q5_0 .. size = 21.25 MiB -> 7.30 MiB
[ 167/ 219] blk.4.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 168/ 219] blk.4.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 169/ 219] blk.4.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 170/ 219] blk.4.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 171/ 219] blk.4.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 172/ 219] blk.4.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 173/ 219] blk.4.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 174/ 219] blk.5.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 175/ 219] blk.5.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q4_K - using fallback quantization q5_0
converting to q5_0 .. size = 21.25 MiB -> 7.30 MiB
[ 176/ 219] blk.5.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 177/ 219] blk.5.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 178/ 219] blk.5.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 179/ 219] blk.5.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 180/ 219] blk.5.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 181/ 219] blk.5.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 182/ 219] blk.5.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 183/ 219] blk.6.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 184/ 219] blk.6.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q4_K - using fallback quantization q5_0
converting to q5_0 .. size = 21.25 MiB -> 7.30 MiB
[ 185/ 219] blk.6.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 186/ 219] blk.6.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 187/ 219] blk.6.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 188/ 219] blk.6.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 189/ 219] blk.6.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 190/ 219] blk.6.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 191/ 219] blk.6.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 192/ 219] blk.7.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 193/ 219] blk.7.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q4_K - using fallback quantization q5_0
converting to q5_0 .. size = 21.25 MiB -> 7.30 MiB
[ 194/ 219] blk.7.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 195/ 219] blk.7.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 196/ 219] blk.7.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 197/ 219] blk.7.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 198/ 219] blk.7.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 199/ 219] blk.7.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 200/ 219] blk.7.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 201/ 219] blk.8.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 202/ 219] blk.8.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q4_K - using fallback quantization q5_0
converting to q5_0 .. size = 21.25 MiB -> 7.30 MiB
[ 203/ 219] blk.8.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 204/ 219] blk.8.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 205/ 219] blk.8.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 206/ 219] blk.8.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 207/ 219] blk.8.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 208/ 219] blk.8.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 209/ 219] blk.8.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 210/ 219] blk.9.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 211/ 219] blk.9.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q4_K - using fallback quantization q5_0
converting to q5_0 .. size = 21.25 MiB -> 7.30 MiB
[ 212/ 219] blk.9.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 213/ 219] blk.9.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 214/ 219] blk.9.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 215/ 219] blk.9.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 216/ 219] blk.9.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 217/ 219] blk.9.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 218/ 219] blk.9.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 219/ 219] output_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
llama_model_quantize_internal: model size = 4298.38 MB
llama_model_quantize_internal: quant size = 1801.84 MB
llama_model_quantize_internal: WARNING: 24 of 169 tensor(s) required fallback quantization

main: quantize time = 5383.58 ms
main: total time = 5383.58 ms
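A quick sanity check of the two summary blocks recorded so far, using only sizes that appear in these logs. Note that the per-model averages come out well above the nominal rates for these mixes, largely because the 2048 x 256000 output.weight (1000 MB) shows no size arrow in either log and so appears to be copied without quantization. The helper below is hypothetical, not part of llama.cpp:

```python
# Back-of-the-envelope check of the logged model/quant sizes: derive the
# average bits per weight, given the bf16 source spends 16 bits per weight.
def avg_bits_per_weight(quant_mb: float, source_mb: float, source_bits: int = 16) -> float:
    # Sizes scale linearly with bits per weight, so the ratio gives the average.
    return source_bits * quant_mb / source_mb

print(f"Q5_K_M: {avg_bits_per_weight(1840.12, 4298.38):.2f} bpw")  # ~6.85
print(f"Q3_K_M: {avg_bits_per_weight(1801.84, 4298.38):.2f} bpw")  # ~6.71
```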
Q3_K_S_log.txt
ADDED
@@ -0,0 +1,339 @@
main: build = 3906 (7eee341b)
main: built with Apple clang version 15.0.0 (clang-1500.3.9.4) for arm64-apple-darwin23.6.0
main: quantizing 'salamandra-2b_bf16.gguf' to './salamandra-2b_Q3_K_S.gguf' as Q3_K_S
llama_model_loader: loaded meta data with 29 key-value pairs and 219 tensors from salamandra-2b_bf16.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv 0: general.architecture str = llama
llama_model_loader: - kv 1: general.type str = model
llama_model_loader: - kv 2: general.size_label str = 2.3B
llama_model_loader: - kv 3: general.license str = apache-2.0
llama_model_loader: - kv 4: general.tags arr[str,1] = ["text-generation"]
llama_model_loader: - kv 5: general.languages arr[str,36] = ["bg", "ca", "code", "cs", "cy", "da"...
llama_model_loader: - kv 6: llama.block_count u32 = 24
llama_model_loader: - kv 7: llama.context_length u32 = 8192
llama_model_loader: - kv 8: llama.embedding_length u32 = 2048
llama_model_loader: - kv 9: llama.feed_forward_length u32 = 5440
llama_model_loader: - kv 10: llama.attention.head_count u32 = 16
llama_model_loader: - kv 11: llama.attention.head_count_kv u32 = 16
llama_model_loader: - kv 12: llama.rope.freq_base f32 = 10000.000000
llama_model_loader: - kv 13: llama.attention.layer_norm_rms_epsilon f32 = 0.000010
llama_model_loader: - kv 14: general.file_type u32 = 32
llama_model_loader: - kv 15: llama.vocab_size u32 = 256000
llama_model_loader: - kv 16: llama.rope.dimension_count u32 = 128
llama_model_loader: - kv 17: tokenizer.ggml.add_space_prefix bool = true
llama_model_loader: - kv 18: tokenizer.ggml.model str = llama
llama_model_loader: - kv 19: tokenizer.ggml.pre str = default
llama_model_loader: - kv 20: tokenizer.ggml.tokens arr[str,256000] = ["<unk>", "<s>", "</s>", "<pad>", "<|...
llama_model_loader: - kv 21: tokenizer.ggml.scores arr[f32,256000] = [-1000.000000, -1000.000000, -1000.00...
llama_model_loader: - kv 22: tokenizer.ggml.token_type arr[i32,256000] = [3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...
llama_model_loader: - kv 23: tokenizer.ggml.bos_token_id u32 = 1
llama_model_loader: - kv 24: tokenizer.ggml.eos_token_id u32 = 2
llama_model_loader: - kv 25: tokenizer.ggml.unknown_token_id u32 = 0
llama_model_loader: - kv 26: tokenizer.ggml.add_bos_token bool = true
llama_model_loader: - kv 27: tokenizer.ggml.add_eos_token bool = false
llama_model_loader: - kv 28: general.quantization_version u32 = 2
llama_model_loader: - type f32: 49 tensors
llama_model_loader: - type bf16: 170 tensors
================================ Have weights data with 168 entries
[ 1/ 219] output.weight - [ 2048, 256000, 1, 1], type = bf16, size = 1000.000 MB
[ 2/ 219] token_embd.weight - [ 2048, 256000, 1, 1], type = bf16,
====== llama_model_quantize_internal: did not find weights for token_embd.weight
converting to q3_K .. load_imatrix: imatrix dataset='./imatrix/oscar/imatrix-dataset.txt'
load_imatrix: loaded 168 importance matrix entries from imatrix/oscar/imatrix.dat computed on 44176 chunks
prepare_imatrix: have 168 importance matrix entries
size = 1000.00 MiB -> 214.84 MiB
[ 3/ 219] blk.0.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 4/ 219] blk.0.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q3_K - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 5/ 219] blk.0.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 6/ 219] blk.0.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 7/ 219] blk.0.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 8/ 219] blk.0.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 9/ 219] blk.0.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 10/ 219] blk.0.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 11/ 219] blk.0.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 12/ 219] blk.1.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 13/ 219] blk.1.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q3_K - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 14/ 219] blk.1.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 15/ 219] blk.1.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 16/ 219] blk.1.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 17/ 219] blk.1.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 18/ 219] blk.1.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 19/ 219] blk.1.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 20/ 219] blk.1.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 21/ 219] blk.10.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 22/ 219] blk.10.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q3_K - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 23/ 219] blk.10.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 24/ 219] blk.10.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 25/ 219] blk.10.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 26/ 219] blk.10.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 27/ 219] blk.10.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 28/ 219] blk.10.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 29/ 219] blk.10.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 30/ 219] blk.11.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 31/ 219] blk.11.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q3_K - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 32/ 219] blk.11.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 33/ 219] blk.11.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 34/ 219] blk.11.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 35/ 219] blk.11.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 36/ 219] blk.11.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 37/ 219] blk.11.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 38/ 219] blk.11.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 39/ 219] blk.12.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 40/ 219] blk.12.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q3_K - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 41/ 219] blk.12.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 42/ 219] blk.12.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 43/ 219] blk.12.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 44/ 219] blk.12.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 45/ 219] blk.12.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 46/ 219] blk.12.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 47/ 219] blk.12.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 48/ 219] blk.13.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 49/ 219] blk.13.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q3_K - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 50/ 219] blk.13.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 51/ 219] blk.13.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 52/ 219] blk.13.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 53/ 219] blk.13.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 54/ 219] blk.13.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 55/ 219] blk.13.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 56/ 219] blk.13.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 57/ 219] blk.14.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 58/ 219] blk.14.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q3_K - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 59/ 219] blk.14.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 60/ 219] blk.14.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 61/ 219] blk.14.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 62/ 219] blk.14.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 63/ 219] blk.14.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 64/ 219] blk.14.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 65/ 219] blk.14.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 66/ 219] blk.15.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 67/ 219] blk.15.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q3_K - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 68/ 219] blk.15.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 69/ 219] blk.15.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 70/ 219] blk.15.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 71/ 219] blk.15.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 72/ 219] blk.15.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 73/ 219] blk.15.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 74/ 219] blk.15.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 75/ 219] blk.16.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 76/ 219] blk.16.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q3_K - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 77/ 219] blk.16.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 78/ 219] blk.16.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 79/ 219] blk.16.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 80/ 219] blk.16.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 81/ 219] blk.16.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 82/ 219] blk.16.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 83/ 219] blk.16.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 84/ 219] blk.17.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 85/ 219] blk.17.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q3_K - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 86/ 219] blk.17.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 87/ 219] blk.17.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 88/ 219] blk.17.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 89/ 219] blk.17.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 90/ 219] blk.17.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 91/ 219] blk.17.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 92/ 219] blk.17.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
[ 93/ 219] blk.18.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 94/ 219] blk.18.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q3_K - using fallback quantization iq4_nl
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
[ 95/ 219] blk.18.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 96/ 219] blk.18.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
[ 97/ 219] blk.18.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 98/ 219] blk.18.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
|
174 |
+
[ 99/ 219] blk.18.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
|
175 |
+
[ 100/ 219] blk.18.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
|
176 |
+
[ 101/ 219] blk.18.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
|
177 |
+
[ 102/ 219] blk.19.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
178 |
+
[ 103/ 219] blk.19.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
|
179 |
+
|
180 |
+
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q3_K - using fallback quantization iq4_nl
|
181 |
+
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
|
182 |
+
[ 104/ 219] blk.19.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
|
183 |
+
[ 105/ 219] blk.19.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
|
184 |
+
[ 106/ 219] blk.19.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
185 |
+
[ 107/ 219] blk.19.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
|
186 |
+
[ 108/ 219] blk.19.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
|
187 |
+
[ 109/ 219] blk.19.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
|
188 |
+
[ 110/ 219] blk.19.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
|
189 |
+
[ 111/ 219] blk.2.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
190 |
+
[ 112/ 219] blk.2.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
|
191 |
+
|
192 |
+
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q3_K - using fallback quantization iq4_nl
|
193 |
+
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
|
194 |
+
[ 113/ 219] blk.2.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
|
195 |
+
[ 114/ 219] blk.2.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
|
196 |
+
[ 115/ 219] blk.2.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
197 |
+
[ 116/ 219] blk.2.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
|
198 |
+
[ 117/ 219] blk.2.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
|
199 |
+
[ 118/ 219] blk.2.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
|
200 |
+
[ 119/ 219] blk.2.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
|
201 |
+
[ 120/ 219] blk.20.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
202 |
+
[ 121/ 219] blk.20.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
|
203 |
+
|
204 |
+
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q3_K - using fallback quantization iq4_nl
|
205 |
+
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
|
206 |
+
[ 122/ 219] blk.20.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
|
207 |
+
[ 123/ 219] blk.20.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
|
208 |
+
[ 124/ 219] blk.20.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
209 |
+
[ 125/ 219] blk.20.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
|
210 |
+
[ 126/ 219] blk.20.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
|
211 |
+
[ 127/ 219] blk.20.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
|
212 |
+
[ 128/ 219] blk.20.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
|
213 |
+
[ 129/ 219] blk.21.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
214 |
+
[ 130/ 219] blk.21.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
|
215 |
+
|
216 |
+
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q3_K - using fallback quantization iq4_nl
|
217 |
+
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
|
218 |
+
[ 131/ 219] blk.21.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
|
219 |
+
[ 132/ 219] blk.21.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
|
220 |
+
[ 133/ 219] blk.21.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
221 |
+
[ 134/ 219] blk.21.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
|
222 |
+
[ 135/ 219] blk.21.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
|
223 |
+
[ 136/ 219] blk.21.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
|
224 |
+
[ 137/ 219] blk.21.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
|
225 |
+
[ 138/ 219] blk.22.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
226 |
+
[ 139/ 219] blk.22.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
|
227 |
+
|
228 |
+
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q3_K - using fallback quantization iq4_nl
|
229 |
+
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
|
230 |
+
[ 140/ 219] blk.22.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
|
231 |
+
[ 141/ 219] blk.22.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
|
232 |
+
[ 142/ 219] blk.22.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
233 |
+
[ 143/ 219] blk.22.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
|
234 |
+
[ 144/ 219] blk.22.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
|
235 |
+
[ 145/ 219] blk.22.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
|
236 |
+
[ 146/ 219] blk.22.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
|
237 |
+
[ 147/ 219] blk.23.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
238 |
+
[ 148/ 219] blk.23.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
|
239 |
+
|
240 |
+
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q3_K - using fallback quantization iq4_nl
|
241 |
+
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
|
242 |
+
[ 149/ 219] blk.23.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
|
243 |
+
[ 150/ 219] blk.23.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
|
244 |
+
[ 151/ 219] blk.23.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
245 |
+
[ 152/ 219] blk.23.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
|
246 |
+
[ 153/ 219] blk.23.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
|
247 |
+
[ 154/ 219] blk.23.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
|
248 |
+
[ 155/ 219] blk.23.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
|
249 |
+
[ 156/ 219] blk.3.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
250 |
+
[ 157/ 219] blk.3.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
|
251 |
+
|
252 |
+
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q3_K - using fallback quantization iq4_nl
|
253 |
+
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
|
254 |
+
[ 158/ 219] blk.3.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
|
255 |
+
[ 159/ 219] blk.3.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
|
256 |
+
[ 160/ 219] blk.3.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
257 |
+
[ 161/ 219] blk.3.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
|
258 |
+
[ 162/ 219] blk.3.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
|
259 |
+
[ 163/ 219] blk.3.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
|
260 |
+
[ 164/ 219] blk.3.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
|
261 |
+
[ 165/ 219] blk.4.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
262 |
+
[ 166/ 219] blk.4.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
|
263 |
+
|
264 |
+
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q3_K - using fallback quantization iq4_nl
|
265 |
+
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
|
266 |
+
[ 167/ 219] blk.4.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
|
267 |
+
[ 168/ 219] blk.4.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
|
268 |
+
[ 169/ 219] blk.4.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
269 |
+
[ 170/ 219] blk.4.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
|
270 |
+
[ 171/ 219] blk.4.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
|
271 |
+
[ 172/ 219] blk.4.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
|
272 |
+
[ 173/ 219] blk.4.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
|
273 |
+
[ 174/ 219] blk.5.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
274 |
+
[ 175/ 219] blk.5.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
|
275 |
+
|
276 |
+
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q3_K - using fallback quantization iq4_nl
|
277 |
+
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
|
278 |
+
[ 176/ 219] blk.5.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
|
279 |
+
[ 177/ 219] blk.5.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
|
280 |
+
[ 178/ 219] blk.5.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
281 |
+
[ 179/ 219] blk.5.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
|
282 |
+
[ 180/ 219] blk.5.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
|
283 |
+
[ 181/ 219] blk.5.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
|
284 |
+
[ 182/ 219] blk.5.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
|
285 |
+
[ 183/ 219] blk.6.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
286 |
+
[ 184/ 219] blk.6.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
|
287 |
+
|
288 |
+
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q3_K - using fallback quantization iq4_nl
|
289 |
+
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
|
290 |
+
[ 185/ 219] blk.6.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
|
291 |
+
[ 186/ 219] blk.6.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
|
292 |
+
[ 187/ 219] blk.6.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
293 |
+
[ 188/ 219] blk.6.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
|
294 |
+
[ 189/ 219] blk.6.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
|
295 |
+
[ 190/ 219] blk.6.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
|
296 |
+
[ 191/ 219] blk.6.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
|
297 |
+
[ 192/ 219] blk.7.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
298 |
+
[ 193/ 219] blk.7.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
|
299 |
+
|
300 |
+
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q3_K - using fallback quantization iq4_nl
|
301 |
+
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
|
302 |
+
[ 194/ 219] blk.7.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
|
303 |
+
[ 195/ 219] blk.7.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
|
304 |
+
[ 196/ 219] blk.7.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
305 |
+
[ 197/ 219] blk.7.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
|
306 |
+
[ 198/ 219] blk.7.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
|
307 |
+
[ 199/ 219] blk.7.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
|
308 |
+
[ 200/ 219] blk.7.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
|
309 |
+
[ 201/ 219] blk.8.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
310 |
+
[ 202/ 219] blk.8.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
|
311 |
+
|
312 |
+
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q3_K - using fallback quantization iq4_nl
|
313 |
+
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
|
314 |
+
[ 203/ 219] blk.8.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
|
315 |
+
[ 204/ 219] blk.8.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
|
316 |
+
[ 205/ 219] blk.8.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
317 |
+
[ 206/ 219] blk.8.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
|
318 |
+
[ 207/ 219] blk.8.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
|
319 |
+
[ 208/ 219] blk.8.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
|
320 |
+
[ 209/ 219] blk.8.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
|
321 |
+
[ 210/ 219] blk.9.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
322 |
+
[ 211/ 219] blk.9.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
|
323 |
+
|
324 |
+
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q3_K - using fallback quantization iq4_nl
|
325 |
+
converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
|
326 |
+
[ 212/ 219] blk.9.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
|
327 |
+
[ 213/ 219] blk.9.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q3_K .. size = 21.25 MiB -> 4.57 MiB
|
328 |
+
[ 214/ 219] blk.9.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
329 |
+
[ 215/ 219] blk.9.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
|
330 |
+
[ 216/ 219] blk.9.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
|
331 |
+
[ 217/ 219] blk.9.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
|
332 |
+
[ 218/ 219] blk.9.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q3_K .. size = 8.00 MiB -> 1.72 MiB
|
333 |
+
[ 219/ 219] output_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
334 |
+
llama_model_quantize_internal: model size = 4298.38 MB
|
335 |
+
llama_model_quantize_internal: quant size = 1742.80 MB
|
336 |
+
llama_model_quantize_internal: WARNING: 24 of 169 tensor(s) required fallback quantization
|
337 |
+
|
338 |
+
main: quantize time = 6724.72 ms
|
339 |
+
main: total time = 6724.72 ms
|
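Note on the recurring fallback messages above: k-quants pack weights in 256-wide super-blocks, so the requested k-quant can only be applied when a tensor's row length is a multiple of 256. The per-block `ffn_down` projections have rows of 5440 elements (5440 = 21 × 256 + 64), so all 24 of them are rerouted to a fallback type, which matches the closing `WARNING: 24 of 169 tensor(s) required fallback quantization`. The fallback types are formats of equal or higher precision, which is why the fallback `ffn_down` tensors come out larger (5.98 MiB at iq4_nl) than the neighbouring q3_K tensors (4.57 MiB). Below is a minimal Python sketch of that gating rule using the fallback pairs observed across the logs in this commit; the function and table names are illustrative, not llama.cpp's actual source (the real check lives in `llama_tensor_get_type`):

```python
# Sketch of the 256-divisibility rule behind the fallback messages above.
# QK_K mirrors ggml's k-quant super-block size; pick_type and FALLBACK are
# illustrative names, not the actual llama.cpp implementation.
QK_K = 256

# fallback pairs observed in the quantization logs of this commit
FALLBACK = {
    "q3_K": "iq4_nl",  # this log
    "q4_K": "q5_0",    # Q4_K_M log below
    "q6_K": "q8_0",    # Q4_K_M log below
}

def pick_type(requested: str, row_len: int) -> str:
    """Return the requested k-quant, or its fallback when the tensor's
    row length is not a multiple of the 256-wide super-block."""
    if row_len % QK_K != 0 and requested in FALLBACK:
        return FALLBACK[requested]
    return requested

# ffn_down rows are 5440 wide: 5440 % 256 == 64, so q3_K falls back to iq4_nl
assert pick_type("q3_K", 5440) == "iq4_nl"
# attention tensors are 2048 wide: 2048 % 256 == 0, so q3_K is used as-is
assert pick_type("q3_K", 2048) == "q3_K"
```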
Q4_K_M_log.txt
ADDED
@@ -0,0 +1,339 @@
main: build = 3906 (7eee341b)
main: built with Apple clang version 15.0.0 (clang-1500.3.9.4) for arm64-apple-darwin23.6.0
main: quantizing 'salamandra-2b_bf16.gguf' to './salamandra-2b_Q4_K_M.gguf' as Q4_K_M
llama_model_loader: loaded meta data with 29 key-value pairs and 219 tensors from salamandra-2b_bf16.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv 0: general.architecture str = llama
llama_model_loader: - kv 1: general.type str = model
llama_model_loader: - kv 2: general.size_label str = 2.3B
llama_model_loader: - kv 3: general.license str = apache-2.0
llama_model_loader: - kv 4: general.tags arr[str,1] = ["text-generation"]
llama_model_loader: - kv 5: general.languages arr[str,36] = ["bg", "ca", "code", "cs", "cy", "da"...
llama_model_loader: - kv 6: llama.block_count u32 = 24
llama_model_loader: - kv 7: llama.context_length u32 = 8192
llama_model_loader: - kv 8: llama.embedding_length u32 = 2048
llama_model_loader: - kv 9: llama.feed_forward_length u32 = 5440
llama_model_loader: - kv 10: llama.attention.head_count u32 = 16
llama_model_loader: - kv 11: llama.attention.head_count_kv u32 = 16
llama_model_loader: - kv 12: llama.rope.freq_base f32 = 10000.000000
llama_model_loader: - kv 13: llama.attention.layer_norm_rms_epsilon f32 = 0.000010
llama_model_loader: - kv 14: general.file_type u32 = 32
llama_model_loader: - kv 15: llama.vocab_size u32 = 256000
llama_model_loader: - kv 16: llama.rope.dimension_count u32 = 128
llama_model_loader: - kv 17: tokenizer.ggml.add_space_prefix bool = true
llama_model_loader: - kv 18: tokenizer.ggml.model str = llama
llama_model_loader: - kv 19: tokenizer.ggml.pre str = default
llama_model_loader: - kv 20: tokenizer.ggml.tokens arr[str,256000] = ["<unk>", "<s>", "</s>", "<pad>", "<|...
llama_model_loader: - kv 21: tokenizer.ggml.scores arr[f32,256000] = [-1000.000000, -1000.000000, -1000.00...
llama_model_loader: - kv 22: tokenizer.ggml.token_type arr[i32,256000] = [3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...
llama_model_loader: - kv 23: tokenizer.ggml.bos_token_id u32 = 1
llama_model_loader: - kv 24: tokenizer.ggml.eos_token_id u32 = 2
llama_model_loader: - kv 25: tokenizer.ggml.unknown_token_id u32 = 0
llama_model_loader: - kv 26: tokenizer.ggml.add_bos_token bool = true
llama_model_loader: - kv 27: tokenizer.ggml.add_eos_token bool = false
llama_model_loader: - kv 28: general.quantization_version u32 = 2
llama_model_loader: - type f32: 49 tensors
llama_model_loader: - type bf16: 170 tensors
================================ Have weights data with 168 entries
[ 1/ 219] output.weight - [ 2048, 256000, 1, 1], type = bf16, size = 1000.000 MB
[ 2/ 219] token_embd.weight - [ 2048, 256000, 1, 1], type = bf16,
====== llama_model_quantize_internal: did not find weights for token_embd.weight
converting to q4_K .. load_imatrix: imatrix dataset='./imatrix/oscar/imatrix-dataset.txt'
load_imatrix: loaded 168 importance matrix entries from imatrix/oscar/imatrix.dat computed on 44176 chunks
prepare_imatrix: have 168 importance matrix entries
size = 1000.00 MiB -> 281.25 MiB
[ 3/ 219] blk.0.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 4/ 219] blk.0.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q6_K - using fallback quantization q8_0
converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
[ 5/ 219] blk.0.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 6/ 219] blk.0.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 7/ 219] blk.0.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 8/ 219] blk.0.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 9/ 219] blk.0.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 10/ 219] blk.0.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 11/ 219] blk.0.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
[ 12/ 219] blk.1.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 13/ 219] blk.1.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q6_K - using fallback quantization q8_0
converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
[ 14/ 219] blk.1.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 15/ 219] blk.1.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 16/ 219] blk.1.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 17/ 219] blk.1.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 18/ 219] blk.1.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 19/ 219] blk.1.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 20/ 219] blk.1.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
[ 21/ 219] blk.10.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 22/ 219] blk.10.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q6_K - using fallback quantization q8_0
converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
[ 23/ 219] blk.10.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 24/ 219] blk.10.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 25/ 219] blk.10.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 26/ 219] blk.10.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 27/ 219] blk.10.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 28/ 219] blk.10.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 29/ 219] blk.10.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
[ 30/ 219] blk.11.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 31/ 219] blk.11.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q4_K - using fallback quantization q5_0
converting to q5_0 .. size = 21.25 MiB -> 7.30 MiB
[ 32/ 219] blk.11.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 33/ 219] blk.11.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 34/ 219] blk.11.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 35/ 219] blk.11.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 36/ 219] blk.11.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 37/ 219] blk.11.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 38/ 219] blk.11.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 39/ 219] blk.12.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 40/ 219] blk.12.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q4_K - using fallback quantization q5_0
converting to q5_0 .. size = 21.25 MiB -> 7.30 MiB
[ 41/ 219] blk.12.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 42/ 219] blk.12.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 43/ 219] blk.12.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 44/ 219] blk.12.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 45/ 219] blk.12.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 46/ 219] blk.12.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 47/ 219] blk.12.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 48/ 219] blk.13.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 49/ 219] blk.13.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q6_K - using fallback quantization q8_0
converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
[ 50/ 219] blk.13.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 51/ 219] blk.13.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 52/ 219] blk.13.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 53/ 219] blk.13.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 54/ 219] blk.13.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 55/ 219] blk.13.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 56/ 219] blk.13.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
[ 57/ 219] blk.14.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 58/ 219] blk.14.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q4_K - using fallback quantization q5_0
converting to q5_0 .. size = 21.25 MiB -> 7.30 MiB
[ 59/ 219] blk.14.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 60/ 219] blk.14.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 61/ 219] blk.14.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 62/ 219] blk.14.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 63/ 219] blk.14.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 64/ 219] blk.14.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 65/ 219] blk.14.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 66/ 219] blk.15.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 67/ 219] blk.15.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q4_K - using fallback quantization q5_0
converting to q5_0 .. size = 21.25 MiB -> 7.30 MiB
[ 68/ 219] blk.15.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 69/ 219] blk.15.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 70/ 219] blk.15.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 71/ 219] blk.15.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 72/ 219] blk.15.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 73/ 219] blk.15.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 74/ 219] blk.15.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 75/ 219] blk.16.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 76/ 219] blk.16.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q6_K - using fallback quantization q8_0
converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
[ 77/ 219] blk.16.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 78/ 219] blk.16.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 79/ 219] blk.16.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 80/ 219] blk.16.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 81/ 219] blk.16.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 82/ 219] blk.16.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 83/ 219] blk.16.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
[ 84/ 219] blk.17.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 85/ 219] blk.17.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q4_K - using fallback quantization q5_0
converting to q5_0 .. size = 21.25 MiB -> 7.30 MiB
[ 86/ 219] blk.17.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 87/ 219] blk.17.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 88/ 219] blk.17.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 89/ 219] blk.17.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 90/ 219] blk.17.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 91/ 219] blk.17.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 92/ 219] blk.17.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 93/ 219] blk.18.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 94/ 219] blk.18.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q4_K - using fallback quantization q5_0
converting to q5_0 .. size = 21.25 MiB -> 7.30 MiB
[ 95/ 219] blk.18.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 96/ 219] blk.18.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 97/ 219] blk.18.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 98/ 219] blk.18.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 99/ 219] blk.18.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 100/ 219] blk.18.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 101/ 219] blk.18.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 102/ 219] blk.19.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 103/ 219] blk.19.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q6_K - using fallback quantization q8_0
converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
[ 104/ 219] blk.19.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 105/ 219] blk.19.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 106/ 219] blk.19.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 107/ 219] blk.19.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 108/ 219] blk.19.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 109/ 219] blk.19.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 110/ 219] blk.19.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
[ 111/ 219] blk.2.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 112/ 219] blk.2.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q4_K - using fallback quantization q5_0
converting to q5_0 .. size = 21.25 MiB -> 7.30 MiB
[ 113/ 219] blk.2.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 114/ 219] blk.2.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 115/ 219] blk.2.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 116/ 219] blk.2.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 117/ 219] blk.2.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 118/ 219] blk.2.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 119/ 219] blk.2.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 120/ 219] blk.20.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 121/ 219] blk.20.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q4_K - using fallback quantization q5_0
converting to q5_0 .. size = 21.25 MiB -> 7.30 MiB
[ 122/ 219] blk.20.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 123/ 219] blk.20.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 124/ 219] blk.20.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 125/ 219] blk.20.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 126/ 219] blk.20.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 127/ 219] blk.20.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 128/ 219] blk.20.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 129/ 219] blk.21.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 130/ 219] blk.21.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q6_K - using fallback quantization q8_0
converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
[ 131/ 219] blk.21.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 132/ 219] blk.21.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 133/ 219] blk.21.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 134/ 219] blk.21.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 135/ 219] blk.21.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 136/ 219] blk.21.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 137/ 219] blk.21.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
[ 138/ 219] blk.22.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 139/ 219] blk.22.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q4_K - using fallback quantization q5_0
converting to q5_0 .. size = 21.25 MiB -> 7.30 MiB
[ 140/ 219] blk.22.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 141/ 219] blk.22.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 142/ 219] blk.22.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 143/ 219] blk.22.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 144/ 219] blk.22.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 145/ 219] blk.22.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 146/ 219] blk.22.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 147/ 219] blk.23.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 148/ 219] blk.23.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q4_K - using fallback quantization q5_0
converting to q5_0 .. size = 21.25 MiB -> 7.30 MiB
[ 149/ 219] blk.23.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 150/ 219] blk.23.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 151/ 219] blk.23.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 152/ 219] blk.23.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 153/ 219] blk.23.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 154/ 219] blk.23.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 155/ 219] blk.23.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 156/ 219] blk.3.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 157/ 219] blk.3.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q6_K - using fallback quantization q8_0
converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
[ 158/ 219] blk.3.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 159/ 219] blk.3.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 160/ 219] blk.3.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 161/ 219] blk.3.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 162/ 219] blk.3.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 163/ 219] blk.3.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 164/ 219] blk.3.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
[ 165/ 219] blk.4.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 166/ 219] blk.4.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q4_K - using fallback quantization q5_0
converting to q5_0 .. size = 21.25 MiB -> 7.30 MiB
[ 167/ 219] blk.4.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 168/ 219] blk.4.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 169/ 219] blk.4.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 170/ 219] blk.4.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 171/ 219] blk.4.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 172/ 219] blk.4.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 173/ 219] blk.4.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 174/ 219] blk.5.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 175/ 219] blk.5.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q4_K - using fallback quantization q5_0
converting to q5_0 .. size = 21.25 MiB -> 7.30 MiB
[ 176/ 219] blk.5.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 177/ 219] blk.5.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 178/ 219] blk.5.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 179/ 219] blk.5.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 180/ 219] blk.5.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 181/ 219] blk.5.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 182/ 219] blk.5.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 183/ 219] blk.6.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 184/ 219] blk.6.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q6_K - using fallback quantization q8_0
converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
[ 185/ 219] blk.6.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 186/ 219] blk.6.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 187/ 219] blk.6.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 188/ 219] blk.6.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 189/ 219] blk.6.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 190/ 219] blk.6.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 191/ 219] blk.6.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
[ 192/ 219] blk.7.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 193/ 219] blk.7.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q6_K - using fallback quantization q8_0
converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
[ 194/ 219] blk.7.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 195/ 219] blk.7.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 196/ 219] blk.7.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 197/ 219] blk.7.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 198/ 219] blk.7.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 199/ 219] blk.7.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 200/ 219] blk.7.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
[ 201/ 219] blk.8.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 202/ 219] blk.8.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q6_K - using fallback quantization q8_0
converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
[ 203/ 219] blk.8.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 204/ 219] blk.8.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 205/ 219] blk.8.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 206/ 219] blk.8.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 207/ 219] blk.8.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 208/ 219] blk.8.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 209/ 219] blk.8.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
[ 210/ 219] blk.9.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 211/ 219] blk.9.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q6_K - using fallback quantization q8_0
converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
[ 212/ 219] blk.9.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 213/ 219] blk.9.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 214/ 219] blk.9.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 215/ 219] blk.9.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 216/ 219] blk.9.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 217/ 219] blk.9.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 218/ 219] blk.9.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
[ 219/ 219] output_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
llama_model_quantize_internal: model size = 4298.38 MB
llama_model_quantize_internal: quant size = 2020.01 MB
llama_model_quantize_internal: WARNING: 24 of 169 tensor(s) required fallback quantization

main: quantize time = 8902.98 ms
main: total time = 8902.98 ms
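The sizes printed in the Q4_K_M log are enough to sanity-check the effective bit widths: bf16 stores 16 bits per weight, so each tensor's compression ratio maps directly to bits per weight. A small arithmetic sketch (plain Python, no llama.cpp dependency; all values taken from the log above):

```python
# Effective bits per weight implied by the tensor sizes in the Q4_K_M log.
def bpw(bf16_mib: float, quant_mib: float) -> float:
    return 16.0 * quant_mib / bf16_mib

print(f"q4_K ffn_gate/up: {bpw(21.25, 5.98):.2f} bpw")      # ~4.50
print(f"q5_0 ffn_down:    {bpw(21.25, 7.30):.2f} bpw")      # ~5.50
print(f"q8_0 ffn_down:    {bpw(21.25, 11.29):.2f} bpw")     # ~8.50
print(f"q6_K attn_v:      {bpw(8.00, 3.28):.2f} bpw")       # ~6.56
print(f"whole model:      {bpw(4298.38, 2020.01):.2f} bpw") # ~7.52
```

The whole-model average lands well above q4_K's nominal ~4.5 bpw because `output.weight` (1000 MB, roughly a quarter of the model) is logged without a conversion step and so appears to have been left in bf16, while `token_embd.weight` was quantized to q4_K (1000.00 MiB -> 281.25 MiB) even though the importance matrix loaded from imatrix/oscar/imatrix.dat (168 entries computed on 44176 chunks) reports no entry for it.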
Q4_K_S_log.txt
ADDED
@@ -0,0 +1,339 @@
main: build = 3906 (7eee341b)
main: built with Apple clang version 15.0.0 (clang-1500.3.9.4) for arm64-apple-darwin23.6.0
main: quantizing 'salamandra-2b_bf16.gguf' to './salamandra-2b_Q4_K_S.gguf' as Q4_K_S
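For context, this log is the output of llama.cpp's quantization tool; a run like the one above would be launched roughly as follows (file paths taken from the log itself; the exact binary name and flag spelling depend on the llama.cpp checkout, so treat this as an illustrative sketch rather than the recorded command):

    ./llama-quantize --imatrix imatrix/oscar/imatrix.dat salamandra-2b_bf16.gguf ./salamandra-2b_Q4_K_S.gguf Q4_K_S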
llama_model_loader: loaded meta data with 29 key-value pairs and 219 tensors from salamandra-2b_bf16.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv 0: general.architecture str = llama
llama_model_loader: - kv 1: general.type str = model
llama_model_loader: - kv 2: general.size_label str = 2.3B
llama_model_loader: - kv 3: general.license str = apache-2.0
llama_model_loader: - kv 4: general.tags arr[str,1] = ["text-generation"]
llama_model_loader: - kv 5: general.languages arr[str,36] = ["bg", "ca", "code", "cs", "cy", "da"...
llama_model_loader: - kv 6: llama.block_count u32 = 24
llama_model_loader: - kv 7: llama.context_length u32 = 8192
llama_model_loader: - kv 8: llama.embedding_length u32 = 2048
llama_model_loader: - kv 9: llama.feed_forward_length u32 = 5440
llama_model_loader: - kv 10: llama.attention.head_count u32 = 16
llama_model_loader: - kv 11: llama.attention.head_count_kv u32 = 16
llama_model_loader: - kv 12: llama.rope.freq_base f32 = 10000.000000
llama_model_loader: - kv 13: llama.attention.layer_norm_rms_epsilon f32 = 0.000010
llama_model_loader: - kv 14: general.file_type u32 = 32
llama_model_loader: - kv 15: llama.vocab_size u32 = 256000
llama_model_loader: - kv 16: llama.rope.dimension_count u32 = 128
llama_model_loader: - kv 17: tokenizer.ggml.add_space_prefix bool = true
llama_model_loader: - kv 18: tokenizer.ggml.model str = llama
llama_model_loader: - kv 19: tokenizer.ggml.pre str = default
llama_model_loader: - kv 20: tokenizer.ggml.tokens arr[str,256000] = ["<unk>", "<s>", "</s>", "<pad>", "<|...
llama_model_loader: - kv 21: tokenizer.ggml.scores arr[f32,256000] = [-1000.000000, -1000.000000, -1000.00...
llama_model_loader: - kv 22: tokenizer.ggml.token_type arr[i32,256000] = [3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...
llama_model_loader: - kv 23: tokenizer.ggml.bos_token_id u32 = 1
llama_model_loader: - kv 24: tokenizer.ggml.eos_token_id u32 = 2
llama_model_loader: - kv 25: tokenizer.ggml.unknown_token_id u32 = 0
llama_model_loader: - kv 26: tokenizer.ggml.add_bos_token bool = true
llama_model_loader: - kv 27: tokenizer.ggml.add_eos_token bool = false
llama_model_loader: - kv 28: general.quantization_version u32 = 2
llama_model_loader: - type f32: 49 tensors
llama_model_loader: - type bf16: 170 tensors
================================ Have weights data with 168 entries
[ 1/ 219] output.weight - [ 2048, 256000, 1, 1], type = bf16, size = 1000.000 MB
[ 2/ 219] token_embd.weight - [ 2048, 256000, 1, 1], type = bf16,
====== llama_model_quantize_internal: did not find weights for token_embd.weight
converting to q4_K .. load_imatrix: imatrix dataset='./imatrix/oscar/imatrix-dataset.txt'
load_imatrix: loaded 168 importance matrix entries from imatrix/oscar/imatrix.dat computed on 44176 chunks
prepare_imatrix: have 168 importance matrix entries
size = 1000.00 MiB -> 281.25 MiB
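The importance matrix loaded above biases each tensor's quantization toward the weight columns that matter most on a calibration corpus. A file like imatrix/oscar/imatrix.dat is produced beforehand with llama.cpp's imatrix tool, along these lines (illustrative; the dataset path is the one named in the log, and the "44176 chunks" above is the number of text chunks the statistics were accumulated over):

    ./llama-imatrix -m salamandra-2b_bf16.gguf -f ./imatrix/oscar/imatrix-dataset.txt -o imatrix/oscar/imatrix.dat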
[ 3/ 219] blk.0.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 4/ 219] blk.0.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q5_K - using fallback quantization q5_1
converting to q5_1 .. size = 21.25 MiB -> 7.97 MiB
[ 5/ 219] blk.0.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 6/ 219] blk.0.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 7/ 219] blk.0.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 8/ 219] blk.0.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 9/ 219] blk.0.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 10/ 219] blk.0.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 11/ 219] blk.0.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 12/ 219] blk.1.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 13/ 219] blk.1.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q5_K - using fallback quantization q5_1
converting to q5_1 .. size = 21.25 MiB -> 7.97 MiB
[ 14/ 219] blk.1.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 15/ 219] blk.1.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 16/ 219] blk.1.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 17/ 219] blk.1.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 18/ 219] blk.1.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 19/ 219] blk.1.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 20/ 219] blk.1.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 21/ 219] blk.10.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 22/ 219] blk.10.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q5_K - using fallback quantization q5_1
converting to q5_1 .. size = 21.25 MiB -> 7.97 MiB
[ 23/ 219] blk.10.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 24/ 219] blk.10.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 25/ 219] blk.10.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 26/ 219] blk.10.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 27/ 219] blk.10.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 28/ 219] blk.10.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 29/ 219] blk.10.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 30/ 219] blk.11.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 31/ 219] blk.11.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q4_K - using fallback quantization q5_0
converting to q5_0 .. size = 21.25 MiB -> 7.30 MiB
[ 32/ 219] blk.11.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 33/ 219] blk.11.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 34/ 219] blk.11.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 35/ 219] blk.11.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 36/ 219] blk.11.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 37/ 219] blk.11.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 38/ 219] blk.11.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 39/ 219] blk.12.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 40/ 219] blk.12.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q4_K - using fallback quantization q5_0
converting to q5_0 .. size = 21.25 MiB -> 7.30 MiB
[ 41/ 219] blk.12.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 42/ 219] blk.12.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 43/ 219] blk.12.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 44/ 219] blk.12.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 45/ 219] blk.12.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 46/ 219] blk.12.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 47/ 219] blk.12.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 48/ 219] blk.13.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 49/ 219] blk.13.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q4_K - using fallback quantization q5_0
converting to q5_0 .. size = 21.25 MiB -> 7.30 MiB
[ 50/ 219] blk.13.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 51/ 219] blk.13.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 52/ 219] blk.13.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 53/ 219] blk.13.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 54/ 219] blk.13.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 55/ 219] blk.13.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 56/ 219] blk.13.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 57/ 219] blk.14.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 58/ 219] blk.14.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q4_K - using fallback quantization q5_0
converting to q5_0 .. size = 21.25 MiB -> 7.30 MiB
[ 59/ 219] blk.14.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 60/ 219] blk.14.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 61/ 219] blk.14.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 62/ 219] blk.14.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 63/ 219] blk.14.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 64/ 219] blk.14.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 65/ 219] blk.14.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 66/ 219] blk.15.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 67/ 219] blk.15.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q4_K - using fallback quantization q5_0
converting to q5_0 .. size = 21.25 MiB -> 7.30 MiB
[ 68/ 219] blk.15.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 69/ 219] blk.15.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 70/ 219] blk.15.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 71/ 219] blk.15.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 72/ 219] blk.15.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 73/ 219] blk.15.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 74/ 219] blk.15.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 75/ 219] blk.16.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 76/ 219] blk.16.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q4_K - using fallback quantization q5_0
converting to q5_0 .. size = 21.25 MiB -> 7.30 MiB
[ 77/ 219] blk.16.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 78/ 219] blk.16.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 79/ 219] blk.16.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 80/ 219] blk.16.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 81/ 219] blk.16.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 82/ 219] blk.16.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 83/ 219] blk.16.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 84/ 219] blk.17.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 85/ 219] blk.17.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q4_K - using fallback quantization q5_0
converting to q5_0 .. size = 21.25 MiB -> 7.30 MiB
[ 86/ 219] blk.17.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 87/ 219] blk.17.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 88/ 219] blk.17.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 89/ 219] blk.17.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 90/ 219] blk.17.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 91/ 219] blk.17.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 92/ 219] blk.17.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 93/ 219] blk.18.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 94/ 219] blk.18.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q4_K - using fallback quantization q5_0
converting to q5_0 .. size = 21.25 MiB -> 7.30 MiB
[ 95/ 219] blk.18.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 96/ 219] blk.18.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 97/ 219] blk.18.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 98/ 219] blk.18.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 99/ 219] blk.18.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 100/ 219] blk.18.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 101/ 219] blk.18.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 102/ 219] blk.19.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 103/ 219] blk.19.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q4_K - using fallback quantization q5_0
converting to q5_0 .. size = 21.25 MiB -> 7.30 MiB
[ 104/ 219] blk.19.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 105/ 219] blk.19.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 106/ 219] blk.19.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 107/ 219] blk.19.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 108/ 219] blk.19.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 109/ 219] blk.19.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 110/ 219] blk.19.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 111/ 219] blk.2.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 112/ 219] blk.2.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q4_K - using fallback quantization q5_0
converting to q5_0 .. size = 21.25 MiB -> 7.30 MiB
[ 113/ 219] blk.2.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 114/ 219] blk.2.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 115/ 219] blk.2.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 116/ 219] blk.2.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 117/ 219] blk.2.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 118/ 219] blk.2.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 119/ 219] blk.2.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 120/ 219] blk.20.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 121/ 219] blk.20.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q4_K - using fallback quantization q5_0
converting to q5_0 .. size = 21.25 MiB -> 7.30 MiB
[ 122/ 219] blk.20.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 123/ 219] blk.20.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 124/ 219] blk.20.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 125/ 219] blk.20.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 126/ 219] blk.20.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 127/ 219] blk.20.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 128/ 219] blk.20.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 129/ 219] blk.21.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 130/ 219] blk.21.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q4_K - using fallback quantization q5_0
converting to q5_0 .. size = 21.25 MiB -> 7.30 MiB
[ 131/ 219] blk.21.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 132/ 219] blk.21.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 133/ 219] blk.21.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 134/ 219] blk.21.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 135/ 219] blk.21.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 136/ 219] blk.21.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 137/ 219] blk.21.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 138/ 219] blk.22.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 139/ 219] blk.22.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q4_K - using fallback quantization q5_0
converting to q5_0 .. size = 21.25 MiB -> 7.30 MiB
[ 140/ 219] blk.22.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 141/ 219] blk.22.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 142/ 219] blk.22.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 143/ 219] blk.22.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 144/ 219] blk.22.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 145/ 219] blk.22.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 146/ 219] blk.22.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 147/ 219] blk.23.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 148/ 219] blk.23.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q4_K - using fallback quantization q5_0
converting to q5_0 .. size = 21.25 MiB -> 7.30 MiB
[ 149/ 219] blk.23.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 150/ 219] blk.23.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 151/ 219] blk.23.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 152/ 219] blk.23.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 153/ 219] blk.23.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 154/ 219] blk.23.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 155/ 219] blk.23.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 156/ 219] blk.3.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 157/ 219] blk.3.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q4_K - using fallback quantization q5_0
converting to q5_0 .. size = 21.25 MiB -> 7.30 MiB
[ 158/ 219] blk.3.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 159/ 219] blk.3.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 160/ 219] blk.3.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 161/ 219] blk.3.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 162/ 219] blk.3.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 163/ 219] blk.3.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 164/ 219] blk.3.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 165/ 219] blk.4.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 166/ 219] blk.4.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q4_K - using fallback quantization q5_0
converting to q5_0 .. size = 21.25 MiB -> 7.30 MiB
[ 167/ 219] blk.4.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 168/ 219] blk.4.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 169/ 219] blk.4.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 170/ 219] blk.4.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 171/ 219] blk.4.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 172/ 219] blk.4.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 173/ 219] blk.4.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 174/ 219] blk.5.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 175/ 219] blk.5.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q4_K - using fallback quantization q5_0
converting to q5_0 .. size = 21.25 MiB -> 7.30 MiB
[ 176/ 219] blk.5.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 177/ 219] blk.5.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 178/ 219] blk.5.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 179/ 219] blk.5.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 180/ 219] blk.5.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 181/ 219] blk.5.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 182/ 219] blk.5.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 183/ 219] blk.6.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 184/ 219] blk.6.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q4_K - using fallback quantization q5_0
converting to q5_0 .. size = 21.25 MiB -> 7.30 MiB
[ 185/ 219] blk.6.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 186/ 219] blk.6.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 187/ 219] blk.6.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 188/ 219] blk.6.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 189/ 219] blk.6.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 190/ 219] blk.6.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 191/ 219] blk.6.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 192/ 219] blk.7.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 193/ 219] blk.7.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q4_K - using fallback quantization q5_0
converting to q5_0 .. size = 21.25 MiB -> 7.30 MiB
[ 194/ 219] blk.7.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 195/ 219] blk.7.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 196/ 219] blk.7.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 197/ 219] blk.7.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 198/ 219] blk.7.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 199/ 219] blk.7.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 200/ 219] blk.7.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 201/ 219] blk.8.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 202/ 219] blk.8.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q4_K - using fallback quantization q5_0
converting to q5_0 .. size = 21.25 MiB -> 7.30 MiB
[ 203/ 219] blk.8.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 204/ 219] blk.8.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 205/ 219] blk.8.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 206/ 219] blk.8.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 207/ 219] blk.8.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 208/ 219] blk.8.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 209/ 219] blk.8.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 210/ 219] blk.9.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 211/ 219] blk.9.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q4_K - using fallback quantization q5_0
converting to q5_0 .. size = 21.25 MiB -> 7.30 MiB
[ 212/ 219] blk.9.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 213/ 219] blk.9.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q4_K .. size = 21.25 MiB -> 5.98 MiB
[ 214/ 219] blk.9.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 215/ 219] blk.9.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 216/ 219] blk.9.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 217/ 219] blk.9.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 218/ 219] blk.9.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q4_K .. size = 8.00 MiB -> 2.25 MiB
[ 219/ 219] output_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
llama_model_quantize_internal: model size = 4298.38 MB
llama_model_quantize_internal: quant size = 1963.81 MB
llama_model_quantize_internal: WARNING: 24 of 169 tensor(s) required fallback quantization

main: quantize time = 9350.38 ms
main: total time = 9350.38 ms
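The per-tensor sizes in these logs follow directly from each GGML type's bits per weight: bf16 stores 16 bits per weight, while the block layouts used here work out to 4.5 bits for q4_K (144 bytes per 256 weights), 5.5 for q5_K and q5_0, 6.0 for q5_1, 6.5625 for q6_K, and 8.5 for q8_0 (34 bytes per 32 weights). A small sketch that reproduces the logged numbers from those layouts (the block sizes are the standard GGML ones; the helper function is mine):

    # (bytes per block, weights per block) for the GGML types in these logs
    BLOCK = {
        "q4_K": (144, 256), "q5_K": (176, 256), "q6_K": (210, 256),
        "q8_0": (34, 32), "q5_1": (24, 32), "q5_0": (22, 32),
    }

    def quantized_mib(bf16_mib: float, qtype: str) -> float:
        bytes_per_block, weights_per_block = BLOCK[qtype]
        bits_per_weight = 8 * bytes_per_block / weights_per_block
        return bf16_mib * bits_per_weight / 16  # bf16 = 16 bits per weight

    print(quantized_mib(1000.0, "q4_K"))  # 281.25     -> token_embd.weight
    print(quantized_mib(21.25, "q5_0"))   # 7.3046875  -> ffn_down q5_0 fallback
    print(quantized_mib(21.25, "q8_0"))   # 11.2890625 -> ffn_down q8_0 fallback

The file-level ratio is consistent too: 1963.81 / 4298.38 is about 0.457, i.e. roughly 7.3 effective bits per weight for the Q4_K_S file; the output.weight line above shows no conversion, so it appears to stay at bf16 and pulls the average well above 4.5 bits.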
Q5_K_M_log.txt
ADDED
@@ -0,0 +1,339 @@
main: build = 3906 (7eee341b)
main: built with Apple clang version 15.0.0 (clang-1500.3.9.4) for arm64-apple-darwin23.6.0
main: quantizing 'salamandra-2b_bf16.gguf' to './salamandra-2b_Q5_K_M.gguf' as Q5_K_M
llama_model_loader: loaded meta data with 29 key-value pairs and 219 tensors from salamandra-2b_bf16.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv 0: general.architecture str = llama
llama_model_loader: - kv 1: general.type str = model
llama_model_loader: - kv 2: general.size_label str = 2.3B
llama_model_loader: - kv 3: general.license str = apache-2.0
llama_model_loader: - kv 4: general.tags arr[str,1] = ["text-generation"]
llama_model_loader: - kv 5: general.languages arr[str,36] = ["bg", "ca", "code", "cs", "cy", "da"...
llama_model_loader: - kv 6: llama.block_count u32 = 24
llama_model_loader: - kv 7: llama.context_length u32 = 8192
llama_model_loader: - kv 8: llama.embedding_length u32 = 2048
llama_model_loader: - kv 9: llama.feed_forward_length u32 = 5440
llama_model_loader: - kv 10: llama.attention.head_count u32 = 16
llama_model_loader: - kv 11: llama.attention.head_count_kv u32 = 16
llama_model_loader: - kv 12: llama.rope.freq_base f32 = 10000.000000
llama_model_loader: - kv 13: llama.attention.layer_norm_rms_epsilon f32 = 0.000010
llama_model_loader: - kv 14: general.file_type u32 = 32
llama_model_loader: - kv 15: llama.vocab_size u32 = 256000
llama_model_loader: - kv 16: llama.rope.dimension_count u32 = 128
llama_model_loader: - kv 17: tokenizer.ggml.add_space_prefix bool = true
llama_model_loader: - kv 18: tokenizer.ggml.model str = llama
llama_model_loader: - kv 19: tokenizer.ggml.pre str = default
llama_model_loader: - kv 20: tokenizer.ggml.tokens arr[str,256000] = ["<unk>", "<s>", "</s>", "<pad>", "<|...
llama_model_loader: - kv 21: tokenizer.ggml.scores arr[f32,256000] = [-1000.000000, -1000.000000, -1000.00...
llama_model_loader: - kv 22: tokenizer.ggml.token_type arr[i32,256000] = [3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...
llama_model_loader: - kv 23: tokenizer.ggml.bos_token_id u32 = 1
llama_model_loader: - kv 24: tokenizer.ggml.eos_token_id u32 = 2
llama_model_loader: - kv 25: tokenizer.ggml.unknown_token_id u32 = 0
llama_model_loader: - kv 26: tokenizer.ggml.add_bos_token bool = true
llama_model_loader: - kv 27: tokenizer.ggml.add_eos_token bool = false
llama_model_loader: - kv 28: general.quantization_version u32 = 2
llama_model_loader: - type f32: 49 tensors
llama_model_loader: - type bf16: 170 tensors
================================ Have weights data with 168 entries
[ 1/ 219] output.weight - [ 2048, 256000, 1, 1], type = bf16, size = 1000.000 MB
[ 2/ 219] token_embd.weight - [ 2048, 256000, 1, 1], type = bf16,
====== llama_model_quantize_internal: did not find weights for token_embd.weight
converting to q5_K .. load_imatrix: imatrix dataset='./imatrix/oscar/imatrix-dataset.txt'
load_imatrix: loaded 168 importance matrix entries from imatrix/oscar/imatrix.dat computed on 44176 chunks
prepare_imatrix: have 168 importance matrix entries
size = 1000.00 MiB -> 343.75 MiB
[ 3/ 219] blk.0.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 4/ 219] blk.0.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q6_K - using fallback quantization q8_0
converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
[ 5/ 219] blk.0.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 6/ 219] blk.0.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 7/ 219] blk.0.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 8/ 219] blk.0.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 9/ 219] blk.0.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 10/ 219] blk.0.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 11/ 219] blk.0.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
[ 12/ 219] blk.1.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 13/ 219] blk.1.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q6_K - using fallback quantization q8_0
converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
[ 14/ 219] blk.1.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 15/ 219] blk.1.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 16/ 219] blk.1.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 17/ 219] blk.1.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 18/ 219] blk.1.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 19/ 219] blk.1.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 20/ 219] blk.1.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
[ 21/ 219] blk.10.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 22/ 219] blk.10.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q6_K - using fallback quantization q8_0
converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
[ 23/ 219] blk.10.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 24/ 219] blk.10.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 25/ 219] blk.10.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 26/ 219] blk.10.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 27/ 219] blk.10.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 28/ 219] blk.10.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 29/ 219] blk.10.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
[ 30/ 219] blk.11.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 31/ 219] blk.11.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q5_K - using fallback quantization q5_1
converting to q5_1 .. size = 21.25 MiB -> 7.97 MiB
[ 32/ 219] blk.11.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 33/ 219] blk.11.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 34/ 219] blk.11.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 35/ 219] blk.11.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 36/ 219] blk.11.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 37/ 219] blk.11.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 38/ 219] blk.11.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 39/ 219] blk.12.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 40/ 219] blk.12.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q5_K - using fallback quantization q5_1
converting to q5_1 .. size = 21.25 MiB -> 7.97 MiB
[ 41/ 219] blk.12.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 42/ 219] blk.12.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 43/ 219] blk.12.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 44/ 219] blk.12.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 45/ 219] blk.12.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 46/ 219] blk.12.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 47/ 219] blk.12.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 48/ 219] blk.13.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 49/ 219] blk.13.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q6_K - using fallback quantization q8_0
converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
[ 50/ 219] blk.13.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 51/ 219] blk.13.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 52/ 219] blk.13.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 53/ 219] blk.13.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 54/ 219] blk.13.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 55/ 219] blk.13.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 56/ 219] blk.13.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
[ 57/ 219] blk.14.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 58/ 219] blk.14.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q5_K - using fallback quantization q5_1
converting to q5_1 .. size = 21.25 MiB -> 7.97 MiB
[ 59/ 219] blk.14.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 60/ 219] blk.14.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 61/ 219] blk.14.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 62/ 219] blk.14.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 63/ 219] blk.14.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 64/ 219] blk.14.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 65/ 219] blk.14.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 66/ 219] blk.15.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 67/ 219] blk.15.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q5_K - using fallback quantization q5_1
converting to q5_1 .. size = 21.25 MiB -> 7.97 MiB
[ 68/ 219] blk.15.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 69/ 219] blk.15.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 70/ 219] blk.15.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 71/ 219] blk.15.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 72/ 219] blk.15.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 73/ 219] blk.15.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 74/ 219] blk.15.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 75/ 219] blk.16.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 76/ 219] blk.16.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q6_K - using fallback quantization q8_0
converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
[ 77/ 219] blk.16.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 78/ 219] blk.16.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 79/ 219] blk.16.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 80/ 219] blk.16.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 81/ 219] blk.16.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 82/ 219] blk.16.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 83/ 219] blk.16.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
[ 84/ 219] blk.17.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 85/ 219] blk.17.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q5_K - using fallback quantization q5_1
converting to q5_1 .. size = 21.25 MiB -> 7.97 MiB
[ 86/ 219] blk.17.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 87/ 219] blk.17.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 88/ 219] blk.17.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 89/ 219] blk.17.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 90/ 219] blk.17.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 91/ 219] blk.17.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 92/ 219] blk.17.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 93/ 219] blk.18.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 94/ 219] blk.18.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q5_K - using fallback quantization q5_1
converting to q5_1 .. size = 21.25 MiB -> 7.97 MiB
[ 95/ 219] blk.18.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 96/ 219] blk.18.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 97/ 219] blk.18.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 98/ 219] blk.18.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 99/ 219] blk.18.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 100/ 219] blk.18.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 101/ 219] blk.18.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 102/ 219] blk.19.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 103/ 219] blk.19.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q6_K - using fallback quantization q8_0
converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
[ 104/ 219] blk.19.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 105/ 219] blk.19.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 106/ 219] blk.19.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 107/ 219] blk.19.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 108/ 219] blk.19.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 109/ 219] blk.19.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 110/ 219] blk.19.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
[ 111/ 219] blk.2.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 112/ 219] blk.2.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q5_K - using fallback quantization q5_1
converting to q5_1 .. size = 21.25 MiB -> 7.97 MiB
[ 113/ 219] blk.2.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 114/ 219] blk.2.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 115/ 219] blk.2.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 116/ 219] blk.2.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 117/ 219] blk.2.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 118/ 219] blk.2.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 119/ 219] blk.2.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 120/ 219] blk.20.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 121/ 219] blk.20.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q5_K - using fallback quantization q5_1
converting to q5_1 .. size = 21.25 MiB -> 7.97 MiB
[ 122/ 219] blk.20.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 123/ 219] blk.20.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 124/ 219] blk.20.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 125/ 219] blk.20.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 126/ 219] blk.20.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 127/ 219] blk.20.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 128/ 219] blk.20.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 129/ 219] blk.21.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 130/ 219] blk.21.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q6_K - using fallback quantization q8_0
converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
[ 131/ 219] blk.21.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 132/ 219] blk.21.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 133/ 219] blk.21.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 134/ 219] blk.21.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 135/ 219] blk.21.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 136/ 219] blk.21.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 137/ 219] blk.21.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
[ 138/ 219] blk.22.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 139/ 219] blk.22.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q5_K - using fallback quantization q5_1
converting to q5_1 .. size = 21.25 MiB -> 7.97 MiB
[ 140/ 219] blk.22.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 141/ 219] blk.22.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 142/ 219] blk.22.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 143/ 219] blk.22.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 144/ 219] blk.22.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 145/ 219] blk.22.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 146/ 219] blk.22.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 147/ 219] blk.23.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 148/ 219] blk.23.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q5_K - using fallback quantization q5_1
converting to q5_1 .. size = 21.25 MiB -> 7.97 MiB
[ 149/ 219] blk.23.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 150/ 219] blk.23.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 151/ 219] blk.23.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 152/ 219] blk.23.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 153/ 219] blk.23.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 154/ 219] blk.23.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
|
248 |
+
[ 155/ 219] blk.23.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
|
249 |
+
[ 156/ 219] blk.3.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
250 |
+
[ 157/ 219] blk.3.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
|
251 |
+
|
252 |
+
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q6_K - using fallback quantization q8_0
|
253 |
+
converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
254 |
+
[ 158/ 219] blk.3.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
|
255 |
+
[ 159/ 219] blk.3.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
|
256 |
+
[ 160/ 219] blk.3.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
257 |
+
[ 161/ 219] blk.3.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
|
258 |
+
[ 162/ 219] blk.3.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
|
259 |
+
[ 163/ 219] blk.3.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
|
260 |
+
[ 164/ 219] blk.3.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
|
261 |
+
[ 165/ 219] blk.4.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
262 |
+
[ 166/ 219] blk.4.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
|
263 |
+
|
264 |
+
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q5_K - using fallback quantization q5_1
|
265 |
+
converting to q5_1 .. size = 21.25 MiB -> 7.97 MiB
|
266 |
+
[ 167/ 219] blk.4.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
|
267 |
+
[ 168/ 219] blk.4.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
|
268 |
+
[ 169/ 219] blk.4.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
269 |
+
[ 170/ 219] blk.4.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
|
270 |
+
[ 171/ 219] blk.4.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
|
271 |
+
[ 172/ 219] blk.4.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
|
272 |
+
[ 173/ 219] blk.4.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
|
273 |
+
[ 174/ 219] blk.5.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
274 |
+
[ 175/ 219] blk.5.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
|
275 |
+
|
276 |
+
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q5_K - using fallback quantization q5_1
|
277 |
+
converting to q5_1 .. size = 21.25 MiB -> 7.97 MiB
|
278 |
+
[ 176/ 219] blk.5.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
|
279 |
+
[ 177/ 219] blk.5.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
|
280 |
+
[ 178/ 219] blk.5.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
281 |
+
[ 179/ 219] blk.5.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
|
282 |
+
[ 180/ 219] blk.5.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
|
283 |
+
[ 181/ 219] blk.5.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
|
284 |
+
[ 182/ 219] blk.5.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
|
285 |
+
[ 183/ 219] blk.6.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
286 |
+
[ 184/ 219] blk.6.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
|
287 |
+
|
288 |
+
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q6_K - using fallback quantization q8_0
|
289 |
+
converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
290 |
+
[ 185/ 219] blk.6.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
|
291 |
+
[ 186/ 219] blk.6.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
|
292 |
+
[ 187/ 219] blk.6.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
293 |
+
[ 188/ 219] blk.6.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
|
294 |
+
[ 189/ 219] blk.6.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
|
295 |
+
[ 190/ 219] blk.6.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
|
296 |
+
[ 191/ 219] blk.6.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
|
297 |
+
[ 192/ 219] blk.7.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
298 |
+
[ 193/ 219] blk.7.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
|
299 |
+
|
300 |
+
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q6_K - using fallback quantization q8_0
|
301 |
+
converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
302 |
+
[ 194/ 219] blk.7.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
|
303 |
+
[ 195/ 219] blk.7.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
|
304 |
+
[ 196/ 219] blk.7.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
305 |
+
[ 197/ 219] blk.7.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
|
306 |
+
[ 198/ 219] blk.7.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
|
307 |
+
[ 199/ 219] blk.7.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
|
308 |
+
[ 200/ 219] blk.7.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
|
309 |
+
[ 201/ 219] blk.8.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
310 |
+
[ 202/ 219] blk.8.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
|
311 |
+
|
312 |
+
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q6_K - using fallback quantization q8_0
|
313 |
+
converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
314 |
+
[ 203/ 219] blk.8.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
|
315 |
+
[ 204/ 219] blk.8.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
|
316 |
+
[ 205/ 219] blk.8.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
317 |
+
[ 206/ 219] blk.8.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
|
318 |
+
[ 207/ 219] blk.8.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
|
319 |
+
[ 208/ 219] blk.8.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
|
320 |
+
[ 209/ 219] blk.8.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
|
321 |
+
[ 210/ 219] blk.9.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
322 |
+
[ 211/ 219] blk.9.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
|
323 |
+
|
324 |
+
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q6_K - using fallback quantization q8_0
|
325 |
+
converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
326 |
+
[ 212/ 219] blk.9.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
|
327 |
+
[ 213/ 219] blk.9.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
|
328 |
+
[ 214/ 219] blk.9.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
329 |
+
[ 215/ 219] blk.9.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
|
330 |
+
[ 216/ 219] blk.9.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
|
331 |
+
[ 217/ 219] blk.9.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
|
332 |
+
[ 218/ 219] blk.9.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
|
333 |
+
[ 219/ 219] output_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
334 |
+
llama_model_quantize_internal: model size = 4298.38 MB
|
335 |
+
llama_model_quantize_internal: quant size = 2196.23 MB
|
336 |
+
llama_model_quantize_internal: WARNING: 24 of 169 tensor(s) required fallback quantization
|
337 |
+
|
338 |
+
main: quantize time = 9522.94 ms
|
339 |
+
main: total time = 9522.94 ms
|
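The repeated llama_tensor_get_type fallbacks in these logs are expected for this model: the k-quant formats (q5_K, q6_K) pack weights in 256-element super-blocks, and the FFN down-projection rows hold 5440 elements (5440 % 256 = 64), so llama.cpp substitutes a legacy format of equal or higher precision instead (q5_1 or q8_0). A minimal Python sketch of that rule, with the constant and the fallback pairs read off the logs rather than copied from the llama.cpp source:

    # Sketch of the k-quant divisibility rule behind the fallback messages.
    # QK_K and the fallback pairs are as observed in the logs above, not
    # taken from llama.cpp itself.
    QK_K = 256                                   # k-quant super-block size
    FALLBACK = {"q5_K": "q5_1", "q6_K": "q8_0"}  # pairs seen in these logs

    def effective_type(row_len: int, wanted: str) -> str:
        # A tensor can only use a k-quant if its row length is a
        # multiple of the super-block size.
        if wanted in FALLBACK and row_len % QK_K != 0:
            return FALLBACK[wanted]
        return wanted

    print(effective_type(5440, "q5_K"))  # q5_1 (ffn_down rows: 5440 % 256 != 0)
    print(effective_type(2048, "q5_K"))  # q5_K (attention rows divide evenly)

This also accounts for the "24 of 169 tensor(s) required fallback quantization" summaries: one ffn_down tensor per layer, across the model's 24 layers (llama.block_count = 24).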
Q5_K_S_log.txt
ADDED
@@ -0,0 +1,339 @@
main: build = 3906 (7eee341b)
main: built with Apple clang version 15.0.0 (clang-1500.3.9.4) for arm64-apple-darwin23.6.0
main: quantizing 'salamandra-2b_bf16.gguf' to './salamandra-2b_Q5_K_S.gguf' as Q5_K_S
llama_model_loader: loaded meta data with 29 key-value pairs and 219 tensors from salamandra-2b_bf16.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv 0: general.architecture str = llama
llama_model_loader: - kv 1: general.type str = model
llama_model_loader: - kv 2: general.size_label str = 2.3B
llama_model_loader: - kv 3: general.license str = apache-2.0
llama_model_loader: - kv 4: general.tags arr[str,1] = ["text-generation"]
llama_model_loader: - kv 5: general.languages arr[str,36] = ["bg", "ca", "code", "cs", "cy", "da"...
llama_model_loader: - kv 6: llama.block_count u32 = 24
llama_model_loader: - kv 7: llama.context_length u32 = 8192
llama_model_loader: - kv 8: llama.embedding_length u32 = 2048
llama_model_loader: - kv 9: llama.feed_forward_length u32 = 5440
llama_model_loader: - kv 10: llama.attention.head_count u32 = 16
llama_model_loader: - kv 11: llama.attention.head_count_kv u32 = 16
llama_model_loader: - kv 12: llama.rope.freq_base f32 = 10000.000000
llama_model_loader: - kv 13: llama.attention.layer_norm_rms_epsilon f32 = 0.000010
llama_model_loader: - kv 14: general.file_type u32 = 32
llama_model_loader: - kv 15: llama.vocab_size u32 = 256000
llama_model_loader: - kv 16: llama.rope.dimension_count u32 = 128
llama_model_loader: - kv 17: tokenizer.ggml.add_space_prefix bool = true
llama_model_loader: - kv 18: tokenizer.ggml.model str = llama
llama_model_loader: - kv 19: tokenizer.ggml.pre str = default
llama_model_loader: - kv 20: tokenizer.ggml.tokens arr[str,256000] = ["<unk>", "<s>", "</s>", "<pad>", "<|...
llama_model_loader: - kv 21: tokenizer.ggml.scores arr[f32,256000] = [-1000.000000, -1000.000000, -1000.00...
llama_model_loader: - kv 22: tokenizer.ggml.token_type arr[i32,256000] = [3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...
llama_model_loader: - kv 23: tokenizer.ggml.bos_token_id u32 = 1
llama_model_loader: - kv 24: tokenizer.ggml.eos_token_id u32 = 2
llama_model_loader: - kv 25: tokenizer.ggml.unknown_token_id u32 = 0
llama_model_loader: - kv 26: tokenizer.ggml.add_bos_token bool = true
llama_model_loader: - kv 27: tokenizer.ggml.add_eos_token bool = false
llama_model_loader: - kv 28: general.quantization_version u32 = 2
llama_model_loader: - type f32: 49 tensors
llama_model_loader: - type bf16: 170 tensors
================================ Have weights data with 168 entries
[ 1/ 219] output.weight - [ 2048, 256000, 1, 1], type = bf16, size = 1000.000 MB
[ 2/ 219] token_embd.weight - [ 2048, 256000, 1, 1], type = bf16,
====== llama_model_quantize_internal: did not find weights for token_embd.weight
converting to q5_K .. load_imatrix: imatrix dataset='./imatrix/oscar/imatrix-dataset.txt'
load_imatrix: loaded 168 importance matrix entries from imatrix/oscar/imatrix.dat computed on 44176 chunks
prepare_imatrix: have 168 importance matrix entries
size = 1000.00 MiB -> 343.75 MiB
[ 3/ 219] blk.0.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 4/ 219] blk.0.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q5_K - using fallback quantization q5_1
converting to q5_1 .. size = 21.25 MiB -> 7.97 MiB
[ 5/ 219] blk.0.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 6/ 219] blk.0.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 7/ 219] blk.0.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 8/ 219] blk.0.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 9/ 219] blk.0.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 10/ 219] blk.0.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 11/ 219] blk.0.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 12/ 219] blk.1.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 13/ 219] blk.1.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q5_K - using fallback quantization q5_1
converting to q5_1 .. size = 21.25 MiB -> 7.97 MiB
[ 14/ 219] blk.1.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 15/ 219] blk.1.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 16/ 219] blk.1.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 17/ 219] blk.1.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 18/ 219] blk.1.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 19/ 219] blk.1.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 20/ 219] blk.1.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 21/ 219] blk.10.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 22/ 219] blk.10.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q5_K - using fallback quantization q5_1
converting to q5_1 .. size = 21.25 MiB -> 7.97 MiB
[ 23/ 219] blk.10.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 24/ 219] blk.10.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 25/ 219] blk.10.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 26/ 219] blk.10.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 27/ 219] blk.10.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 28/ 219] blk.10.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 29/ 219] blk.10.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 30/ 219] blk.11.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 31/ 219] blk.11.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q5_K - using fallback quantization q5_1
converting to q5_1 .. size = 21.25 MiB -> 7.97 MiB
[ 32/ 219] blk.11.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 33/ 219] blk.11.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 34/ 219] blk.11.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 35/ 219] blk.11.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 36/ 219] blk.11.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 37/ 219] blk.11.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 38/ 219] blk.11.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 39/ 219] blk.12.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 40/ 219] blk.12.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q5_K - using fallback quantization q5_1
converting to q5_1 .. size = 21.25 MiB -> 7.97 MiB
[ 41/ 219] blk.12.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 42/ 219] blk.12.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 43/ 219] blk.12.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 44/ 219] blk.12.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 45/ 219] blk.12.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 46/ 219] blk.12.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 47/ 219] blk.12.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 48/ 219] blk.13.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 49/ 219] blk.13.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q5_K - using fallback quantization q5_1
converting to q5_1 .. size = 21.25 MiB -> 7.97 MiB
[ 50/ 219] blk.13.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 51/ 219] blk.13.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 52/ 219] blk.13.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 53/ 219] blk.13.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 54/ 219] blk.13.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 55/ 219] blk.13.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 56/ 219] blk.13.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 57/ 219] blk.14.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 58/ 219] blk.14.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q5_K - using fallback quantization q5_1
converting to q5_1 .. size = 21.25 MiB -> 7.97 MiB
[ 59/ 219] blk.14.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 60/ 219] blk.14.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 61/ 219] blk.14.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 62/ 219] blk.14.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 63/ 219] blk.14.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 64/ 219] blk.14.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 65/ 219] blk.14.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 66/ 219] blk.15.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 67/ 219] blk.15.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q5_K - using fallback quantization q5_1
converting to q5_1 .. size = 21.25 MiB -> 7.97 MiB
[ 68/ 219] blk.15.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 69/ 219] blk.15.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 70/ 219] blk.15.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 71/ 219] blk.15.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 72/ 219] blk.15.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 73/ 219] blk.15.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 74/ 219] blk.15.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 75/ 219] blk.16.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 76/ 219] blk.16.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q5_K - using fallback quantization q5_1
converting to q5_1 .. size = 21.25 MiB -> 7.97 MiB
[ 77/ 219] blk.16.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 78/ 219] blk.16.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 79/ 219] blk.16.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 80/ 219] blk.16.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 81/ 219] blk.16.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 82/ 219] blk.16.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 83/ 219] blk.16.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 84/ 219] blk.17.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 85/ 219] blk.17.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q5_K - using fallback quantization q5_1
converting to q5_1 .. size = 21.25 MiB -> 7.97 MiB
[ 86/ 219] blk.17.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 87/ 219] blk.17.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 88/ 219] blk.17.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 89/ 219] blk.17.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 90/ 219] blk.17.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 91/ 219] blk.17.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 92/ 219] blk.17.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 93/ 219] blk.18.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 94/ 219] blk.18.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q5_K - using fallback quantization q5_1
converting to q5_1 .. size = 21.25 MiB -> 7.97 MiB
[ 95/ 219] blk.18.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 96/ 219] blk.18.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 97/ 219] blk.18.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 98/ 219] blk.18.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 99/ 219] blk.18.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 100/ 219] blk.18.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 101/ 219] blk.18.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 102/ 219] blk.19.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 103/ 219] blk.19.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q5_K - using fallback quantization q5_1
converting to q5_1 .. size = 21.25 MiB -> 7.97 MiB
[ 104/ 219] blk.19.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 105/ 219] blk.19.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 106/ 219] blk.19.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 107/ 219] blk.19.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 108/ 219] blk.19.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 109/ 219] blk.19.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 110/ 219] blk.19.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 111/ 219] blk.2.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 112/ 219] blk.2.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q5_K - using fallback quantization q5_1
converting to q5_1 .. size = 21.25 MiB -> 7.97 MiB
[ 113/ 219] blk.2.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 114/ 219] blk.2.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 115/ 219] blk.2.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 116/ 219] blk.2.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 117/ 219] blk.2.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 118/ 219] blk.2.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 119/ 219] blk.2.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 120/ 219] blk.20.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 121/ 219] blk.20.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q5_K - using fallback quantization q5_1
converting to q5_1 .. size = 21.25 MiB -> 7.97 MiB
[ 122/ 219] blk.20.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 123/ 219] blk.20.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 124/ 219] blk.20.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 125/ 219] blk.20.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 126/ 219] blk.20.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 127/ 219] blk.20.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 128/ 219] blk.20.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 129/ 219] blk.21.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 130/ 219] blk.21.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q5_K - using fallback quantization q5_1
converting to q5_1 .. size = 21.25 MiB -> 7.97 MiB
[ 131/ 219] blk.21.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 132/ 219] blk.21.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 133/ 219] blk.21.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 134/ 219] blk.21.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 135/ 219] blk.21.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 136/ 219] blk.21.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 137/ 219] blk.21.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 138/ 219] blk.22.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 139/ 219] blk.22.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q5_K - using fallback quantization q5_1
converting to q5_1 .. size = 21.25 MiB -> 7.97 MiB
[ 140/ 219] blk.22.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 141/ 219] blk.22.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 142/ 219] blk.22.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 143/ 219] blk.22.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 144/ 219] blk.22.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 145/ 219] blk.22.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 146/ 219] blk.22.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 147/ 219] blk.23.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 148/ 219] blk.23.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q5_K - using fallback quantization q5_1
converting to q5_1 .. size = 21.25 MiB -> 7.97 MiB
[ 149/ 219] blk.23.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 150/ 219] blk.23.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 151/ 219] blk.23.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 152/ 219] blk.23.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 153/ 219] blk.23.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 154/ 219] blk.23.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 155/ 219] blk.23.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 156/ 219] blk.3.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 157/ 219] blk.3.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q5_K - using fallback quantization q5_1
converting to q5_1 .. size = 21.25 MiB -> 7.97 MiB
[ 158/ 219] blk.3.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 159/ 219] blk.3.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 160/ 219] blk.3.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 161/ 219] blk.3.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 162/ 219] blk.3.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 163/ 219] blk.3.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 164/ 219] blk.3.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 165/ 219] blk.4.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 166/ 219] blk.4.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q5_K - using fallback quantization q5_1
converting to q5_1 .. size = 21.25 MiB -> 7.97 MiB
[ 167/ 219] blk.4.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 168/ 219] blk.4.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 169/ 219] blk.4.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 170/ 219] blk.4.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 171/ 219] blk.4.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 172/ 219] blk.4.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 173/ 219] blk.4.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 174/ 219] blk.5.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 175/ 219] blk.5.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q5_K - using fallback quantization q5_1
converting to q5_1 .. size = 21.25 MiB -> 7.97 MiB
[ 176/ 219] blk.5.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 177/ 219] blk.5.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 178/ 219] blk.5.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 179/ 219] blk.5.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 180/ 219] blk.5.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 181/ 219] blk.5.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 182/ 219] blk.5.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 183/ 219] blk.6.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 184/ 219] blk.6.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q5_K - using fallback quantization q5_1
converting to q5_1 .. size = 21.25 MiB -> 7.97 MiB
[ 185/ 219] blk.6.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 186/ 219] blk.6.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 187/ 219] blk.6.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 188/ 219] blk.6.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 189/ 219] blk.6.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 190/ 219] blk.6.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 191/ 219] blk.6.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 192/ 219] blk.7.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 193/ 219] blk.7.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q5_K - using fallback quantization q5_1
converting to q5_1 .. size = 21.25 MiB -> 7.97 MiB
[ 194/ 219] blk.7.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 195/ 219] blk.7.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 196/ 219] blk.7.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 197/ 219] blk.7.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 198/ 219] blk.7.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 199/ 219] blk.7.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 200/ 219] blk.7.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 201/ 219] blk.8.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 202/ 219] blk.8.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q5_K - using fallback quantization q5_1
converting to q5_1 .. size = 21.25 MiB -> 7.97 MiB
[ 203/ 219] blk.8.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 204/ 219] blk.8.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 205/ 219] blk.8.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 206/ 219] blk.8.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 207/ 219] blk.8.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 208/ 219] blk.8.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 209/ 219] blk.8.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 210/ 219] blk.9.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 211/ 219] blk.9.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q5_K - using fallback quantization q5_1
converting to q5_1 .. size = 21.25 MiB -> 7.97 MiB
[ 212/ 219] blk.9.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 213/ 219] blk.9.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q5_K .. size = 21.25 MiB -> 7.30 MiB
[ 214/ 219] blk.9.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 215/ 219] blk.9.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 216/ 219] blk.9.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 217/ 219] blk.9.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 218/ 219] blk.9.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q5_K .. size = 8.00 MiB -> 2.75 MiB
[ 219/ 219] output_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
llama_model_quantize_internal: model size = 4298.38 MB
llama_model_quantize_internal: quant size = 2150.01 MB
llama_model_quantize_internal: WARNING: 24 of 169 tensor(s) required fallback quantization

main: quantize time = 10361.94 ms
main: total time = 10361.94 ms
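The per-tensor sizes reported in these logs follow directly from each format's bits per weight (16 for bf16, 5.5 for q5_K, 6.5625 for q6_K, 6.0 for q5_1, 8.5 for q8_0). A short illustrative check of the figures above; the arithmetic is an added sanity check, not part of the logs:

    # Reproduce the per-tensor MiB figures in the logs from bits-per-weight.
    BPW = {"bf16": 16.0, "q5_K": 5.5, "q6_K": 6.5625, "q5_1": 6.0, "q8_0": 8.5}

    def size_mib(n_elements: int, fmt: str) -> float:
        # bits -> bytes -> MiB
        return n_elements * BPW[fmt] / 8 / 1024**2

    ffn = 5440 * 2048    # ffn_down / ffn_gate / ffn_up element count
    attn = 2048 * 2048   # attention projection element count

    print(f"{size_mib(ffn, 'bf16'):.2f} MiB")   # 21.25, bf16 source tensor
    print(f"{size_mib(ffn, 'q5_1'):.2f} MiB")   # 7.97, ffn_down fallback in Q5_K_S
    print(f"{size_mib(ffn, 'q8_0'):.2f} MiB")   # 11.29, ffn_down fallback in Q6_K
    print(f"{size_mib(attn, 'q6_K'):.2f} MiB")  # 3.28, attention tensors in Q6_K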
Q6_K_log.txt
ADDED
@@ -0,0 +1,339 @@
main: build = 3906 (7eee341b)
main: built with Apple clang version 15.0.0 (clang-1500.3.9.4) for arm64-apple-darwin23.6.0
main: quantizing 'salamandra-2b_bf16.gguf' to './salamandra-2b_Q6_K.gguf' as Q6_K
llama_model_loader: loaded meta data with 29 key-value pairs and 219 tensors from salamandra-2b_bf16.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv 0: general.architecture str = llama
llama_model_loader: - kv 1: general.type str = model
llama_model_loader: - kv 2: general.size_label str = 2.3B
llama_model_loader: - kv 3: general.license str = apache-2.0
llama_model_loader: - kv 4: general.tags arr[str,1] = ["text-generation"]
llama_model_loader: - kv 5: general.languages arr[str,36] = ["bg", "ca", "code", "cs", "cy", "da"...
llama_model_loader: - kv 6: llama.block_count u32 = 24
llama_model_loader: - kv 7: llama.context_length u32 = 8192
llama_model_loader: - kv 8: llama.embedding_length u32 = 2048
llama_model_loader: - kv 9: llama.feed_forward_length u32 = 5440
llama_model_loader: - kv 10: llama.attention.head_count u32 = 16
llama_model_loader: - kv 11: llama.attention.head_count_kv u32 = 16
llama_model_loader: - kv 12: llama.rope.freq_base f32 = 10000.000000
llama_model_loader: - kv 13: llama.attention.layer_norm_rms_epsilon f32 = 0.000010
llama_model_loader: - kv 14: general.file_type u32 = 32
llama_model_loader: - kv 15: llama.vocab_size u32 = 256000
llama_model_loader: - kv 16: llama.rope.dimension_count u32 = 128
llama_model_loader: - kv 17: tokenizer.ggml.add_space_prefix bool = true
llama_model_loader: - kv 18: tokenizer.ggml.model str = llama
llama_model_loader: - kv 19: tokenizer.ggml.pre str = default
llama_model_loader: - kv 20: tokenizer.ggml.tokens arr[str,256000] = ["<unk>", "<s>", "</s>", "<pad>", "<|...
llama_model_loader: - kv 21: tokenizer.ggml.scores arr[f32,256000] = [-1000.000000, -1000.000000, -1000.00...
llama_model_loader: - kv 22: tokenizer.ggml.token_type arr[i32,256000] = [3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...
llama_model_loader: - kv 23: tokenizer.ggml.bos_token_id u32 = 1
llama_model_loader: - kv 24: tokenizer.ggml.eos_token_id u32 = 2
llama_model_loader: - kv 25: tokenizer.ggml.unknown_token_id u32 = 0
llama_model_loader: - kv 26: tokenizer.ggml.add_bos_token bool = true
llama_model_loader: - kv 27: tokenizer.ggml.add_eos_token bool = false
llama_model_loader: - kv 28: general.quantization_version u32 = 2
llama_model_loader: - type f32: 49 tensors
llama_model_loader: - type bf16: 170 tensors
================================ Have weights data with 168 entries
[ 1/ 219] output.weight - [ 2048, 256000, 1, 1], type = bf16, size = 1000.000 MB
[ 2/ 219] token_embd.weight - [ 2048, 256000, 1, 1], type = bf16,
====== llama_model_quantize_internal: did not find weights for token_embd.weight
converting to q6_K .. load_imatrix: imatrix dataset='./imatrix/oscar/imatrix-dataset.txt'
load_imatrix: loaded 168 importance matrix entries from imatrix/oscar/imatrix.dat computed on 44176 chunks
prepare_imatrix: have 168 importance matrix entries
size = 1000.00 MiB -> 410.16 MiB
[ 3/ 219] blk.0.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 4/ 219] blk.0.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q6_K - using fallback quantization q8_0
converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
[ 5/ 219] blk.0.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q6_K .. size = 21.25 MiB -> 8.72 MiB
[ 6/ 219] blk.0.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q6_K .. size = 21.25 MiB -> 8.72 MiB
[ 7/ 219] blk.0.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 8/ 219] blk.0.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
[ 9/ 219] blk.0.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
[ 10/ 219] blk.0.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
[ 11/ 219] blk.0.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
[ 12/ 219] blk.1.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 13/ 219] blk.1.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q6_K - using fallback quantization q8_0
converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
[ 14/ 219] blk.1.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q6_K .. size = 21.25 MiB -> 8.72 MiB
[ 15/ 219] blk.1.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q6_K .. size = 21.25 MiB -> 8.72 MiB
[ 16/ 219] blk.1.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 17/ 219] blk.1.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
[ 18/ 219] blk.1.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
[ 19/ 219] blk.1.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
[ 20/ 219] blk.1.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
[ 21/ 219] blk.10.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 22/ 219] blk.10.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q6_K - using fallback quantization q8_0
converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
[ 23/ 219] blk.10.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q6_K .. size = 21.25 MiB -> 8.72 MiB
[ 24/ 219] blk.10.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q6_K .. size = 21.25 MiB -> 8.72 MiB
[ 25/ 219] blk.10.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 26/ 219] blk.10.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
[ 27/ 219] blk.10.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
[ 28/ 219] blk.10.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
[ 29/ 219] blk.10.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
[ 30/ 219] blk.11.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 31/ 219] blk.11.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q6_K - using fallback quantization q8_0
converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
[ 32/ 219] blk.11.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q6_K .. size = 21.25 MiB -> 8.72 MiB
[ 33/ 219] blk.11.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q6_K .. size = 21.25 MiB -> 8.72 MiB
[ 34/ 219] blk.11.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 35/ 219] blk.11.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
[ 36/ 219] blk.11.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
[ 37/ 219] blk.11.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
[ 38/ 219] blk.11.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
[ 39/ 219] blk.12.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 40/ 219] blk.12.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q6_K - using fallback quantization q8_0
converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
[ 41/ 219] blk.12.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q6_K .. size = 21.25 MiB -> 8.72 MiB
[ 42/ 219] blk.12.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q6_K .. size = 21.25 MiB -> 8.72 MiB
[ 43/ 219] blk.12.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 44/ 219] blk.12.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
[ 45/ 219] blk.12.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
[ 46/ 219] blk.12.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
[ 47/ 219] blk.12.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
[ 48/ 219] blk.13.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 49/ 219] blk.13.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q6_K - using fallback quantization q8_0
converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
[ 50/ 219] blk.13.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q6_K .. size = 21.25 MiB -> 8.72 MiB
[ 51/ 219] blk.13.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q6_K .. size = 21.25 MiB -> 8.72 MiB
[ 52/ 219] blk.13.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
[ 53/ 219] blk.13.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
[ 54/ 219] blk.13.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
[ 55/ 219] blk.13.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
|
116 |
+
[ 56/ 219] blk.13.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
|
117 |
+
[ 57/ 219] blk.14.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
118 |
+
[ 58/ 219] blk.14.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
|
119 |
+
|
120 |
+
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q6_K - using fallback quantization q8_0
|
121 |
+
converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
122 |
+
[ 59/ 219] blk.14.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q6_K .. size = 21.25 MiB -> 8.72 MiB
|
123 |
+
[ 60/ 219] blk.14.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q6_K .. size = 21.25 MiB -> 8.72 MiB
|
124 |
+
[ 61/ 219] blk.14.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
125 |
+
[ 62/ 219] blk.14.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
|
126 |
+
[ 63/ 219] blk.14.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
|
127 |
+
[ 64/ 219] blk.14.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
|
128 |
+
[ 65/ 219] blk.14.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
|
129 |
+
[ 66/ 219] blk.15.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
130 |
+
[ 67/ 219] blk.15.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
|
131 |
+
|
132 |
+
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q6_K - using fallback quantization q8_0
|
133 |
+
converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
134 |
+
[ 68/ 219] blk.15.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q6_K .. size = 21.25 MiB -> 8.72 MiB
|
135 |
+
[ 69/ 219] blk.15.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q6_K .. size = 21.25 MiB -> 8.72 MiB
|
136 |
+
[ 70/ 219] blk.15.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
137 |
+
[ 71/ 219] blk.15.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
|
138 |
+
[ 72/ 219] blk.15.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
|
139 |
+
[ 73/ 219] blk.15.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
|
140 |
+
[ 74/ 219] blk.15.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
|
141 |
+
[ 75/ 219] blk.16.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
142 |
+
[ 76/ 219] blk.16.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
|
143 |
+
|
144 |
+
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q6_K - using fallback quantization q8_0
|
145 |
+
converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
146 |
+
[ 77/ 219] blk.16.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q6_K .. size = 21.25 MiB -> 8.72 MiB
|
147 |
+
[ 78/ 219] blk.16.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q6_K .. size = 21.25 MiB -> 8.72 MiB
|
148 |
+
[ 79/ 219] blk.16.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
149 |
+
[ 80/ 219] blk.16.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
|
150 |
+
[ 81/ 219] blk.16.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
|
151 |
+
[ 82/ 219] blk.16.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
|
152 |
+
[ 83/ 219] blk.16.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
|
153 |
+
[ 84/ 219] blk.17.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
154 |
+
[ 85/ 219] blk.17.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
|
155 |
+
|
156 |
+
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q6_K - using fallback quantization q8_0
|
157 |
+
converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
158 |
+
[ 86/ 219] blk.17.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q6_K .. size = 21.25 MiB -> 8.72 MiB
|
159 |
+
[ 87/ 219] blk.17.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q6_K .. size = 21.25 MiB -> 8.72 MiB
|
160 |
+
[ 88/ 219] blk.17.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
161 |
+
[ 89/ 219] blk.17.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
|
162 |
+
[ 90/ 219] blk.17.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
|
163 |
+
[ 91/ 219] blk.17.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
|
164 |
+
[ 92/ 219] blk.17.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
|
165 |
+
[ 93/ 219] blk.18.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
166 |
+
[ 94/ 219] blk.18.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
|
167 |
+
|
168 |
+
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q6_K - using fallback quantization q8_0
|
169 |
+
converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
170 |
+
[ 95/ 219] blk.18.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q6_K .. size = 21.25 MiB -> 8.72 MiB
|
171 |
+
[ 96/ 219] blk.18.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q6_K .. size = 21.25 MiB -> 8.72 MiB
|
172 |
+
[ 97/ 219] blk.18.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
173 |
+
[ 98/ 219] blk.18.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
|
174 |
+
[ 99/ 219] blk.18.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
|
175 |
+
[ 100/ 219] blk.18.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
|
176 |
+
[ 101/ 219] blk.18.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
|
177 |
+
[ 102/ 219] blk.19.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
178 |
+
[ 103/ 219] blk.19.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
|
179 |
+
|
180 |
+
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q6_K - using fallback quantization q8_0
|
181 |
+
converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
182 |
+
[ 104/ 219] blk.19.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q6_K .. size = 21.25 MiB -> 8.72 MiB
|
183 |
+
[ 105/ 219] blk.19.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q6_K .. size = 21.25 MiB -> 8.72 MiB
|
184 |
+
[ 106/ 219] blk.19.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
185 |
+
[ 107/ 219] blk.19.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
|
186 |
+
[ 108/ 219] blk.19.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
|
187 |
+
[ 109/ 219] blk.19.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
|
188 |
+
[ 110/ 219] blk.19.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
|
189 |
+
[ 111/ 219] blk.2.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
190 |
+
[ 112/ 219] blk.2.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
|
191 |
+
|
192 |
+
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q6_K - using fallback quantization q8_0
|
193 |
+
converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
194 |
+
[ 113/ 219] blk.2.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q6_K .. size = 21.25 MiB -> 8.72 MiB
|
195 |
+
[ 114/ 219] blk.2.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q6_K .. size = 21.25 MiB -> 8.72 MiB
|
196 |
+
[ 115/ 219] blk.2.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
197 |
+
[ 116/ 219] blk.2.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
|
198 |
+
[ 117/ 219] blk.2.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
|
199 |
+
[ 118/ 219] blk.2.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
|
200 |
+
[ 119/ 219] blk.2.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
|
201 |
+
[ 120/ 219] blk.20.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
202 |
+
[ 121/ 219] blk.20.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
|
203 |
+
|
204 |
+
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q6_K - using fallback quantization q8_0
|
205 |
+
converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
206 |
+
[ 122/ 219] blk.20.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q6_K .. size = 21.25 MiB -> 8.72 MiB
|
207 |
+
[ 123/ 219] blk.20.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q6_K .. size = 21.25 MiB -> 8.72 MiB
|
208 |
+
[ 124/ 219] blk.20.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
209 |
+
[ 125/ 219] blk.20.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
|
210 |
+
[ 126/ 219] blk.20.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
|
211 |
+
[ 127/ 219] blk.20.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
|
212 |
+
[ 128/ 219] blk.20.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
|
213 |
+
[ 129/ 219] blk.21.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
214 |
+
[ 130/ 219] blk.21.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
|
215 |
+
|
216 |
+
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q6_K - using fallback quantization q8_0
|
217 |
+
converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
218 |
+
[ 131/ 219] blk.21.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q6_K .. size = 21.25 MiB -> 8.72 MiB
|
219 |
+
[ 132/ 219] blk.21.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q6_K .. size = 21.25 MiB -> 8.72 MiB
|
220 |
+
[ 133/ 219] blk.21.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
221 |
+
[ 134/ 219] blk.21.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
|
222 |
+
[ 135/ 219] blk.21.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
|
223 |
+
[ 136/ 219] blk.21.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
|
224 |
+
[ 137/ 219] blk.21.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
|
225 |
+
[ 138/ 219] blk.22.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
226 |
+
[ 139/ 219] blk.22.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
|
227 |
+
|
228 |
+
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q6_K - using fallback quantization q8_0
|
229 |
+
converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
230 |
+
[ 140/ 219] blk.22.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q6_K .. size = 21.25 MiB -> 8.72 MiB
|
231 |
+
[ 141/ 219] blk.22.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q6_K .. size = 21.25 MiB -> 8.72 MiB
|
232 |
+
[ 142/ 219] blk.22.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
233 |
+
[ 143/ 219] blk.22.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
|
234 |
+
[ 144/ 219] blk.22.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
|
235 |
+
[ 145/ 219] blk.22.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
|
236 |
+
[ 146/ 219] blk.22.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
|
237 |
+
[ 147/ 219] blk.23.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
238 |
+
[ 148/ 219] blk.23.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
|
239 |
+
|
240 |
+
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q6_K - using fallback quantization q8_0
|
241 |
+
converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
242 |
+
[ 149/ 219] blk.23.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q6_K .. size = 21.25 MiB -> 8.72 MiB
|
243 |
+
[ 150/ 219] blk.23.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q6_K .. size = 21.25 MiB -> 8.72 MiB
|
244 |
+
[ 151/ 219] blk.23.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
245 |
+
[ 152/ 219] blk.23.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
|
246 |
+
[ 153/ 219] blk.23.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
|
247 |
+
[ 154/ 219] blk.23.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
|
248 |
+
[ 155/ 219] blk.23.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
|
249 |
+
[ 156/ 219] blk.3.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
250 |
+
[ 157/ 219] blk.3.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
|
251 |
+
|
252 |
+
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q6_K - using fallback quantization q8_0
|
253 |
+
converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
254 |
+
[ 158/ 219] blk.3.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q6_K .. size = 21.25 MiB -> 8.72 MiB
|
255 |
+
[ 159/ 219] blk.3.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q6_K .. size = 21.25 MiB -> 8.72 MiB
|
256 |
+
[ 160/ 219] blk.3.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
257 |
+
[ 161/ 219] blk.3.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
|
258 |
+
[ 162/ 219] blk.3.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
|
259 |
+
[ 163/ 219] blk.3.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
|
260 |
+
[ 164/ 219] blk.3.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
|
261 |
+
[ 165/ 219] blk.4.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
262 |
+
[ 166/ 219] blk.4.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
|
263 |
+
|
264 |
+
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q6_K - using fallback quantization q8_0
|
265 |
+
converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
266 |
+
[ 167/ 219] blk.4.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q6_K .. size = 21.25 MiB -> 8.72 MiB
|
267 |
+
[ 168/ 219] blk.4.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q6_K .. size = 21.25 MiB -> 8.72 MiB
|
268 |
+
[ 169/ 219] blk.4.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
269 |
+
[ 170/ 219] blk.4.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
|
270 |
+
[ 171/ 219] blk.4.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
|
271 |
+
[ 172/ 219] blk.4.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
|
272 |
+
[ 173/ 219] blk.4.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
|
273 |
+
[ 174/ 219] blk.5.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
274 |
+
[ 175/ 219] blk.5.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
|
275 |
+
|
276 |
+
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q6_K - using fallback quantization q8_0
|
277 |
+
converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
278 |
+
[ 176/ 219] blk.5.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q6_K .. size = 21.25 MiB -> 8.72 MiB
|
279 |
+
[ 177/ 219] blk.5.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q6_K .. size = 21.25 MiB -> 8.72 MiB
|
280 |
+
[ 178/ 219] blk.5.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
281 |
+
[ 179/ 219] blk.5.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
|
282 |
+
[ 180/ 219] blk.5.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
|
283 |
+
[ 181/ 219] blk.5.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
|
284 |
+
[ 182/ 219] blk.5.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
|
285 |
+
[ 183/ 219] blk.6.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
286 |
+
[ 184/ 219] blk.6.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
|
287 |
+
|
288 |
+
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q6_K - using fallback quantization q8_0
|
289 |
+
converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
290 |
+
[ 185/ 219] blk.6.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q6_K .. size = 21.25 MiB -> 8.72 MiB
|
291 |
+
[ 186/ 219] blk.6.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q6_K .. size = 21.25 MiB -> 8.72 MiB
|
292 |
+
[ 187/ 219] blk.6.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
293 |
+
[ 188/ 219] blk.6.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
|
294 |
+
[ 189/ 219] blk.6.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
|
295 |
+
[ 190/ 219] blk.6.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
|
296 |
+
[ 191/ 219] blk.6.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
|
297 |
+
[ 192/ 219] blk.7.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
298 |
+
[ 193/ 219] blk.7.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
|
299 |
+
|
300 |
+
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q6_K - using fallback quantization q8_0
|
301 |
+
converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
302 |
+
[ 194/ 219] blk.7.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q6_K .. size = 21.25 MiB -> 8.72 MiB
|
303 |
+
[ 195/ 219] blk.7.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q6_K .. size = 21.25 MiB -> 8.72 MiB
|
304 |
+
[ 196/ 219] blk.7.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
305 |
+
[ 197/ 219] blk.7.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
|
306 |
+
[ 198/ 219] blk.7.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
|
307 |
+
[ 199/ 219] blk.7.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
|
308 |
+
[ 200/ 219] blk.7.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
|
309 |
+
[ 201/ 219] blk.8.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
310 |
+
[ 202/ 219] blk.8.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
|
311 |
+
|
312 |
+
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q6_K - using fallback quantization q8_0
|
313 |
+
converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
314 |
+
[ 203/ 219] blk.8.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q6_K .. size = 21.25 MiB -> 8.72 MiB
|
315 |
+
[ 204/ 219] blk.8.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q6_K .. size = 21.25 MiB -> 8.72 MiB
|
316 |
+
[ 205/ 219] blk.8.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
317 |
+
[ 206/ 219] blk.8.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
|
318 |
+
[ 207/ 219] blk.8.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
|
319 |
+
[ 208/ 219] blk.8.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
|
320 |
+
[ 209/ 219] blk.8.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
|
321 |
+
[ 210/ 219] blk.9.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
322 |
+
[ 211/ 219] blk.9.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,
|
323 |
+
|
324 |
+
llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for q6_K - using fallback quantization q8_0
|
325 |
+
converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
326 |
+
[ 212/ 219] blk.9.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q6_K .. size = 21.25 MiB -> 8.72 MiB
|
327 |
+
[ 213/ 219] blk.9.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q6_K .. size = 21.25 MiB -> 8.72 MiB
|
328 |
+
[ 214/ 219] blk.9.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
329 |
+
[ 215/ 219] blk.9.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
|
330 |
+
[ 216/ 219] blk.9.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
|
331 |
+
[ 217/ 219] blk.9.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
|
332 |
+
[ 218/ 219] blk.9.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q6_K .. size = 8.00 MiB -> 3.28 MiB
|
333 |
+
[ 219/ 219] output_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
334 |
+
llama_model_quantize_internal: model size = 4298.38 MB
|
335 |
+
llama_model_quantize_internal: quant size = 2414.84 MB
|
336 |
+
llama_model_quantize_internal: WARNING: 24 of 169 tensor(s) required fallback quantization
|
337 |
+
|
338 |
+
main: quantize time = 4934.86 ms
|
339 |
+
main: total time = 4934.86 ms
|
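Why the fallbacks above happen: k-quants such as q6_K pack weights into 256-element super-blocks, so a tensor's row length (its first GGUF dimension) must be a multiple of 256. This model's FFN down-projections are 5440 wide (5440 = 21 × 256 + 64), so all 24 blk.*.ffn_down.weight tensors drop back to q8_0 — exactly the 24 fallbacks counted in the WARNING line. A minimal sketch of the divisibility rule (the helper below is illustrative, not llama.cpp's actual API):

```python
QK_K = 256  # super-block size used by k-quants (q6_K, q5_K, ...) in llama.cpp

def k_quant_ok(ne0: int, block: int = QK_K) -> bool:
    # ne0 is the tensor's first dimension (elements per row in GGUF);
    # k-quants need a whole number of super-blocks per row.
    return ne0 % block == 0

print(k_quant_ok(5440))  # False: 5440 = 21 * 256 + 64 -> ffn_down falls back to q8_0
print(k_quant_ok(2048))  # True:  attention weights quantize to q6_K as requested
```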
Q8_0_log.txt
ADDED
@@ -0,0 +1,266 @@
1 |
+
main: build = 3906 (7eee341b)
|
2 |
+
main: built with Apple clang version 15.0.0 (clang-1500.3.9.4) for arm64-apple-darwin23.6.0
|
3 |
+
main: quantizing 'salamandra-2b_bf16.gguf' to './salamandra-2b_Q8_0.gguf' as Q8_0
|
4 |
+
llama_model_loader: loaded meta data with 29 key-value pairs and 219 tensors from salamandra-2b_bf16.gguf (version GGUF V3 (latest))
|
5 |
+
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
|
6 |
+
llama_model_loader: - kv 0: general.architecture str = llama
|
7 |
+
llama_model_loader: - kv 1: general.type str = model
|
8 |
+
llama_model_loader: - kv 2: general.size_label str = 2.3B
|
9 |
+
llama_model_loader: - kv 3: general.license str = apache-2.0
|
10 |
+
llama_model_loader: - kv 4: general.tags arr[str,1] = ["text-generation"]
|
11 |
+
llama_model_loader: - kv 5: general.languages arr[str,36] = ["bg", "ca", "code", "cs", "cy", "da"...
|
12 |
+
llama_model_loader: - kv 6: llama.block_count u32 = 24
|
13 |
+
llama_model_loader: - kv 7: llama.context_length u32 = 8192
|
14 |
+
llama_model_loader: - kv 8: llama.embedding_length u32 = 2048
|
15 |
+
llama_model_loader: - kv 9: llama.feed_forward_length u32 = 5440
|
16 |
+
llama_model_loader: - kv 10: llama.attention.head_count u32 = 16
|
17 |
+
llama_model_loader: - kv 11: llama.attention.head_count_kv u32 = 16
|
18 |
+
llama_model_loader: - kv 12: llama.rope.freq_base f32 = 10000.000000
|
19 |
+
llama_model_loader: - kv 13: llama.attention.layer_norm_rms_epsilon f32 = 0.000010
|
20 |
+
llama_model_loader: - kv 14: general.file_type u32 = 32
|
21 |
+
llama_model_loader: - kv 15: llama.vocab_size u32 = 256000
|
22 |
+
llama_model_loader: - kv 16: llama.rope.dimension_count u32 = 128
|
23 |
+
llama_model_loader: - kv 17: tokenizer.ggml.add_space_prefix bool = true
|
24 |
+
llama_model_loader: - kv 18: tokenizer.ggml.model str = llama
|
25 |
+
llama_model_loader: - kv 19: tokenizer.ggml.pre str = default
|
26 |
+
llama_model_loader: - kv 20: tokenizer.ggml.tokens arr[str,256000] = ["<unk>", "<s>", "</s>", "<pad>", "<|...
|
27 |
+
llama_model_loader: - kv 21: tokenizer.ggml.scores arr[f32,256000] = [-1000.000000, -1000.000000, -1000.00...
|
28 |
+
llama_model_loader: - kv 22: tokenizer.ggml.token_type arr[i32,256000] = [3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...
|
29 |
+
llama_model_loader: - kv 23: tokenizer.ggml.bos_token_id u32 = 1
|
30 |
+
llama_model_loader: - kv 24: tokenizer.ggml.eos_token_id u32 = 2
|
31 |
+
llama_model_loader: - kv 25: tokenizer.ggml.unknown_token_id u32 = 0
|
32 |
+
llama_model_loader: - kv 26: tokenizer.ggml.add_bos_token bool = true
|
33 |
+
llama_model_loader: - kv 27: tokenizer.ggml.add_eos_token bool = false
|
34 |
+
llama_model_loader: - kv 28: general.quantization_version u32 = 2
|
35 |
+
llama_model_loader: - type f32: 49 tensors
|
36 |
+
llama_model_loader: - type bf16: 170 tensors
|
37 |
+
================================ Have weights data with 168 entries
|
38 |
+
[ 1/ 219] output.weight - [ 2048, 256000, 1, 1], type = bf16, size = 1000.000 MB
|
39 |
+
[ 2/ 219] token_embd.weight - [ 2048, 256000, 1, 1], type = bf16,
|
40 |
+
====== llama_model_quantize_internal: did not find weights for token_embd.weight
|
41 |
+
converting to q8_0 .. load_imatrix: imatrix dataset='./imatrix/oscar/imatrix-dataset.txt'
|
42 |
+
load_imatrix: loaded 168 importance matrix entries from imatrix/oscar/imatrix.dat computed on 44176 chunks
|
43 |
+
prepare_imatrix: have 168 importance matrix entries
|
44 |
+
size = 1000.00 MiB -> 531.25 MiB
|
45 |
+
[ 3/ 219] blk.0.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
46 |
+
[ 4/ 219] blk.0.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
47 |
+
[ 5/ 219] blk.0.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
48 |
+
[ 6/ 219] blk.0.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
49 |
+
[ 7/ 219] blk.0.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
50 |
+
[ 8/ 219] blk.0.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
|
51 |
+
[ 9/ 219] blk.0.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
|
52 |
+
[ 10/ 219] blk.0.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
|
53 |
+
[ 11/ 219] blk.0.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
|
54 |
+
[ 12/ 219] blk.1.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
55 |
+
[ 13/ 219] blk.1.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
56 |
+
[ 14/ 219] blk.1.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
57 |
+
[ 15/ 219] blk.1.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
58 |
+
[ 16/ 219] blk.1.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
59 |
+
[ 17/ 219] blk.1.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
|
60 |
+
[ 18/ 219] blk.1.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
|
61 |
+
[ 19/ 219] blk.1.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
|
62 |
+
[ 20/ 219] blk.1.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
|
63 |
+
[ 21/ 219] blk.10.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
64 |
+
[ 22/ 219] blk.10.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
65 |
+
[ 23/ 219] blk.10.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
66 |
+
[ 24/ 219] blk.10.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
67 |
+
[ 25/ 219] blk.10.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
68 |
+
[ 26/ 219] blk.10.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
|
69 |
+
[ 27/ 219] blk.10.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
|
70 |
+
[ 28/ 219] blk.10.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
|
71 |
+
[ 29/ 219] blk.10.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
|
72 |
+
[ 30/ 219] blk.11.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
73 |
+
[ 31/ 219] blk.11.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
74 |
+
[ 32/ 219] blk.11.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
75 |
+
[ 33/ 219] blk.11.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
76 |
+
[ 34/ 219] blk.11.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
77 |
+
[ 35/ 219] blk.11.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
|
78 |
+
[ 36/ 219] blk.11.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
|
79 |
+
[ 37/ 219] blk.11.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
|
80 |
+
[ 38/ 219] blk.11.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
|
81 |
+
[ 39/ 219] blk.12.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
82 |
+
[ 40/ 219] blk.12.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
83 |
+
[ 41/ 219] blk.12.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
84 |
+
[ 42/ 219] blk.12.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
85 |
+
[ 43/ 219] blk.12.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
86 |
+
[ 44/ 219] blk.12.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
|
87 |
+
[ 45/ 219] blk.12.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
|
88 |
+
[ 46/ 219] blk.12.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
|
89 |
+
[ 47/ 219] blk.12.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
|
90 |
+
[ 48/ 219] blk.13.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
91 |
+
[ 49/ 219] blk.13.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
92 |
+
[ 50/ 219] blk.13.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
93 |
+
[ 51/ 219] blk.13.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
94 |
+
[ 52/ 219] blk.13.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
95 |
+
[ 53/ 219] blk.13.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
|
96 |
+
[ 54/ 219] blk.13.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
|
97 |
+
[ 55/ 219] blk.13.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
|
98 |
+
[ 56/ 219] blk.13.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
|
99 |
+
[ 57/ 219] blk.14.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
100 |
+
[ 58/ 219] blk.14.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
101 |
+
[ 59/ 219] blk.14.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
102 |
+
[ 60/ 219] blk.14.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
103 |
+
[ 61/ 219] blk.14.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
104 |
+
[ 62/ 219] blk.14.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
|
105 |
+
[ 63/ 219] blk.14.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
|
106 |
+
[ 64/ 219] blk.14.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
|
107 |
+
[ 65/ 219] blk.14.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
|
108 |
+
[ 66/ 219] blk.15.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
109 |
+
[ 67/ 219] blk.15.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
110 |
+
[ 68/ 219] blk.15.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
111 |
+
[ 69/ 219] blk.15.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
112 |
+
[ 70/ 219] blk.15.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
113 |
+
[ 71/ 219] blk.15.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
|
114 |
+
[ 72/ 219] blk.15.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
|
115 |
+
[ 73/ 219] blk.15.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
|
116 |
+
[ 74/ 219] blk.15.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
|
117 |
+
[ 75/ 219] blk.16.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
118 |
+
[ 76/ 219] blk.16.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
119 |
+
[ 77/ 219] blk.16.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
120 |
+
[ 78/ 219] blk.16.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
121 |
+
[ 79/ 219] blk.16.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
122 |
+
[ 80/ 219] blk.16.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
|
123 |
+
[ 81/ 219] blk.16.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
|
124 |
+
[ 82/ 219] blk.16.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
|
125 |
+
[ 83/ 219] blk.16.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
|
126 |
+
[ 84/ 219] blk.17.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
127 |
+
[ 85/ 219] blk.17.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
128 |
+
[ 86/ 219] blk.17.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
129 |
+
[ 87/ 219] blk.17.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
130 |
+
[ 88/ 219] blk.17.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
131 |
+
[ 89/ 219] blk.17.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
|
132 |
+
[ 90/ 219] blk.17.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
|
133 |
+
[ 91/ 219] blk.17.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
|
134 |
+
[ 92/ 219] blk.17.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
|
135 |
+
[ 93/ 219] blk.18.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
136 |
+
[ 94/ 219] blk.18.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
137 |
+
[ 95/ 219] blk.18.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
138 |
+
[ 96/ 219] blk.18.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
139 |
+
[ 97/ 219] blk.18.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
140 |
+
[ 98/ 219] blk.18.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
|
141 |
+
[ 99/ 219] blk.18.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
|
142 |
+
[ 100/ 219] blk.18.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
|
143 |
+
[ 101/ 219] blk.18.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
|
144 |
+
[ 102/ 219] blk.19.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
145 |
+
[ 103/ 219] blk.19.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
146 |
+
[ 104/ 219] blk.19.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
147 |
+
[ 105/ 219] blk.19.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
148 |
+
[ 106/ 219] blk.19.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
149 |
+
[ 107/ 219] blk.19.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
|
150 |
+
[ 108/ 219] blk.19.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
|
151 |
+
[ 109/ 219] blk.19.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
|
152 |
+
[ 110/ 219] blk.19.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
|
153 |
+
[ 111/ 219] blk.2.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
154 |
+
[ 112/ 219] blk.2.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
155 |
+
[ 113/ 219] blk.2.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
156 |
+
[ 114/ 219] blk.2.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
157 |
+
[ 115/ 219] blk.2.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
158 |
+
[ 116/ 219] blk.2.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
|
159 |
+
[ 117/ 219] blk.2.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
|
160 |
+
[ 118/ 219] blk.2.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
|
161 |
+
[ 119/ 219] blk.2.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
|
162 |
+
[ 120/ 219] blk.20.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
163 |
+
[ 121/ 219] blk.20.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
164 |
+
[ 122/ 219] blk.20.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
165 |
+
[ 123/ 219] blk.20.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
166 |
+
[ 124/ 219] blk.20.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
167 |
+
[ 125/ 219] blk.20.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
|
168 |
+
[ 126/ 219] blk.20.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
|
169 |
+
[ 127/ 219] blk.20.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
|
170 |
+
[ 128/ 219] blk.20.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
|
171 |
+
[ 129/ 219] blk.21.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
172 |
+
[ 130/ 219] blk.21.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
173 |
+
[ 131/ 219] blk.21.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
174 |
+
[ 132/ 219] blk.21.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
175 |
+
[ 133/ 219] blk.21.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
176 |
+
[ 134/ 219] blk.21.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
|
177 |
+
[ 135/ 219] blk.21.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
|
178 |
+
[ 136/ 219] blk.21.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
|
179 |
+
[ 137/ 219] blk.21.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
|
180 |
+
[ 138/ 219] blk.22.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
181 |
+
[ 139/ 219] blk.22.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
182 |
+
[ 140/ 219] blk.22.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
183 |
+
[ 141/ 219] blk.22.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
184 |
+
[ 142/ 219] blk.22.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
185 |
+
[ 143/ 219] blk.22.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
|
186 |
+
[ 144/ 219] blk.22.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
|
187 |
+
[ 145/ 219] blk.22.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
|
188 |
+
[ 146/ 219] blk.22.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
|
189 |
+
[ 147/ 219] blk.23.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
190 |
+
[ 148/ 219] blk.23.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
191 |
+
[ 149/ 219] blk.23.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
192 |
+
[ 150/ 219] blk.23.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
193 |
+
[ 151/ 219] blk.23.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
194 |
+
[ 152/ 219] blk.23.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
|
195 |
+
[ 153/ 219] blk.23.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
|
196 |
+
[ 154/ 219] blk.23.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
|
197 |
+
[ 155/ 219] blk.23.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
|
198 |
+
[ 156/ 219] blk.3.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
199 |
+
[ 157/ 219] blk.3.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
200 |
+
[ 158/ 219] blk.3.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
201 |
+
[ 159/ 219] blk.3.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
202 |
+
[ 160/ 219] blk.3.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
203 |
+
[ 161/ 219] blk.3.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
|
204 |
+
[ 162/ 219] blk.3.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
|
205 |
+
[ 163/ 219] blk.3.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
|
206 |
+
[ 164/ 219] blk.3.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
|
207 |
+
[ 165/ 219] blk.4.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
208 |
+
[ 166/ 219] blk.4.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
209 |
+
[ 167/ 219] blk.4.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
210 |
+
[ 168/ 219] blk.4.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
211 |
+
[ 169/ 219] blk.4.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
212 |
+
[ 170/ 219] blk.4.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
|
213 |
+
[ 171/ 219] blk.4.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
|
214 |
+
[ 172/ 219] blk.4.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
|
215 |
+
[ 173/ 219] blk.4.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
|
216 |
+
[ 174/ 219] blk.5.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
217 |
+
[ 175/ 219] blk.5.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
218 |
+
[ 176/ 219] blk.5.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
219 |
+
[ 177/ 219] blk.5.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
|
220 |
+
[ 178/ 219] blk.5.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
|
221 |
+
[ 179/ 219] blk.5.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
|
222 |
+
[ 180/ 219] blk.5.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
|
223 |
+
[ 181/ 219] blk.5.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
|
224 |
+
[ 182/ 219] blk.5.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
|
225 |
+
+[ 183/ 219] blk.6.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
+[ 184/ 219] blk.6.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
+[ 185/ 219] blk.6.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
+[ 186/ 219] blk.6.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
+[ 187/ 219] blk.6.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
+[ 188/ 219] blk.6.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
+[ 189/ 219] blk.6.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
+[ 190/ 219] blk.6.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
+[ 191/ 219] blk.6.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
+[ 192/ 219] blk.7.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
+[ 193/ 219] blk.7.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
+[ 194/ 219] blk.7.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
+[ 195/ 219] blk.7.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
+[ 196/ 219] blk.7.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
+[ 197/ 219] blk.7.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
+[ 198/ 219] blk.7.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
+[ 199/ 219] blk.7.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
+[ 200/ 219] blk.7.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
+[ 201/ 219] blk.8.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
+[ 202/ 219] blk.8.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
+[ 203/ 219] blk.8.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
+[ 204/ 219] blk.8.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
+[ 205/ 219] blk.8.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
+[ 206/ 219] blk.8.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
+[ 207/ 219] blk.8.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
+[ 208/ 219] blk.8.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
+[ 209/ 219] blk.8.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
+[ 210/ 219] blk.9.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
+[ 211/ 219] blk.9.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
+[ 212/ 219] blk.9.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
+[ 213/ 219] blk.9.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to q8_0 .. size = 21.25 MiB -> 11.29 MiB
+[ 214/ 219] blk.9.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
+[ 215/ 219] blk.9.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
+[ 216/ 219] blk.9.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
+[ 217/ 219] blk.9.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
+[ 218/ 219] blk.9.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to q8_0 .. size = 8.00 MiB -> 4.25 MiB
+[ 219/ 219] output_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
+llama_model_quantize_internal: model size = 4298.38 MB
+llama_model_quantize_internal: quant size = 2752.45 MB
+
+main: quantize time = 3216.17 ms
+main: total time = 3216.17 ms
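
The log above is the tail of the Q8_0 conversion; logs of this shape come from llama.cpp's llama-quantize tool. A representative invocation (paths assumed from this repository's layout; the exact command line is not recorded in the log) would be:

    ./llama-quantize ./salamandra-2b_bf16.gguf ./salamandra-2b_Q8_0.gguf Q8_0

For the low-bit K- and I-quants in this release, the same tool would additionally be given --imatrix imatrix/oscar/imatrix.dat so that the importance matrix can guide the rounding.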
README.md
CHANGED
@@ -27,7 +27,7 @@ language:
 - mt
 - nl
 - nn
-- no
+- \no
 - oc
 - pl
 - pt
@@ -41,6 +41,59 @@ language:
 - uk
 ---
 
+
+# **Quantization Summary**
+
+- **IQ4_NL**: Best I quantization below **Q4** with minimal PPL impact.
+- **Q5_K_M**: Excellent balance above **Q4**, recommended for most applications.
+- **Q6_K**: Provides near-**bf16** performance with size savings.
+
+---
+
+# Quantization
+
+### **Perplexity Comparison Table:**
+
+| **Quantization Type** | **PPL** | **ln(PPL(Q)/PPL(bf16))** | **File Size** | **Notes**                                                      |
+|-----------------------|---------|--------------------------|---------------|----------------------------------------------------------------|
+| **IQ3_M**             | 15.1995 | 0.079131                 | 1.7G          | Good size efficiency with acceptable PPL increase              |
+| **Q3_K_L**            | 15.0444 | 0.068875                 | 1.8G          | Further size reduction with modest PPL increase                |
+| **IQ4_NL**            | 14.5534 | 0.035693                 | 1.9G          | Good size reduction with minimal PPL impact (**recommended**)  |
+| **Q4_K_M**            | 14.3990 | 0.025028                 | 2.0G          | Smaller with acceptable PPL                                    |
+| **Q5_K_M**            | 14.1299 | 0.006162                 | 2.2G          | Excellent balance of PPL and size (**recommended**)            |
+| **Q6_K**              | 14.0675 | 0.001736                 | 2.4G          | Nearly lossless performance with reduced size                  |
+| **bf16**              | 14.0431 | 0.0                      | 4.2G          | Baseline                                                       |
+
+---
+
+### **Notes:**
+
+- **Recommended Quantizations:**
+  - **IQ4_NL**: Represents the best of the I quantization types below **Q4**, achieving good size efficiency while maintaining low perplexity.
+  - **Q5_K_M**: Offers the best balance between low perplexity and reduced file size above **Q4**, making it ideal for most applications.
+  - **Q6_K**: Delivers nearly lossless performance compared to **bf16** at a reduced file size (2.4G vs. 4.2G). Ideal for scenarios requiring maximum accuracy with some size savings.
+- **Non-recommended Quantizations:**
+  - **IQ3_M**: Offers a smaller file size (1.7G) with an acceptable PPL increase (15.1995), making it a solid choice for highly compressed models.
+  - **Q3_K_L**: Provides a slightly larger file size (1.8G) with an even better PPL (15.0444), and fits the selection criteria for highly compressed models (log PPL diff < 0.3).
+  - **Q4_K_M**: While not designated as "recommended" here, **Q4_K_M** is highly suitable for backends such as **Metal**, which run **I-quant** models slowly; for those backends it remains an excellent choice.
+  - **Q6_K**: Similar to **Q8_0**, it offers perplexity very close to **bf16**. Given its smaller file size than Q8_0 (2.4G vs. 2.7G), Q6_K provides the better size-to-performance trade-off; it was selected because it is nearly lossless and under 2.5GB.
+
+---
+
+### **Defending the Selection:**
+
+The selection of recommended models is designed to provide a spectrum of options that meet the following criteria:
+
+- **Diversity in Quantization Types:**
+  - **I Quantization Below Q4:** **IQ4_NL** is included to offer an option that uses I quantization below the **Q4** level, balancing size and performance.
+  - **K Quantization At and Above Q4:** **Q4_K_M**, **Q5_K_M**, and **Q6_K** provide K quantization options at and above the **Q4** level, giving users choices based on their specific needs.
+  - **Highly Compressed Quantization (Q3 and below):** **IQ3_M** and **Q3_K_L** are included because they meet the selection criterion of log PPL diff < 0.3 and are not redundant with other models.
+- **Selection Criteria:**
+  - **Log PPL diff < 0.3:** All included models have a log PPL difference under 0.3, ensuring they maintain acceptable performance even when highly quantized.
+  - **No Multiple Models Within 100MB of the Same File Size:** Only one model is included per similar file-size range to avoid redundancy. For example, **Q3_K_L** (1.8G) is included while models like **IQ3_XS** (1.7G) are excluded due to overlapping file sizes and comparable PPL, ensuring a sparse yet comprehensive selection.
+
+
 ![](./images/salamandra_header.png)
 
 # Salamandra Model Card
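
The ln(PPL(Q)/PPL(bf16)) column and the two selection rules in the README diff above are straightforward to reproduce. A minimal Python sketch (PPL and file-size values copied from the table; thresholds as stated in the notes, with "within 100MB" read as a strict 0.1G gap) could look like:

    import math

    PPL_BF16 = 14.0431
    # (perplexity, file size in GiB) per quantization type, from the table above
    quants = {
        "IQ3_M":  (15.1995, 1.7),
        "Q3_K_L": (15.0444, 1.8),
        "IQ4_NL": (14.5534, 1.9),
        "Q4_K_M": (14.3990, 2.0),
        "Q5_K_M": (14.1299, 2.2),
        "Q6_K":   (14.0675, 2.4),
    }

    kept = []
    for name, (ppl, size) in sorted(quants.items(), key=lambda kv: kv[1][1]):
        log_diff = math.log(ppl / PPL_BF16)
        print(f"{name}: ln(PPL ratio) = {log_diff:.6f}")    # e.g. IQ4_NL -> ~0.0357
        if log_diff >= 0.3:
            continue                                        # rule 1: log PPL diff < 0.3
        if any(abs(size - s) < 0.1 for _, s in kept):
            continue                                        # rule 2: no two files within ~100MB
        kept.append((name, size))
    print("kept:", [n for n, _ in kept])

This is only a sketch of the stated criteria, not the exact procedure used; small differences in the last decimal places come from the table's PPL values being rounded.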
bf16_log.txt
ADDED
@@ -0,0 +1,245 @@
+/Users/Shared/Public/Github/llama.cpp/convert_hf_to_gguf.py --outtype bf16 . --outfile ./salamandra-2b_bf16.gguf
+INFO:hf-to-gguf:Loading model:
+INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
+INFO:hf-to-gguf:Exporting model...
+INFO:hf-to-gguf:gguf: loading model part 'model.safetensors'
+INFO:hf-to-gguf:output.weight, torch.bfloat16 --> BF16, shape = {2048, 256000}
+INFO:hf-to-gguf:token_embd.weight, torch.bfloat16 --> BF16, shape = {2048, 256000}
+INFO:hf-to-gguf:blk.0.attn_norm.weight, torch.bfloat16 --> F32, shape = {2048}
+INFO:hf-to-gguf:blk.0.ffn_down.weight, torch.bfloat16 --> BF16, shape = {5440, 2048}
+INFO:hf-to-gguf:blk.0.ffn_gate.weight, torch.bfloat16 --> BF16, shape = {2048, 5440}
+INFO:hf-to-gguf:blk.0.ffn_up.weight, torch.bfloat16 --> BF16, shape = {2048, 5440}
+INFO:hf-to-gguf:blk.0.ffn_norm.weight, torch.bfloat16 --> F32, shape = {2048}
+INFO:hf-to-gguf:blk.0.attn_k.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.0.attn_output.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.0.attn_q.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.0.attn_v.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.1.attn_norm.weight, torch.bfloat16 --> F32, shape = {2048}
+INFO:hf-to-gguf:blk.1.ffn_down.weight, torch.bfloat16 --> BF16, shape = {5440, 2048}
+INFO:hf-to-gguf:blk.1.ffn_gate.weight, torch.bfloat16 --> BF16, shape = {2048, 5440}
+INFO:hf-to-gguf:blk.1.ffn_up.weight, torch.bfloat16 --> BF16, shape = {2048, 5440}
+INFO:hf-to-gguf:blk.1.ffn_norm.weight, torch.bfloat16 --> F32, shape = {2048}
+INFO:hf-to-gguf:blk.1.attn_k.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.1.attn_output.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.1.attn_q.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.1.attn_v.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.10.attn_norm.weight, torch.bfloat16 --> F32, shape = {2048}
+INFO:hf-to-gguf:blk.10.ffn_down.weight, torch.bfloat16 --> BF16, shape = {5440, 2048}
+INFO:hf-to-gguf:blk.10.ffn_gate.weight, torch.bfloat16 --> BF16, shape = {2048, 5440}
+INFO:hf-to-gguf:blk.10.ffn_up.weight, torch.bfloat16 --> BF16, shape = {2048, 5440}
+INFO:hf-to-gguf:blk.10.ffn_norm.weight, torch.bfloat16 --> F32, shape = {2048}
+INFO:hf-to-gguf:blk.10.attn_k.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.10.attn_output.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.10.attn_q.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.10.attn_v.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.11.attn_norm.weight, torch.bfloat16 --> F32, shape = {2048}
+INFO:hf-to-gguf:blk.11.ffn_down.weight, torch.bfloat16 --> BF16, shape = {5440, 2048}
+INFO:hf-to-gguf:blk.11.ffn_gate.weight, torch.bfloat16 --> BF16, shape = {2048, 5440}
+INFO:hf-to-gguf:blk.11.ffn_up.weight, torch.bfloat16 --> BF16, shape = {2048, 5440}
+INFO:hf-to-gguf:blk.11.ffn_norm.weight, torch.bfloat16 --> F32, shape = {2048}
+INFO:hf-to-gguf:blk.11.attn_k.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.11.attn_output.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.11.attn_q.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.11.attn_v.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.12.attn_norm.weight, torch.bfloat16 --> F32, shape = {2048}
+INFO:hf-to-gguf:blk.12.ffn_down.weight, torch.bfloat16 --> BF16, shape = {5440, 2048}
+INFO:hf-to-gguf:blk.12.ffn_gate.weight, torch.bfloat16 --> BF16, shape = {2048, 5440}
+INFO:hf-to-gguf:blk.12.ffn_up.weight, torch.bfloat16 --> BF16, shape = {2048, 5440}
+INFO:hf-to-gguf:blk.12.ffn_norm.weight, torch.bfloat16 --> F32, shape = {2048}
+INFO:hf-to-gguf:blk.12.attn_k.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.12.attn_output.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.12.attn_q.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.12.attn_v.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.13.attn_norm.weight, torch.bfloat16 --> F32, shape = {2048}
+INFO:hf-to-gguf:blk.13.ffn_down.weight, torch.bfloat16 --> BF16, shape = {5440, 2048}
+INFO:hf-to-gguf:blk.13.ffn_gate.weight, torch.bfloat16 --> BF16, shape = {2048, 5440}
+INFO:hf-to-gguf:blk.13.ffn_up.weight, torch.bfloat16 --> BF16, shape = {2048, 5440}
+INFO:hf-to-gguf:blk.13.ffn_norm.weight, torch.bfloat16 --> F32, shape = {2048}
+INFO:hf-to-gguf:blk.13.attn_k.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.13.attn_output.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.13.attn_q.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.13.attn_v.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.14.attn_norm.weight, torch.bfloat16 --> F32, shape = {2048}
+INFO:hf-to-gguf:blk.14.ffn_down.weight, torch.bfloat16 --> BF16, shape = {5440, 2048}
+INFO:hf-to-gguf:blk.14.ffn_gate.weight, torch.bfloat16 --> BF16, shape = {2048, 5440}
+INFO:hf-to-gguf:blk.14.ffn_up.weight, torch.bfloat16 --> BF16, shape = {2048, 5440}
+INFO:hf-to-gguf:blk.14.ffn_norm.weight, torch.bfloat16 --> F32, shape = {2048}
+INFO:hf-to-gguf:blk.14.attn_k.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.14.attn_output.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.14.attn_q.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.14.attn_v.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.15.attn_norm.weight, torch.bfloat16 --> F32, shape = {2048}
+INFO:hf-to-gguf:blk.15.ffn_down.weight, torch.bfloat16 --> BF16, shape = {5440, 2048}
+INFO:hf-to-gguf:blk.15.ffn_gate.weight, torch.bfloat16 --> BF16, shape = {2048, 5440}
+INFO:hf-to-gguf:blk.15.ffn_up.weight, torch.bfloat16 --> BF16, shape = {2048, 5440}
+INFO:hf-to-gguf:blk.15.ffn_norm.weight, torch.bfloat16 --> F32, shape = {2048}
+INFO:hf-to-gguf:blk.15.attn_k.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.15.attn_output.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.15.attn_q.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.15.attn_v.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.16.attn_norm.weight, torch.bfloat16 --> F32, shape = {2048}
+INFO:hf-to-gguf:blk.16.ffn_down.weight, torch.bfloat16 --> BF16, shape = {5440, 2048}
+INFO:hf-to-gguf:blk.16.ffn_gate.weight, torch.bfloat16 --> BF16, shape = {2048, 5440}
+INFO:hf-to-gguf:blk.16.ffn_up.weight, torch.bfloat16 --> BF16, shape = {2048, 5440}
+INFO:hf-to-gguf:blk.16.ffn_norm.weight, torch.bfloat16 --> F32, shape = {2048}
+INFO:hf-to-gguf:blk.16.attn_k.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.16.attn_output.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.16.attn_q.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.16.attn_v.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.17.attn_norm.weight, torch.bfloat16 --> F32, shape = {2048}
+INFO:hf-to-gguf:blk.17.ffn_down.weight, torch.bfloat16 --> BF16, shape = {5440, 2048}
+INFO:hf-to-gguf:blk.17.ffn_gate.weight, torch.bfloat16 --> BF16, shape = {2048, 5440}
+INFO:hf-to-gguf:blk.17.ffn_up.weight, torch.bfloat16 --> BF16, shape = {2048, 5440}
+INFO:hf-to-gguf:blk.17.ffn_norm.weight, torch.bfloat16 --> F32, shape = {2048}
+INFO:hf-to-gguf:blk.17.attn_k.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.17.attn_output.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.17.attn_q.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.17.attn_v.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.18.attn_norm.weight, torch.bfloat16 --> F32, shape = {2048}
+INFO:hf-to-gguf:blk.18.ffn_down.weight, torch.bfloat16 --> BF16, shape = {5440, 2048}
+INFO:hf-to-gguf:blk.18.ffn_gate.weight, torch.bfloat16 --> BF16, shape = {2048, 5440}
+INFO:hf-to-gguf:blk.18.ffn_up.weight, torch.bfloat16 --> BF16, shape = {2048, 5440}
+INFO:hf-to-gguf:blk.18.ffn_norm.weight, torch.bfloat16 --> F32, shape = {2048}
+INFO:hf-to-gguf:blk.18.attn_k.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.18.attn_output.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.18.attn_q.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.18.attn_v.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.19.attn_norm.weight, torch.bfloat16 --> F32, shape = {2048}
+INFO:hf-to-gguf:blk.19.ffn_down.weight, torch.bfloat16 --> BF16, shape = {5440, 2048}
+INFO:hf-to-gguf:blk.19.ffn_gate.weight, torch.bfloat16 --> BF16, shape = {2048, 5440}
+INFO:hf-to-gguf:blk.19.ffn_up.weight, torch.bfloat16 --> BF16, shape = {2048, 5440}
+INFO:hf-to-gguf:blk.19.ffn_norm.weight, torch.bfloat16 --> F32, shape = {2048}
+INFO:hf-to-gguf:blk.19.attn_k.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.19.attn_output.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.19.attn_q.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.19.attn_v.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.2.attn_norm.weight, torch.bfloat16 --> F32, shape = {2048}
+INFO:hf-to-gguf:blk.2.ffn_down.weight, torch.bfloat16 --> BF16, shape = {5440, 2048}
+INFO:hf-to-gguf:blk.2.ffn_gate.weight, torch.bfloat16 --> BF16, shape = {2048, 5440}
+INFO:hf-to-gguf:blk.2.ffn_up.weight, torch.bfloat16 --> BF16, shape = {2048, 5440}
+INFO:hf-to-gguf:blk.2.ffn_norm.weight, torch.bfloat16 --> F32, shape = {2048}
+INFO:hf-to-gguf:blk.2.attn_k.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.2.attn_output.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.2.attn_q.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.2.attn_v.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.20.attn_norm.weight, torch.bfloat16 --> F32, shape = {2048}
+INFO:hf-to-gguf:blk.20.ffn_down.weight, torch.bfloat16 --> BF16, shape = {5440, 2048}
+INFO:hf-to-gguf:blk.20.ffn_gate.weight, torch.bfloat16 --> BF16, shape = {2048, 5440}
+INFO:hf-to-gguf:blk.20.ffn_up.weight, torch.bfloat16 --> BF16, shape = {2048, 5440}
+INFO:hf-to-gguf:blk.20.ffn_norm.weight, torch.bfloat16 --> F32, shape = {2048}
+INFO:hf-to-gguf:blk.20.attn_k.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.20.attn_output.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.20.attn_q.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.20.attn_v.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.21.attn_norm.weight, torch.bfloat16 --> F32, shape = {2048}
+INFO:hf-to-gguf:blk.21.ffn_down.weight, torch.bfloat16 --> BF16, shape = {5440, 2048}
+INFO:hf-to-gguf:blk.21.ffn_gate.weight, torch.bfloat16 --> BF16, shape = {2048, 5440}
+INFO:hf-to-gguf:blk.21.ffn_up.weight, torch.bfloat16 --> BF16, shape = {2048, 5440}
+INFO:hf-to-gguf:blk.21.ffn_norm.weight, torch.bfloat16 --> F32, shape = {2048}
+INFO:hf-to-gguf:blk.21.attn_k.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.21.attn_output.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.21.attn_q.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.21.attn_v.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.22.attn_norm.weight, torch.bfloat16 --> F32, shape = {2048}
+INFO:hf-to-gguf:blk.22.ffn_down.weight, torch.bfloat16 --> BF16, shape = {5440, 2048}
+INFO:hf-to-gguf:blk.22.ffn_gate.weight, torch.bfloat16 --> BF16, shape = {2048, 5440}
+INFO:hf-to-gguf:blk.22.ffn_up.weight, torch.bfloat16 --> BF16, shape = {2048, 5440}
+INFO:hf-to-gguf:blk.22.ffn_norm.weight, torch.bfloat16 --> F32, shape = {2048}
+INFO:hf-to-gguf:blk.22.attn_k.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.22.attn_output.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.22.attn_q.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.22.attn_v.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.23.attn_norm.weight, torch.bfloat16 --> F32, shape = {2048}
+INFO:hf-to-gguf:blk.23.ffn_down.weight, torch.bfloat16 --> BF16, shape = {5440, 2048}
+INFO:hf-to-gguf:blk.23.ffn_gate.weight, torch.bfloat16 --> BF16, shape = {2048, 5440}
+INFO:hf-to-gguf:blk.23.ffn_up.weight, torch.bfloat16 --> BF16, shape = {2048, 5440}
+INFO:hf-to-gguf:blk.23.ffn_norm.weight, torch.bfloat16 --> F32, shape = {2048}
+INFO:hf-to-gguf:blk.23.attn_k.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.23.attn_output.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.23.attn_q.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.23.attn_v.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.3.attn_norm.weight, torch.bfloat16 --> F32, shape = {2048}
+INFO:hf-to-gguf:blk.3.ffn_down.weight, torch.bfloat16 --> BF16, shape = {5440, 2048}
+INFO:hf-to-gguf:blk.3.ffn_gate.weight, torch.bfloat16 --> BF16, shape = {2048, 5440}
+INFO:hf-to-gguf:blk.3.ffn_up.weight, torch.bfloat16 --> BF16, shape = {2048, 5440}
+INFO:hf-to-gguf:blk.3.ffn_norm.weight, torch.bfloat16 --> F32, shape = {2048}
+INFO:hf-to-gguf:blk.3.attn_k.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.3.attn_output.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.3.attn_q.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.3.attn_v.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.4.attn_norm.weight, torch.bfloat16 --> F32, shape = {2048}
+INFO:hf-to-gguf:blk.4.ffn_down.weight, torch.bfloat16 --> BF16, shape = {5440, 2048}
+INFO:hf-to-gguf:blk.4.ffn_gate.weight, torch.bfloat16 --> BF16, shape = {2048, 5440}
+INFO:hf-to-gguf:blk.4.ffn_up.weight, torch.bfloat16 --> BF16, shape = {2048, 5440}
+INFO:hf-to-gguf:blk.4.ffn_norm.weight, torch.bfloat16 --> F32, shape = {2048}
+INFO:hf-to-gguf:blk.4.attn_k.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.4.attn_output.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.4.attn_q.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.4.attn_v.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.5.attn_norm.weight, torch.bfloat16 --> F32, shape = {2048}
+INFO:hf-to-gguf:blk.5.ffn_down.weight, torch.bfloat16 --> BF16, shape = {5440, 2048}
+INFO:hf-to-gguf:blk.5.ffn_gate.weight, torch.bfloat16 --> BF16, shape = {2048, 5440}
+INFO:hf-to-gguf:blk.5.ffn_up.weight, torch.bfloat16 --> BF16, shape = {2048, 5440}
+INFO:hf-to-gguf:blk.5.ffn_norm.weight, torch.bfloat16 --> F32, shape = {2048}
+INFO:hf-to-gguf:blk.5.attn_k.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.5.attn_output.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.5.attn_q.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.5.attn_v.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.6.attn_norm.weight, torch.bfloat16 --> F32, shape = {2048}
+INFO:hf-to-gguf:blk.6.ffn_down.weight, torch.bfloat16 --> BF16, shape = {5440, 2048}
+INFO:hf-to-gguf:blk.6.ffn_gate.weight, torch.bfloat16 --> BF16, shape = {2048, 5440}
+INFO:hf-to-gguf:blk.6.ffn_up.weight, torch.bfloat16 --> BF16, shape = {2048, 5440}
+INFO:hf-to-gguf:blk.6.ffn_norm.weight, torch.bfloat16 --> F32, shape = {2048}
+INFO:hf-to-gguf:blk.6.attn_k.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.6.attn_output.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.6.attn_q.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.6.attn_v.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.7.attn_norm.weight, torch.bfloat16 --> F32, shape = {2048}
+INFO:hf-to-gguf:blk.7.ffn_down.weight, torch.bfloat16 --> BF16, shape = {5440, 2048}
+INFO:hf-to-gguf:blk.7.ffn_gate.weight, torch.bfloat16 --> BF16, shape = {2048, 5440}
+INFO:hf-to-gguf:blk.7.ffn_up.weight, torch.bfloat16 --> BF16, shape = {2048, 5440}
+INFO:hf-to-gguf:blk.7.ffn_norm.weight, torch.bfloat16 --> F32, shape = {2048}
+INFO:hf-to-gguf:blk.7.attn_k.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.7.attn_output.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.7.attn_q.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.7.attn_v.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.8.attn_norm.weight, torch.bfloat16 --> F32, shape = {2048}
+INFO:hf-to-gguf:blk.8.ffn_down.weight, torch.bfloat16 --> BF16, shape = {5440, 2048}
+INFO:hf-to-gguf:blk.8.ffn_gate.weight, torch.bfloat16 --> BF16, shape = {2048, 5440}
+INFO:hf-to-gguf:blk.8.ffn_up.weight, torch.bfloat16 --> BF16, shape = {2048, 5440}
+INFO:hf-to-gguf:blk.8.ffn_norm.weight, torch.bfloat16 --> F32, shape = {2048}
+INFO:hf-to-gguf:blk.8.attn_k.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.8.attn_output.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.8.attn_q.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.8.attn_v.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.9.attn_norm.weight, torch.bfloat16 --> F32, shape = {2048}
+INFO:hf-to-gguf:blk.9.ffn_down.weight, torch.bfloat16 --> BF16, shape = {5440, 2048}
+INFO:hf-to-gguf:blk.9.ffn_gate.weight, torch.bfloat16 --> BF16, shape = {2048, 5440}
+INFO:hf-to-gguf:blk.9.ffn_up.weight, torch.bfloat16 --> BF16, shape = {2048, 5440}
+INFO:hf-to-gguf:blk.9.ffn_norm.weight, torch.bfloat16 --> F32, shape = {2048}
+INFO:hf-to-gguf:blk.9.attn_k.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.9.attn_output.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.9.attn_q.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:blk.9.attn_v.weight, torch.bfloat16 --> BF16, shape = {2048, 2048}
+INFO:hf-to-gguf:output_norm.weight, torch.bfloat16 --> F32, shape = {2048}
+INFO:hf-to-gguf:Set meta model
+INFO:hf-to-gguf:Set model parameters
+INFO:hf-to-gguf:gguf: context length = 8192
+INFO:hf-to-gguf:gguf: embedding length = 2048
+INFO:hf-to-gguf:gguf: feed forward length = 5440
+INFO:hf-to-gguf:gguf: head count = 16
+INFO:hf-to-gguf:gguf: key-value head count = 16
+INFO:hf-to-gguf:gguf: rope theta = 10000.0
+INFO:hf-to-gguf:gguf: rms norm epsilon = 1e-05
+INFO:hf-to-gguf:gguf: file type = 32
+INFO:hf-to-gguf:Set model tokenizer
+INFO:gguf.vocab:Setting special token type bos to 1
+INFO:gguf.vocab:Setting special token type eos to 2
+INFO:gguf.vocab:Setting special token type unk to 0
+INFO:gguf.vocab:Setting add_bos_token to True
+INFO:gguf.vocab:Setting add_eos_token to False
+INFO:hf-to-gguf:Set model quantization version
+INFO:gguf.gguf_writer:Writing the following files:
+INFO:gguf.gguf_writer:salamandra-2b_bf16.gguf: n_tensors = 219, total_size = 4.5G
+Writing: 100%|████████████████████████████████████████████████████████████████████| 4.51G/4.51G [00:10<00:00, 419Mbyte/s]
+INFO:hf-to-gguf:Model successfully exported to salamandra-2b_bf16.gguf
git_snapshot.txt
ADDED
@@ -0,0 +1,3 @@
+upstream: https://huggingface.co/BSC-LT/salamandra-2b
+branch: origin/main
+hash: f1f8713d7c0114f1f60fc274428cd158039e7425
model.safetensors → imatrix/oscar/imatrix-dataset.txt
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:c7830c495e88be1484f0fdd9a4f0b405cb88c0482283b0cf478238aabfcf2840
+size 101131321
tokenizer.model → imatrix/oscar/imatrix.dat
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:194d585dd3bb27574dad0e5da861492ac47104929a182fcebbd88c41567a95e9
+size 1707457
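
Taken together, these files describe the calibration workflow: the per-language text samples under imatrix/oscar/langs/ (listed below) are presumably concatenated into imatrix-dataset.txt, which llama.cpp's llama-imatrix tool then runs against the bf16 model to produce imatrix.dat. A representative invocation (file names taken from this repository's layout; the exact commands are not recorded here) would be:

    cat imatrix/oscar/langs/*.txt > imatrix/oscar/imatrix-dataset.txt
    ./llama-imatrix -m ./salamandra-2b_bf16.gguf -f imatrix/oscar/imatrix-dataset.txt -o imatrix/oscar/imatrix.dat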
imatrix/oscar/langs/bg.txt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7db75e8d3ab29d086a9f95843710c8e26eb82492feaba8892d98f7cd42c958c5
+size 15369727
imatrix/oscar/langs/ca.txt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:620a3fb2dadf2ad55e32248e1dc5455f7f27470a1e25893f0470eaf53b9e4364
+size 2452735
imatrix/oscar/langs/cs.txt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4bac9a98f322e5773b8e3e6abcd06ce243ad9ce1edf992db0ab5d9eea6e17490
+size 6374140
imatrix/oscar/langs/cy.txt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:41072c1798493fd14eaebd5ee9820d71ea803ee42b793f40248ab536ff5421c4
+size 1711900
imatrix/oscar/langs/da.txt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3e312b03a2cdf388224719c8e8b4b34911db629e77a818a6b272151c0746308a
+size 2845693
imatrix/oscar/langs/de.txt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c4c2008f2af94e4f62cca8dd03eb4e439b916b689b82545345369bec5c81a024
+size 2820404
imatrix/oscar/langs/el.txt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9fd2393f1b32910ca111e8a3d86a29b3c7ad5d2ca81ce756411c34f133759e1d
+size 22016087
imatrix/oscar/langs/en.txt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:af4f2a2b491f7a7c5eede69a5869762d7f2b4ef9f5e8960f3e34c96754f3e4b0
+size 6204830
imatrix/oscar/langs/es.txt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:71fbce9f6c2c8875a37c98bd8eed4144b64c91596214ff35676a3893d2d810ad
+size 4007613
imatrix/oscar/langs/et.txt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1e706f41af29936a9033192e73b11637bd0160c5d176beae051e9fbc2f8719d0
+size 2366889
imatrix/oscar/langs/eu.txt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ce9ef1dc787847b03b0d6af4bee5a097b38f55a9a3c3e9fc85727c1b50a2c964
+size 793185
imatrix/oscar/langs/fi.txt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:652fc7fb857799b7e01382240f340324a116d1bd0d60f7ac78c2112b2b264581
+size 4324807
imatrix/oscar/langs/fr.txt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:75ff98d097421bc9d6dad95c84904fb25241123e6a397a47759dd5dba1b54246
+size 3227190
imatrix/oscar/langs/ga.txt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:21683a371855abef07d7f84b05a03617d60e969498eea2ba3b589f3997c2d6aa
+size 1912591
imatrix/oscar/langs/gl.txt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7025affdb2538372f012f4213048ebaca72aa14fbbb14983d91f821aa5a42ef6
+size 1375166
imatrix/oscar/langs/hr.txt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:838dce29fce8638d515bc64e3d33763a25c2f13a1ae5878e1412f23797345411
+size 469023
imatrix/oscar/langs/hu.txt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d856286560de976671741744705c7ef227be2dd648bcf320fe63a0a6ef1aeea1
+size 6021141
imatrix/oscar/langs/it.txt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:220bc92cd65fbd5b432a453debe102e1b0c947a492cec6d3ff5a5b87cf3c7eac
+size 3893483
imatrix/oscar/langs/lt.txt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:db1dff9403e72501447e46fb959b92740ca94afe57ada684e010e8dae3875559
+size 3543428
imatrix/oscar/langs/lv.txt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a0fa438a0385c7b8d5f6cdefdc7a025af13bd85916409c3cc0087d2589b91d2a
+size 2837848
imatrix/oscar/langs/mt.txt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:097c73ca100485b1b6f7cdfee2e34ef6cf85bdf350cbf91e0328017adbcdab73
+size 966065
imatrix/oscar/langs/nl.txt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:749ff8293fdf1552e4e1c309431f4305260676945882c721e4287c1e7608d6c7
+size 3201009
imatrix/oscar/langs/nn.txt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:996c7311ab41ee25bee21d9125e8bb7fefcc3352ca4edaf2429b8e5ff0f3ad42
+size 473060
imatrix/oscar/langs/no.txt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:018f84ce699d6ce8dcffc1f4ea9ce69ce29f60dff5c822039c1575f41e6f92fa
+size 2004094
imatrix/oscar/langs/oc.txt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f4f77cb54290063b6089120728e15468db7b212bf617c4678014503c866ede5c
+size 672153
imatrix/oscar/langs/pl.txt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b8d98cc00d727be32032107f9deae185aa2aac04d79a563b39069b50c288d09d
+size 3187625
imatrix/oscar/langs/pt.txt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b2d66c7bb0058423bf5bb97809c907bffa439e2ca48eac4a1fd87a0c2475c25c
+size 3828558