OccamRazor commited on
Commit
899145d
·
1 Parent(s): 1d65fea

Vulkan IQ4_NL Support (llama/8613)

Browse files

* Fix Vulkan matmul tests compile errors

* Add Vulkan IQ4_NL support

* Fix Vulkan DeepSeek-Coder-V2-Lite MoE support

Files changed (1) hide show
  1. ggml/src/ggml-vulkan.cpp +140 -182
ggml/src/ggml-vulkan.cpp CHANGED
@@ -38,8 +38,6 @@
38
  #define VK_DEVICE_DESCRIPTOR_POOL_MODE_MULTI 1
39
  #define VK_DEVICE_DESCRIPTOR_POOL_MODE_SINGLE 2
40
 
41
- #define VK_NUM_TYPES 16
42
-
43
  #define GGML_VK_MAX_NODES 8192
44
 
45
  #define MAX_VK_BUFFERS 256
@@ -162,23 +160,23 @@ struct vk_device_struct {
162
  vk_matmul_pipeline pipeline_matmul_f16_f32;
163
  vk_pipeline pipeline_matmul_split_k_reduce;
164
 
165
- vk_matmul_pipeline pipeline_dequant_mul_mat_mat[VK_NUM_TYPES];
166
 
167
  vk_matmul_pipeline pipeline_matmul_id_f32;
168
  vk_matmul_pipeline pipeline_matmul_id_f16;
169
  vk_matmul_pipeline pipeline_matmul_id_f16_f32;
170
 
171
- vk_matmul_pipeline pipeline_dequant_mul_mat_mat_id[VK_NUM_TYPES];
172
 
173
- vk_pipeline pipeline_dequant[VK_NUM_TYPES];
174
- vk_pipeline pipeline_dequant_mul_mat_vec_f32_f32[VK_NUM_TYPES];
175
- vk_pipeline pipeline_dequant_mul_mat_vec_f16_f32[VK_NUM_TYPES];
176
- vk_pipeline pipeline_dequant_mul_mat_vec_id_f32[VK_NUM_TYPES];
177
 
178
  vk_pipeline pipeline_mul_mat_vec_p021_f16_f32;
179
  vk_pipeline pipeline_mul_mat_vec_nc_f16_f32;
180
- vk_pipeline pipeline_get_rows[VK_NUM_TYPES];
181
- vk_pipeline pipeline_get_rows_f32[VK_NUM_TYPES];
182
  vk_pipeline pipeline_mul_f32;
183
  vk_pipeline pipeline_div_f32;
184
  vk_pipeline pipeline_add_f32;
@@ -1059,25 +1057,6 @@ static void ggml_vk_wait_events(vk_context * ctx, std::vector<vk::Event>&& event
1059
  );
1060
  }
1061
 
1062
- static bool ggml_vk_build_shader(ggml_type type) {
1063
- switch(type) {
1064
- case GGML_TYPE_F16:
1065
- case GGML_TYPE_Q4_0:
1066
- case GGML_TYPE_Q4_1:
1067
- case GGML_TYPE_Q5_0:
1068
- case GGML_TYPE_Q5_1:
1069
- case GGML_TYPE_Q8_0:
1070
- case GGML_TYPE_Q2_K:
1071
- case GGML_TYPE_Q3_K:
1072
- case GGML_TYPE_Q4_K:
1073
- case GGML_TYPE_Q5_K:
1074
- case GGML_TYPE_Q6_K:
1075
- return true;
1076
- default:
1077
- return false;
1078
- }
1079
- }
1080
-
1081
  static void ggml_vk_load_shaders(vk_device& device) {
1082
  VK_LOG_DEBUG("ggml_vk_load_shaders(" << device->name << ")");
1083
 
@@ -1112,6 +1091,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
1112
  device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K] = std::make_shared<vk_matmul_pipeline_struct>();
1113
  device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K] = std::make_shared<vk_matmul_pipeline_struct>();
1114
  device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K] = std::make_shared<vk_matmul_pipeline_struct>();
 
1115
 
1116
  device->pipeline_matmul_id_f32 = std::make_shared<vk_matmul_pipeline_struct>();
1117
  device->pipeline_matmul_id_f16_f32 = std::make_shared<vk_matmul_pipeline_struct>();
@@ -1126,6 +1106,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
1126
  device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K] = std::make_shared<vk_matmul_pipeline_struct>();
1127
  device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K] = std::make_shared<vk_matmul_pipeline_struct>();
1128
  device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K] = std::make_shared<vk_matmul_pipeline_struct>();
 
1129
 
1130
  if (device->fp16) {
1131
  ggml_vk_create_pipeline(device, device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_f32_len, matmul_f32_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
@@ -1226,6 +1207,13 @@ static void ggml_vk_load_shaders(vk_device& device) {
1226
  ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_m, "matmul_q6_k_f32_aligned_m", matmul_q6_k_f32_aligned_len, matmul_q6_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1227
  ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_s, "matmul_q6_k_f32_aligned_s", matmul_q6_k_f32_aligned_len, matmul_q6_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1228
 
 
 
 
 
 
 
 
1229
  ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f32->l, "matmul_id_f32_l", matmul_id_f32_f32_len, matmul_id_f32_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1);
1230
  ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f32->m, "matmul_id_f32_m", matmul_id_f32_f32_len, matmul_id_f32_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1);
1231
  ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f32->s, "matmul_id_f32_s", matmul_id_f32_f32_len, matmul_id_f32_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, 1);
@@ -1316,6 +1304,13 @@ static void ggml_vk_load_shaders(vk_device& device) {
1316
  ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_l, "matmul_id_q6_k_f32_aligned_l", matmul_id_q6_k_f32_aligned_len, matmul_id_q6_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1317
  ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_m, "matmul_id_q6_k_f32_aligned_m", matmul_id_q6_k_f32_aligned_len, matmul_id_q6_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1318
  ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_s, "matmul_id_q6_k_f32_aligned_s", matmul_id_q6_k_f32_aligned_len, matmul_id_q6_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
 
 
 
 
 
 
 
1319
  } else {
1320
  ggml_vk_create_pipeline(device, device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_f32_fp32_len, matmul_f32_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
1321
  ggml_vk_create_pipeline(device, device->pipeline_matmul_f32->m, "matmul_f32_m", matmul_f32_f32_fp32_len, matmul_f32_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
@@ -1415,6 +1410,13 @@ static void ggml_vk_load_shaders(vk_device& device) {
1415
  ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_m, "matmul_q6_k_f32_aligned_m", matmul_q6_k_f32_aligned_fp32_len, matmul_q6_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1416
  ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_s, "matmul_q6_k_f32_aligned_s", matmul_q6_k_f32_aligned_fp32_len, matmul_q6_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1417
 
 
 
 
 
 
 
 
1418
  ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f32->l, "matmul_id_f32_l", matmul_id_f32_f32_fp32_len, matmul_id_f32_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1);
1419
  ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f32->m, "matmul_id_f32_m", matmul_id_f32_f32_fp32_len, matmul_id_f32_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1);
1420
  ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f32->s, "matmul_id_f32_s", matmul_id_f32_f32_fp32_len, matmul_id_f32_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, 1);
@@ -1505,6 +1507,13 @@ static void ggml_vk_load_shaders(vk_device& device) {
1505
  ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_l, "matmul_id_q6_k_f32_aligned_l", matmul_id_q6_k_f32_aligned_fp32_len, matmul_id_q6_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1506
  ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_m, "matmul_id_q6_k_f32_aligned_m", matmul_id_q6_k_f32_aligned_fp32_len, matmul_id_q6_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1507
  ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_s, "matmul_id_q6_k_f32_aligned_s", matmul_id_q6_k_f32_aligned_fp32_len, matmul_id_q6_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
 
 
 
 
 
 
 
1508
  }
1509
 
1510
  // mul mat vec
@@ -1520,6 +1529,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
1520
  ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f32_f32", mul_mat_vec_q4_k_f32_f32_len, mul_mat_vec_q4_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1521
  ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f32_f32", mul_mat_vec_q5_k_f32_f32_len, mul_mat_vec_q5_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1522
  ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f32_f32", mul_mat_vec_q6_k_f32_f32_len, mul_mat_vec_q6_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
 
1523
 
1524
  ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F32 ], "mul_mat_vec_f32_f16_f32", mul_mat_vec_f32_f16_f32_len, mul_mat_vec_f32_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1525
  ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f16_f32", mul_mat_vec_f16_f16_f32_len, mul_mat_vec_f16_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
@@ -1533,6 +1543,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
1533
  ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f16_f32", mul_mat_vec_q4_k_f16_f32_len, mul_mat_vec_q4_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1534
  ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f16_f32", mul_mat_vec_q5_k_f16_f32_len, mul_mat_vec_q5_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1535
  ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f16_f32", mul_mat_vec_q6_k_f16_f32_len, mul_mat_vec_q6_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
 
1536
 
1537
  ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F32 ], "mul_mat_vec_id_f32_f32", mul_mat_vec_id_f32_f32_len, mul_mat_vec_id_f32_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1538
  ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F16 ], "mul_mat_vec_id_f16_f32", mul_mat_vec_id_f16_f32_len, mul_mat_vec_id_f16_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
@@ -1546,6 +1557,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
1546
  ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_f32", mul_mat_vec_id_q4_k_f32_len, mul_mat_vec_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1547
  ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_f32", mul_mat_vec_id_q5_k_f32_len, mul_mat_vec_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1548
  ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_f32", mul_mat_vec_id_q6_k_f32_len, mul_mat_vec_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
 
1549
 
1550
  // dequant shaders
1551
  ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_F32 ], "f32_to_f16", dequant_f32_len, dequant_f32_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
@@ -1559,6 +1571,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
1559
  ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q4_K], "dequant_q4_k", dequant_q4_k_len, dequant_q4_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1);
1560
  ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q5_K], "dequant_q5_k", dequant_q5_k_len, dequant_q5_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
1561
  ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q6_K], "dequant_q6_k", dequant_q6_k_len, dequant_q6_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
 
1562
 
1563
  // get_rows
1564
  ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_F32 ], "get_rows_f32", get_rows_f32_len, get_rows_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
@@ -1568,6 +1581,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
1568
  ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q5_0], "get_rows_q5_0", get_rows_q5_0_len, get_rows_q5_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
1569
  ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q5_1], "get_rows_q5_1", get_rows_q5_1_len, get_rows_q5_1_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
1570
  ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q8_0], "get_rows_q8_0", get_rows_q8_0_len, get_rows_q8_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
 
1571
 
1572
  ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_F32 ], "get_rows_f32_f32", get_rows_f32_f32_len, get_rows_f32_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
1573
  ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_F16 ], "get_rows_f16_f32", get_rows_f16_f32_len, get_rows_f16_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
@@ -1576,6 +1590,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
1576
  ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q5_0], "get_rows_q5_0_f32", get_rows_q5_0_f32_len, get_rows_q5_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
1577
  ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q5_1], "get_rows_q5_1_f32", get_rows_q5_1_f32_len, get_rows_q5_1_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
1578
  ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q8_0], "get_rows_q8_0_f32", get_rows_q8_0_f32_len, get_rows_q8_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
 
1579
 
1580
  ggml_vk_create_pipeline(device, device->pipeline_matmul_split_k_reduce, "split_k_reduce", split_k_reduce_len, split_k_reduce_data, "main", 2, 2 * sizeof(uint32_t), {256, 1, 1}, {}, 1);
1581
 
@@ -2087,6 +2102,7 @@ static vk_pipeline ggml_vk_get_to_fp16(ggml_backend_vk_context * ctx, ggml_type
2087
  case GGML_TYPE_Q4_K:
2088
  case GGML_TYPE_Q5_K:
2089
  case GGML_TYPE_Q6_K:
 
2090
  break;
2091
  default:
2092
  return nullptr;
@@ -2123,6 +2139,7 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_conte
2123
  case GGML_TYPE_Q4_K:
2124
  case GGML_TYPE_Q5_K:
2125
  case GGML_TYPE_Q6_K:
 
2126
  break;
2127
  default:
2128
  return nullptr;
@@ -2148,6 +2165,7 @@ static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context *
2148
  case GGML_TYPE_Q4_K:
2149
  case GGML_TYPE_Q5_K:
2150
  case GGML_TYPE_Q6_K:
 
2151
  break;
2152
  default:
2153
  return nullptr;
@@ -2181,6 +2199,7 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_id_pipeline(ggml_backend_vk_co
2181
  case GGML_TYPE_Q4_K:
2182
  case GGML_TYPE_Q5_K:
2183
  case GGML_TYPE_Q6_K:
 
2184
  break;
2185
  default:
2186
  return nullptr;
@@ -2206,6 +2225,7 @@ static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec_id(ggml_backend_vk_context
2206
  case GGML_TYPE_Q4_K:
2207
  case GGML_TYPE_Q5_K:
2208
  case GGML_TYPE_Q6_K:
 
2209
  break;
2210
  default:
2211
  return nullptr;
@@ -3431,7 +3451,7 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context *
3431
 
3432
  const uint64_t nei0 = ids->ne[0];
3433
  const uint64_t nei1 = ids->ne[1];
3434
- GGML_ASSERT(nei0 * nei1 <= 2048);
3435
 
3436
  const uint32_t nbi1 = ids->nb[1];
3437
  const uint32_t nbi2 = ids->nb[2];
@@ -3443,8 +3463,6 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context *
3443
 
3444
  const uint64_t n_as = ne02;
3445
 
3446
- GGML_ASSERT(n_as <= 8);
3447
-
3448
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
3449
  ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
3450
  ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra;
@@ -4623,22 +4641,22 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
4623
  }
4624
  }
4625
 
4626
- ggml_pipeline_allocate_descriptor_sets(ctx, p, num_it);
4627
  if (split_k > 1) {
4628
- ggml_pipeline_allocate_descriptor_sets(ctx, ctx->device->pipeline_matmul_split_k_reduce, num_it);
4629
 
4630
  if (ctx->prealloc_split_k == nullptr || ctx->prealloc_split_k->size < sizeof(float) * d_ne * split_k) {
4631
  // Resize buffer
4632
  if (ctx->prealloc_split_k != nullptr) {
4633
  ggml_vk_destroy_buffer(ctx->prealloc_split_k);
4634
  }
4635
- ctx->prealloc_split_k = ggml_vk_create_buffer_check(ctx, sizeof(float) * d_ne * split_k, vk::MemoryPropertyFlagBits::eDeviceLocal);
4636
  }
4637
  }
4638
 
4639
- vk_buffer d_X = ggml_vk_create_buffer_check(ctx, sizeof(X_TYPE) * x_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
4640
- vk_buffer d_Y = ggml_vk_create_buffer_check(ctx, sizeof(Y_TYPE) * y_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
4641
- vk_buffer d_D = ggml_vk_create_buffer_check(ctx, sizeof(float) * d_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
4642
 
4643
  X_TYPE* x = (X_TYPE *) malloc(sizeof(X_TYPE) * x_ne);
4644
  Y_TYPE* y = (Y_TYPE *) malloc(sizeof(Y_TYPE) * y_ne);
@@ -4665,12 +4683,12 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
4665
  }
4666
  }
4667
 
4668
- ggml_vk_buffer_write(ctx, d_X, 0, x, sizeof(X_TYPE) * k * m * batch);
4669
- ggml_vk_buffer_write(ctx, d_Y, 0, y, sizeof(Y_TYPE) * k * n * batch);
4670
 
4671
  vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
4672
  for (size_t i = 0; i < num_it; i++) {
4673
- ggml_vk_ctx_begin(ctx, subctx);
4674
  ggml_vk_matmul(
4675
  ctx, subctx, p, ggml_vk_subbuffer(d_X), ggml_vk_subbuffer(d_Y), ggml_vk_subbuffer(d_D), ggml_vk_subbuffer(ctx->prealloc_split_k),
4676
  m, n, k,
@@ -4689,7 +4707,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
4689
  double time = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0;
4690
 
4691
  // copy dst to host
4692
- ggml_vk_buffer_read(ctx, d_D, 0, d, sizeof(float) * d_ne);
4693
 
4694
  float * d_chk = (float *) malloc(sizeof(float) * d_ne);
4695
 
@@ -4765,7 +4783,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
4765
 
4766
  if (split_k > 1) {
4767
  float * split_k_buf = (float *) malloc(sizeof(float) * d_ne * split_k);
4768
- ggml_vk_buffer_read(ctx, ctx->prealloc_split_k, 0, split_k_buf, sizeof(float) * d_ne * split_k);
4769
 
4770
  std::cerr << "d_buf0: " << std::endl << std::endl;
4771
  ggml_vk_print_matrix_area(split_k_buf, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b);
@@ -4785,8 +4803,8 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
4785
 
4786
  free(d_chk);
4787
 
4788
- ggml_vk_queue_cleanup(ctx, ctx->device->transfer_queue);
4789
- ggml_vk_queue_cleanup(ctx, ctx->device->compute_queue);
4790
 
4791
  ggml_vk_destroy_buffer(d_X);
4792
  ggml_vk_destroy_buffer(d_Y);
@@ -4834,88 +4852,21 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, int i0, int i1
4834
  }
4835
  }
4836
 
4837
- static void ggml_vk_test_transfer(ggml_backend_vk_context * ctx, size_t ne, bool pinned) {
4838
- VK_LOG_DEBUG("ggml_vk_test_transfer(" << ne << ")");
4839
- // Check transfers are correct
4840
- vk_buffer buffer = ggml_vk_create_buffer_check(ctx, sizeof(float) * ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
4841
-
4842
- float * x;
4843
- float * y;
4844
- if (pinned) {
4845
- x = (float *) ggml_vk_host_malloc(ctx, sizeof(float) * ne);
4846
- y = (float *) ggml_vk_host_malloc(ctx, sizeof(float) * ne);
4847
- } else {
4848
- x = (float *) malloc(sizeof(float) * ne);
4849
- y = (float *) malloc(sizeof(float) * ne);
4850
- }
4851
-
4852
- for (size_t i = 0; i < ne; i++) {
4853
- x[i] = rand() / (float)RAND_MAX;
4854
- }
4855
-
4856
- vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
4857
- ggml_vk_ctx_begin(ctx, subctx);
4858
-
4859
- auto begin = std::chrono::high_resolution_clock::now();
4860
-
4861
- ggml_vk_buffer_write_async(ctx, subctx, buffer, 0, x, sizeof(float) * ne);
4862
-
4863
- for (auto& cpy : subctx->in_memcpys) {
4864
- memcpy(cpy.dst, cpy.src, cpy.n);
4865
- }
4866
- subctx->in_memcpys.clear();
4867
-
4868
- ggml_vk_ctx_end(subctx);
4869
- ggml_vk_submit(subctx, ctx->fence);
4870
- VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_transfer waitForFences");
4871
- ctx->device->device.resetFences({ ctx->fence });
4872
-
4873
- auto end = std::chrono::high_resolution_clock::now();
4874
-
4875
- double ms_to_gpu = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0;
4876
-
4877
- ggml_vk_ctx_begin(ctx, subctx);
4878
-
4879
- begin = std::chrono::high_resolution_clock::now();
4880
-
4881
- ggml_vk_buffer_read_async(ctx, subctx, buffer, 0, y, sizeof(float) * ne);
4882
-
4883
- ggml_vk_ctx_end(subctx);
4884
- ggml_vk_submit(subctx, ctx->fence);
4885
- VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_transfer waitForFences");
4886
- ctx->device->device.resetFences({ ctx->fence });
4887
-
4888
- for (auto& cpy : subctx->out_memcpys) {
4889
- memcpy(cpy.dst, cpy.src, cpy.n);
4890
- }
4891
- subctx->out_memcpys.clear();
4892
-
4893
- end = std::chrono::high_resolution_clock::now();
4894
-
4895
- double ms_from_gpu = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0;
4896
 
4897
- double avg_err = 0.0;
4898
- for (size_t i = 0; i < ne; i++) {
4899
- avg_err += std::fabs(x[i] - y[i]);
 
4900
  }
4901
 
4902
- double kb = ne * sizeof(float) / 1024.0;
4903
-
4904
- std::cerr << "TEST TRANSFER " << kb << " KB to_gpu " << ms_to_gpu << "ms (" << kb / ms_to_gpu * 1000.0 / 1024.0 << " MB/s) from_gpu " << ms_from_gpu << "ms (" << kb / ms_from_gpu * 1000.0 / 1024.0 << " MB/s) avg_err=" << avg_err / ne << std::endl;
4905
-
4906
- ggml_vk_destroy_buffer(buffer);
4907
 
4908
- if (pinned) {
4909
- ggml_vk_host_free(ctx, x);
4910
- ggml_vk_host_free(ctx, y);
4911
- } else {
4912
- free(x);
4913
- free(y);
4914
- }
4915
- }
4916
 
4917
- static void ggml_vk_quantize_data(const float * from, void * to, size_t ne, ggml_type quant) {
4918
- ggml_quantize_chunk(quant, from, to, 0, 1, ne, nullptr);
4919
  }
4920
 
4921
  static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_type quant) {
@@ -4925,24 +4876,26 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_
4925
  const size_t qx_sz = ne * ggml_type_size(quant)/ggml_blck_size(quant);
4926
  float * x = (float *) malloc(x_sz);
4927
  void * qx = malloc(qx_sz);
4928
- vk_buffer qx_buf = ggml_vk_create_buffer_check(ctx, qx_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
4929
- vk_buffer x_buf = ggml_vk_create_buffer_check(ctx, x_sz_f16, vk::MemoryPropertyFlagBits::eDeviceLocal);
 
4930
  ggml_fp16_t * x_chk = (ggml_fp16_t *) malloc(x_sz_f16);
4931
 
4932
  for (size_t i = 0; i < ne; i++) {
4933
  x[i] = rand() / (float)RAND_MAX;
4934
  }
4935
 
4936
- vk_pipeline p = ctx->device->pipeline_dequant[quant];
4937
 
4938
  ggml_vk_quantize_data(x, qx, ne, quant);
 
4939
 
4940
- ggml_pipeline_allocate_descriptor_sets(ctx, p, 1);
4941
 
4942
- ggml_vk_buffer_write(ctx, qx_buf, 0, qx, qx_sz);
4943
 
4944
  vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
4945
- ggml_vk_ctx_begin(ctx, subctx);
4946
  const std::vector<uint32_t> pc = { 1, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne };
4947
  ggml_vk_dispatch_pipeline(ctx, subctx, p, { { qx_buf, 0, qx_sz }, { x_buf, 0, x_sz_f16 } }, pc.size() * sizeof(int), pc.data(), { (uint32_t)ne, 1, 1});
4948
  ggml_vk_ctx_end(subctx);
@@ -4956,13 +4909,13 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_
4956
  auto end = std::chrono::high_resolution_clock::now();
4957
 
4958
  double ms_dequant = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0;
4959
- ggml_vk_buffer_read(ctx, x_buf, 0, x_chk, x_sz_f16);
4960
 
4961
  int first_err = -1;
4962
 
4963
  double avg_err = 0.0;
4964
  for (size_t i = 0; i < ne; i++) {
4965
- double error = std::fabs(x[i] - ggml_fp16_to_fp32(x_chk[i]));
4966
  avg_err += error;
4967
 
4968
  if (first_err < 0 && error > 0.05) {
@@ -4982,7 +4935,7 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_
4982
  }
4983
  std::cerr << std::endl << "Expected result: " << std::endl << std::endl;
4984
  for (int i = std::max(0, first_err - 5); i < std::min((int)ne, first_err + 5); i++) {
4985
- std::cerr << x[i] << ", ";
4986
  }
4987
  std::cerr << std::endl;
4988
  }
@@ -4992,6 +4945,7 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_
4992
 
4993
  free(x);
4994
  free(qx);
 
4995
  free(x_chk);
4996
  }
4997
 
@@ -5040,9 +4994,9 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
5040
  float * x = (float *) malloc(x_sz);
5041
  float * y = (float *) malloc(y_sz);
5042
  void * qx = malloc(qx_sz);
5043
- vk_buffer qx_buf = ggml_vk_create_buffer_check(ctx, qx_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
5044
- vk_buffer y_buf = ggml_vk_create_buffer_check(ctx, y_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
5045
- vk_buffer d_buf = ggml_vk_create_buffer_check(ctx, d_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
5046
  float * d = (float *) malloc(d_sz);
5047
  float * d_chk = (float *) malloc(d_sz);
5048
 
@@ -5057,25 +5011,25 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
5057
  y[i] = (i % k == i / k) ? 1.0f : 0.0f;
5058
  }
5059
 
5060
- ggml_pipeline_allocate_descriptor_sets(ctx, p, num_it);
5061
  if (split_k > 1) {
5062
- ggml_pipeline_allocate_descriptor_sets(ctx, ctx->device->pipeline_matmul_split_k_reduce, num_it);
5063
 
5064
  if (ctx->prealloc_split_k == nullptr || ctx->prealloc_split_k->size < sizeof(float) * d_ne * split_k) {
5065
  // Resize buffer
5066
  if (ctx->prealloc_split_k != nullptr) {
5067
  ggml_vk_destroy_buffer(ctx->prealloc_split_k);
5068
  }
5069
- ctx->prealloc_split_k = ggml_vk_create_buffer_check(ctx, sizeof(float) * d_ne * split_k, vk::MemoryPropertyFlagBits::eDeviceLocal);
5070
  }
5071
  }
5072
 
5073
- ggml_vk_buffer_write(ctx, qx_buf, 0, qx, qx_sz);
5074
- ggml_vk_buffer_write(ctx, y_buf, 0, y, y_sz);
5075
 
5076
  vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
5077
  for (size_t i = 0; i < num_it; i++) {
5078
- ggml_vk_ctx_begin(ctx, subctx);
5079
  ggml_vk_matmul(
5080
  ctx, subctx, p, ggml_vk_subbuffer(qx_buf), ggml_vk_subbuffer(y_buf), ggml_vk_subbuffer(d_buf), ggml_vk_subbuffer(ctx->prealloc_split_k),
5081
  m, n, k,
@@ -5094,7 +5048,7 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
5094
  auto end = std::chrono::high_resolution_clock::now();
5095
 
5096
  double time_ms = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0;
5097
- ggml_vk_buffer_read(ctx, d_buf, 0, d, d_sz);
5098
 
5099
  ggml_init_params iparams = {
5100
  /*.mem_size =*/ 1024*1024*1024,
@@ -5149,7 +5103,7 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
5149
 
5150
  if (split_k > 1) {
5151
  float * split_k_buf = (float *) malloc(sizeof(float) * d_ne * split_k);
5152
- ggml_vk_buffer_read(ctx, ctx->prealloc_split_k, 0, split_k_buf, sizeof(float) * d_ne * split_k);
5153
 
5154
  std::cerr << "d_buf0: " << std::endl << std::endl;
5155
  ggml_vk_print_matrix_area(split_k_buf, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b);
@@ -5302,12 +5256,9 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
5302
 
5303
  static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
5304
  #if defined(GGML_VULKAN_RUN_TESTS)
5305
- ctx->staging = ggml_vk_create_buffer_check(ctx, 100ul * 1024ul * 1024ul,
5306
  vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
5307
  vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
5308
- ggml_vk_test_transfer(ctx, 8192 * 1000, false);
5309
- ggml_vk_test_transfer(ctx, 8192 * 1000, true);
5310
-
5311
  ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_F32);
5312
  ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q4_0);
5313
  ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q4_1);
@@ -5319,85 +5270,90 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
5319
  ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q4_K);
5320
  ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q5_K);
5321
  ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q6_K);
 
5322
 
5323
  ggml_vk_test_matmul<ggml_fp16_t, ggml_fp16_t>(ctx, 512, 512, 100, 32, 100, 1, 2);
5324
 
5325
  ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 1, 0);
5326
  ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 1, 1);
5327
  ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 1, 2);
5328
- ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 4, 0);
5329
- ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 4, 1);
5330
- ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 4, 2);
5331
 
5332
  ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q4_0);
5333
  ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q4_0);
5334
  ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q4_0);
5335
- ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q4_0);
5336
- ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q4_0);
5337
- ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q4_0);
5338
 
5339
  ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q4_1);
5340
  ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q4_1);
5341
  ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q4_1);
5342
- ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q4_1);
5343
- ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q4_1);
5344
- ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q4_1);
5345
 
5346
  ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q5_0);
5347
  ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q5_0);
5348
  ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q5_0);
5349
- ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q5_0);
5350
- ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q5_0);
5351
- ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q5_0);
5352
 
5353
  ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q5_1);
5354
  ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q5_1);
5355
  ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q5_1);
5356
- ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q5_1);
5357
- ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q5_1);
5358
- ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q5_1);
5359
 
5360
  ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q8_0);
5361
  ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q8_0);
5362
  ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q8_0);
5363
- ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q8_0);
5364
- ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q8_0);
5365
- ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q8_0);
5366
 
5367
  ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q2_K);
5368
  ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q2_K);
5369
  ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q2_K);
5370
- ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q2_K);
5371
- ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q2_K);
5372
- ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q2_K);
5373
 
5374
  ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q3_K);
5375
  ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q3_K);
5376
  ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q3_K);
5377
- ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q3_K);
5378
- ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q3_K);
5379
- ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q3_K);
5380
 
5381
  ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q4_K);
5382
  ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q4_K);
5383
  ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q4_K);
5384
- ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q4_K);
5385
- ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q4_K);
5386
- ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q4_K);
5387
 
5388
  ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q5_K);
5389
  ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q5_K);
5390
  ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q5_K);
5391
- ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q5_K);
5392
- ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q5_K);
5393
- ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q5_K);
5394
 
5395
  ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q6_K);
5396
  ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q6_K);
5397
  ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q6_K);
5398
- ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q6_K);
5399
- ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q6_K);
5400
- ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q6_K);
 
 
 
 
5401
 
5402
  std::cerr << std::endl;
5403
 
@@ -5429,9 +5385,9 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
5429
  ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 0);
5430
  ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 1);
5431
  ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 2);
5432
- ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 0);
5433
- ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 1);
5434
- ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 2);
5435
  std::cerr << std::endl;
5436
  }
5437
 
@@ -6263,6 +6219,7 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
6263
  case GGML_TYPE_Q4_K:
6264
  case GGML_TYPE_Q5_K:
6265
  case GGML_TYPE_Q6_K:
 
6266
  break;
6267
  default:
6268
  return false;
@@ -6291,6 +6248,7 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
6291
  case GGML_TYPE_Q5_0:
6292
  case GGML_TYPE_Q5_1:
6293
  case GGML_TYPE_Q8_0:
 
6294
  return true;
6295
  default:
6296
  return false;
 
38
  #define VK_DEVICE_DESCRIPTOR_POOL_MODE_MULTI 1
39
  #define VK_DEVICE_DESCRIPTOR_POOL_MODE_SINGLE 2
40
 
 
 
41
  #define GGML_VK_MAX_NODES 8192
42
 
43
  #define MAX_VK_BUFFERS 256
 
160
  vk_matmul_pipeline pipeline_matmul_f16_f32;
161
  vk_pipeline pipeline_matmul_split_k_reduce;
162
 
163
+ vk_matmul_pipeline pipeline_dequant_mul_mat_mat[GGML_TYPE_COUNT];
164
 
165
  vk_matmul_pipeline pipeline_matmul_id_f32;
166
  vk_matmul_pipeline pipeline_matmul_id_f16;
167
  vk_matmul_pipeline pipeline_matmul_id_f16_f32;
168
 
169
+ vk_matmul_pipeline pipeline_dequant_mul_mat_mat_id[GGML_TYPE_COUNT];
170
 
171
+ vk_pipeline pipeline_dequant[GGML_TYPE_COUNT];
172
+ vk_pipeline pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_COUNT];
173
+ vk_pipeline pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_COUNT];
174
+ vk_pipeline pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_COUNT];
175
 
176
  vk_pipeline pipeline_mul_mat_vec_p021_f16_f32;
177
  vk_pipeline pipeline_mul_mat_vec_nc_f16_f32;
178
+ vk_pipeline pipeline_get_rows[GGML_TYPE_COUNT];
179
+ vk_pipeline pipeline_get_rows_f32[GGML_TYPE_COUNT];
180
  vk_pipeline pipeline_mul_f32;
181
  vk_pipeline pipeline_div_f32;
182
  vk_pipeline pipeline_add_f32;
 
1057
  );
1058
  }
1059
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1060
  static void ggml_vk_load_shaders(vk_device& device) {
1061
  VK_LOG_DEBUG("ggml_vk_load_shaders(" << device->name << ")");
1062
 
 
1091
  device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K] = std::make_shared<vk_matmul_pipeline_struct>();
1092
  device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K] = std::make_shared<vk_matmul_pipeline_struct>();
1093
  device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K] = std::make_shared<vk_matmul_pipeline_struct>();
1094
+ device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL] = std::make_shared<vk_matmul_pipeline_struct>();
1095
 
1096
  device->pipeline_matmul_id_f32 = std::make_shared<vk_matmul_pipeline_struct>();
1097
  device->pipeline_matmul_id_f16_f32 = std::make_shared<vk_matmul_pipeline_struct>();
 
1106
  device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K] = std::make_shared<vk_matmul_pipeline_struct>();
1107
  device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K] = std::make_shared<vk_matmul_pipeline_struct>();
1108
  device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K] = std::make_shared<vk_matmul_pipeline_struct>();
1109
+ device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL] = std::make_shared<vk_matmul_pipeline_struct>();
1110
 
1111
  if (device->fp16) {
1112
  ggml_vk_create_pipeline(device, device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_f32_len, matmul_f32_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
 
1207
  ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_m, "matmul_q6_k_f32_aligned_m", matmul_q6_k_f32_aligned_len, matmul_q6_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1208
  ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_s, "matmul_q6_k_f32_aligned_s", matmul_q6_k_f32_aligned_len, matmul_q6_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1209
 
1210
+ ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->l, "matmul_iq4_nl_f32_l", matmul_iq4_nl_f32_len, matmul_iq4_nl_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1211
+ ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->m, "matmul_iq4_nl_f32_m", matmul_iq4_nl_f32_len, matmul_iq4_nl_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1212
+ ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->s, "matmul_iq4_nl_f32_s", matmul_iq4_nl_f32_len, matmul_iq4_nl_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1213
+ ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->a_l, "matmul_iq4_nl_f32_aligned_l", matmul_iq4_nl_f32_aligned_len, matmul_iq4_nl_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1214
+ ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->a_m, "matmul_iq4_nl_f32_aligned_m", matmul_iq4_nl_f32_aligned_len, matmul_iq4_nl_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1215
+ ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->a_s, "matmul_iq4_nl_f32_aligned_s", matmul_iq4_nl_f32_aligned_len, matmul_iq4_nl_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1216
+
1217
  ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f32->l, "matmul_id_f32_l", matmul_id_f32_f32_len, matmul_id_f32_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1);
1218
  ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f32->m, "matmul_id_f32_m", matmul_id_f32_f32_len, matmul_id_f32_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1);
1219
  ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f32->s, "matmul_id_f32_s", matmul_id_f32_f32_len, matmul_id_f32_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, 1);
 
1304
  ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_l, "matmul_id_q6_k_f32_aligned_l", matmul_id_q6_k_f32_aligned_len, matmul_id_q6_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1305
  ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_m, "matmul_id_q6_k_f32_aligned_m", matmul_id_q6_k_f32_aligned_len, matmul_id_q6_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1306
  ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_s, "matmul_id_q6_k_f32_aligned_s", matmul_id_q6_k_f32_aligned_len, matmul_id_q6_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1307
+
1308
+ ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->l, "matmul_id_iq4_nl_f32_l", matmul_id_iq4_nl_f32_len, matmul_id_iq4_nl_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1309
+ ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->m, "matmul_id_iq4_nl_f32_m", matmul_id_iq4_nl_f32_len, matmul_id_iq4_nl_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1310
+ ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->s, "matmul_id_iq4_nl_f32_s", matmul_id_iq4_nl_f32_len, matmul_id_iq4_nl_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1311
+ ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->a_l, "matmul_id_iq4_nl_f32_aligned_l", matmul_id_iq4_nl_f32_aligned_len, matmul_id_iq4_nl_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1312
+ ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->a_m, "matmul_id_iq4_nl_f32_aligned_m", matmul_id_iq4_nl_f32_aligned_len, matmul_id_iq4_nl_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1313
+ ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->a_s, "matmul_id_iq4_nl_f32_aligned_s", matmul_id_iq4_nl_f32_aligned_len, matmul_id_iq4_nl_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1314
  } else {
1315
  ggml_vk_create_pipeline(device, device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_f32_fp32_len, matmul_f32_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
1316
  ggml_vk_create_pipeline(device, device->pipeline_matmul_f32->m, "matmul_f32_m", matmul_f32_f32_fp32_len, matmul_f32_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
 
1410
  ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_m, "matmul_q6_k_f32_aligned_m", matmul_q6_k_f32_aligned_fp32_len, matmul_q6_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1411
  ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_s, "matmul_q6_k_f32_aligned_s", matmul_q6_k_f32_aligned_fp32_len, matmul_q6_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1412
 
1413
+ ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->l, "matmul_iq4_nl_f32_l", matmul_iq4_nl_f32_fp32_len, matmul_iq4_nl_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1414
+ ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->m, "matmul_iq4_nl_f32_m", matmul_iq4_nl_f32_fp32_len, matmul_iq4_nl_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1415
+ ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->s, "matmul_iq4_nl_f32_s", matmul_iq4_nl_f32_fp32_len, matmul_iq4_nl_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1416
+ ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->a_l, "matmul_iq4_nl_f32_aligned_l", matmul_iq4_nl_f32_aligned_fp32_len, matmul_iq4_nl_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1417
+ ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->a_m, "matmul_iq4_nl_f32_aligned_m", matmul_iq4_nl_f32_aligned_fp32_len, matmul_iq4_nl_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1418
+ ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->a_s, "matmul_iq4_nl_f32_aligned_s", matmul_iq4_nl_f32_aligned_fp32_len, matmul_iq4_nl_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1419
+
1420
  ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f32->l, "matmul_id_f32_l", matmul_id_f32_f32_fp32_len, matmul_id_f32_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1);
1421
  ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f32->m, "matmul_id_f32_m", matmul_id_f32_f32_fp32_len, matmul_id_f32_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1);
1422
  ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f32->s, "matmul_id_f32_s", matmul_id_f32_f32_fp32_len, matmul_id_f32_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, 1);
 
1507
  ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_l, "matmul_id_q6_k_f32_aligned_l", matmul_id_q6_k_f32_aligned_fp32_len, matmul_id_q6_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1508
  ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_m, "matmul_id_q6_k_f32_aligned_m", matmul_id_q6_k_f32_aligned_fp32_len, matmul_id_q6_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1509
  ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_s, "matmul_id_q6_k_f32_aligned_s", matmul_id_q6_k_f32_aligned_fp32_len, matmul_id_q6_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1510
+
1511
+ ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->l, "matmul_id_iq4_nl_f32_l", matmul_id_iq4_nl_f32_fp32_len, matmul_id_iq4_nl_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1512
+ ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->m, "matmul_id_iq4_nl_f32_m", matmul_id_iq4_nl_f32_fp32_len, matmul_id_iq4_nl_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1513
+ ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->s, "matmul_id_iq4_nl_f32_s", matmul_id_iq4_nl_f32_fp32_len, matmul_id_iq4_nl_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1514
+ ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->a_l, "matmul_id_iq4_nl_f32_aligned_l", matmul_id_iq4_nl_f32_aligned_fp32_len, matmul_id_iq4_nl_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
1515
+ ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->a_m, "matmul_id_iq4_nl_f32_aligned_m", matmul_id_iq4_nl_f32_aligned_fp32_len, matmul_id_iq4_nl_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
1516
+ ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->a_s, "matmul_id_iq4_nl_f32_aligned_s", matmul_id_iq4_nl_f32_aligned_fp32_len, matmul_id_iq4_nl_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
1517
  }
1518
 
1519
  // mul mat vec
 
1529
  ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f32_f32", mul_mat_vec_q4_k_f32_f32_len, mul_mat_vec_q4_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1530
  ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f32_f32", mul_mat_vec_q5_k_f32_f32_len, mul_mat_vec_q5_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1531
  ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f32_f32", mul_mat_vec_q6_k_f32_f32_len, mul_mat_vec_q6_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1532
+ ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f32_f32", mul_mat_vec_iq4_nl_f32_f32_len, mul_mat_vec_iq4_nl_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1533
 
1534
  ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F32 ], "mul_mat_vec_f32_f16_f32", mul_mat_vec_f32_f16_f32_len, mul_mat_vec_f32_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1535
  ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f16_f32", mul_mat_vec_f16_f16_f32_len, mul_mat_vec_f16_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
 
1543
  ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f16_f32", mul_mat_vec_q4_k_f16_f32_len, mul_mat_vec_q4_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1544
  ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f16_f32", mul_mat_vec_q5_k_f16_f32_len, mul_mat_vec_q5_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1545
  ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f16_f32", mul_mat_vec_q6_k_f16_f32_len, mul_mat_vec_q6_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1546
+ ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f16_f32", mul_mat_vec_iq4_nl_f16_f32_len, mul_mat_vec_iq4_nl_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1547
 
1548
  ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F32 ], "mul_mat_vec_id_f32_f32", mul_mat_vec_id_f32_f32_len, mul_mat_vec_id_f32_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1549
  ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F16 ], "mul_mat_vec_id_f16_f32", mul_mat_vec_id_f16_f32_len, mul_mat_vec_id_f16_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
 
1557
  ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_f32", mul_mat_vec_id_q4_k_f32_len, mul_mat_vec_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1558
  ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_f32", mul_mat_vec_id_q5_k_f32_len, mul_mat_vec_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1559
  ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_f32", mul_mat_vec_id_q6_k_f32_len, mul_mat_vec_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1560
+ ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_id_iq4_nl_f32", mul_mat_vec_id_iq4_nl_f32_len, mul_mat_vec_id_iq4_nl_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
1561
 
1562
  // dequant shaders
1563
  ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_F32 ], "f32_to_f16", dequant_f32_len, dequant_f32_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
 
1571
  ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q4_K], "dequant_q4_k", dequant_q4_k_len, dequant_q4_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1);
1572
  ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q5_K], "dequant_q5_k", dequant_q5_k_len, dequant_q5_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
1573
  ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q6_K], "dequant_q6_k", dequant_q6_k_len, dequant_q6_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
1574
+ ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_IQ4_NL], "dequant_iq4_nl", dequant_iq4_nl_len, dequant_iq4_nl_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
1575
 
1576
  // get_rows
1577
  ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_F32 ], "get_rows_f32", get_rows_f32_len, get_rows_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
 
1581
  ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q5_0], "get_rows_q5_0", get_rows_q5_0_len, get_rows_q5_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
1582
  ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q5_1], "get_rows_q5_1", get_rows_q5_1_len, get_rows_q5_1_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
1583
  ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q8_0], "get_rows_q8_0", get_rows_q8_0_len, get_rows_q8_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
1584
+ ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ4_NL], "get_rows_iq4_nl", get_rows_iq4_nl_len, get_rows_iq4_nl_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
1585
 
1586
  ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_F32 ], "get_rows_f32_f32", get_rows_f32_f32_len, get_rows_f32_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
1587
  ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_F16 ], "get_rows_f16_f32", get_rows_f16_f32_len, get_rows_f16_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
 
1590
  ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q5_0], "get_rows_q5_0_f32", get_rows_q5_0_f32_len, get_rows_q5_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
1591
  ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q5_1], "get_rows_q5_1_f32", get_rows_q5_1_f32_len, get_rows_q5_1_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
1592
  ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q8_0], "get_rows_q8_0_f32", get_rows_q8_0_f32_len, get_rows_q8_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
1593
+ ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_IQ4_NL], "get_rows_iq4_nl_f32", get_rows_iq4_nl_f32_len, get_rows_iq4_nl_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
1594
 
1595
  ggml_vk_create_pipeline(device, device->pipeline_matmul_split_k_reduce, "split_k_reduce", split_k_reduce_len, split_k_reduce_data, "main", 2, 2 * sizeof(uint32_t), {256, 1, 1}, {}, 1);
1596
 
 
2102
  case GGML_TYPE_Q4_K:
2103
  case GGML_TYPE_Q5_K:
2104
  case GGML_TYPE_Q6_K:
2105
+ case GGML_TYPE_IQ4_NL:
2106
  break;
2107
  default:
2108
  return nullptr;
 
2139
  case GGML_TYPE_Q4_K:
2140
  case GGML_TYPE_Q5_K:
2141
  case GGML_TYPE_Q6_K:
2142
+ case GGML_TYPE_IQ4_NL:
2143
  break;
2144
  default:
2145
  return nullptr;
 
2165
  case GGML_TYPE_Q4_K:
2166
  case GGML_TYPE_Q5_K:
2167
  case GGML_TYPE_Q6_K:
2168
+ case GGML_TYPE_IQ4_NL:
2169
  break;
2170
  default:
2171
  return nullptr;
 
2199
  case GGML_TYPE_Q4_K:
2200
  case GGML_TYPE_Q5_K:
2201
  case GGML_TYPE_Q6_K:
2202
+ case GGML_TYPE_IQ4_NL:
2203
  break;
2204
  default:
2205
  return nullptr;
 
2225
  case GGML_TYPE_Q4_K:
2226
  case GGML_TYPE_Q5_K:
2227
  case GGML_TYPE_Q6_K:
2228
+ case GGML_TYPE_IQ4_NL:
2229
  break;
2230
  default:
2231
  return nullptr;
 
3451
 
3452
  const uint64_t nei0 = ids->ne[0];
3453
  const uint64_t nei1 = ids->ne[1];
3454
+ GGML_ASSERT(nei0 * nei1 <= 3072);
3455
 
3456
  const uint32_t nbi1 = ids->nb[1];
3457
  const uint32_t nbi2 = ids->nb[2];
 
3463
 
3464
  const uint64_t n_as = ne02;
3465
 
 
 
3466
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
3467
  ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
3468
  ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra;
 
4641
  }
4642
  }
4643
 
4644
+ ggml_pipeline_allocate_descriptor_sets(ctx->device, p, num_it);
4645
  if (split_k > 1) {
4646
+ ggml_pipeline_allocate_descriptor_sets(ctx->device, ctx->device->pipeline_matmul_split_k_reduce, num_it);
4647
 
4648
  if (ctx->prealloc_split_k == nullptr || ctx->prealloc_split_k->size < sizeof(float) * d_ne * split_k) {
4649
  // Resize buffer
4650
  if (ctx->prealloc_split_k != nullptr) {
4651
  ggml_vk_destroy_buffer(ctx->prealloc_split_k);
4652
  }
4653
+ ctx->prealloc_split_k = ggml_vk_create_buffer_check(ctx->device, sizeof(float) * d_ne * split_k, vk::MemoryPropertyFlagBits::eDeviceLocal);
4654
  }
4655
  }
4656
 
4657
+ vk_buffer d_X = ggml_vk_create_buffer_check(ctx->device, sizeof(X_TYPE) * x_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
4658
+ vk_buffer d_Y = ggml_vk_create_buffer_check(ctx->device, sizeof(Y_TYPE) * y_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
4659
+ vk_buffer d_D = ggml_vk_create_buffer_check(ctx->device, sizeof(float) * d_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
4660
 
4661
  X_TYPE* x = (X_TYPE *) malloc(sizeof(X_TYPE) * x_ne);
4662
  Y_TYPE* y = (Y_TYPE *) malloc(sizeof(Y_TYPE) * y_ne);
 
4683
  }
4684
  }
4685
 
4686
+ ggml_vk_buffer_write(d_X, 0, x, sizeof(X_TYPE) * k * m * batch);
4687
+ ggml_vk_buffer_write(d_Y, 0, y, sizeof(Y_TYPE) * k * n * batch);
4688
 
4689
  vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
4690
  for (size_t i = 0; i < num_it; i++) {
4691
+ ggml_vk_ctx_begin(ctx->device, subctx);
4692
  ggml_vk_matmul(
4693
  ctx, subctx, p, ggml_vk_subbuffer(d_X), ggml_vk_subbuffer(d_Y), ggml_vk_subbuffer(d_D), ggml_vk_subbuffer(ctx->prealloc_split_k),
4694
  m, n, k,
 
4707
  double time = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0;
4708
 
4709
  // copy dst to host
4710
+ ggml_vk_buffer_read(d_D, 0, d, sizeof(float) * d_ne);
4711
 
4712
  float * d_chk = (float *) malloc(sizeof(float) * d_ne);
4713
 
 
4783
 
4784
  if (split_k > 1) {
4785
  float * split_k_buf = (float *) malloc(sizeof(float) * d_ne * split_k);
4786
+ ggml_vk_buffer_read(ctx->prealloc_split_k, 0, split_k_buf, sizeof(float) * d_ne * split_k);
4787
 
4788
  std::cerr << "d_buf0: " << std::endl << std::endl;
4789
  ggml_vk_print_matrix_area(split_k_buf, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b);
 
4803
 
4804
  free(d_chk);
4805
 
4806
+ ggml_vk_queue_cleanup(ctx->device, ctx->device->transfer_queue);
4807
+ ggml_vk_queue_cleanup(ctx->device, ctx->device->compute_queue);
4808
 
4809
  ggml_vk_destroy_buffer(d_X);
4810
  ggml_vk_destroy_buffer(d_Y);
 
4852
  }
4853
  }
4854
 
4855
+ static void ggml_vk_quantize_data(const float * from, void * to, size_t ne, ggml_type quant) {
4856
+ ggml_quantize_chunk(quant, from, to, 0, 1, ne, nullptr);
4857
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4858
 
4859
+ static void ggml_vk_dequantize_data(const void * from, float * to, size_t ne, ggml_type quant) {
4860
+ if (quant == GGML_TYPE_F32) {
4861
+ memcpy(to, from, sizeof(float) * ne);
4862
+ return;
4863
  }
4864
 
4865
+ ggml_type_traits_t tt = ggml_internal_get_type_traits(quant);
 
 
 
 
4866
 
4867
+ ggml_to_float_t dequant_fn = tt.to_float;
 
 
 
 
 
 
 
4868
 
4869
+ dequant_fn(from, to, ne);
 
4870
  }
4871
 
4872
  static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_type quant) {
 
4876
  const size_t qx_sz = ne * ggml_type_size(quant)/ggml_blck_size(quant);
4877
  float * x = (float *) malloc(x_sz);
4878
  void * qx = malloc(qx_sz);
4879
+ vk_buffer qx_buf = ggml_vk_create_buffer_check(ctx->device, qx_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
4880
+ vk_buffer x_buf = ggml_vk_create_buffer_check(ctx->device, x_sz_f16, vk::MemoryPropertyFlagBits::eDeviceLocal);
4881
+ float * x_ref = (float *) malloc(x_sz);
4882
  ggml_fp16_t * x_chk = (ggml_fp16_t *) malloc(x_sz_f16);
4883
 
4884
  for (size_t i = 0; i < ne; i++) {
4885
  x[i] = rand() / (float)RAND_MAX;
4886
  }
4887
 
4888
+ vk_pipeline p = ggml_vk_get_to_fp16(ctx, quant);
4889
 
4890
  ggml_vk_quantize_data(x, qx, ne, quant);
4891
+ ggml_vk_dequantize_data(qx, x_ref, ne, quant);
4892
 
4893
+ ggml_pipeline_allocate_descriptor_sets(ctx->device, p, 1);
4894
 
4895
+ ggml_vk_buffer_write(qx_buf, 0, qx, qx_sz);
4896
 
4897
  vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
4898
+ ggml_vk_ctx_begin(ctx->device, subctx);
4899
  const std::vector<uint32_t> pc = { 1, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne };
4900
  ggml_vk_dispatch_pipeline(ctx, subctx, p, { { qx_buf, 0, qx_sz }, { x_buf, 0, x_sz_f16 } }, pc.size() * sizeof(int), pc.data(), { (uint32_t)ne, 1, 1});
4901
  ggml_vk_ctx_end(subctx);
 
4909
  auto end = std::chrono::high_resolution_clock::now();
4910
 
4911
  double ms_dequant = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0;
4912
+ ggml_vk_buffer_read(x_buf, 0, x_chk, x_sz_f16);
4913
 
4914
  int first_err = -1;
4915
 
4916
  double avg_err = 0.0;
4917
  for (size_t i = 0; i < ne; i++) {
4918
+ double error = std::fabs(x_ref[i] - ggml_fp16_to_fp32(x_chk[i]));
4919
  avg_err += error;
4920
 
4921
  if (first_err < 0 && error > 0.05) {
 
4935
  }
4936
  std::cerr << std::endl << "Expected result: " << std::endl << std::endl;
4937
  for (int i = std::max(0, first_err - 5); i < std::min((int)ne, first_err + 5); i++) {
4938
+ std::cerr << x_ref[i] << ", ";
4939
  }
4940
  std::cerr << std::endl;
4941
  }
 
4945
 
4946
  free(x);
4947
  free(qx);
4948
+ free(x_ref);
4949
  free(x_chk);
4950
  }
4951
 
 
4994
  float * x = (float *) malloc(x_sz);
4995
  float * y = (float *) malloc(y_sz);
4996
  void * qx = malloc(qx_sz);
4997
+ vk_buffer qx_buf = ggml_vk_create_buffer_check(ctx->device, qx_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
4998
+ vk_buffer y_buf = ggml_vk_create_buffer_check(ctx->device, y_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
4999
+ vk_buffer d_buf = ggml_vk_create_buffer_check(ctx->device, d_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
5000
  float * d = (float *) malloc(d_sz);
5001
  float * d_chk = (float *) malloc(d_sz);
5002
 
 
5011
  y[i] = (i % k == i / k) ? 1.0f : 0.0f;
5012
  }
5013
 
5014
+ ggml_pipeline_allocate_descriptor_sets(ctx->device, p, num_it);
5015
  if (split_k > 1) {
5016
+ ggml_pipeline_allocate_descriptor_sets(ctx->device, ctx->device->pipeline_matmul_split_k_reduce, num_it);
5017
 
5018
  if (ctx->prealloc_split_k == nullptr || ctx->prealloc_split_k->size < sizeof(float) * d_ne * split_k) {
5019
  // Resize buffer
5020
  if (ctx->prealloc_split_k != nullptr) {
5021
  ggml_vk_destroy_buffer(ctx->prealloc_split_k);
5022
  }
5023
+ ctx->prealloc_split_k = ggml_vk_create_buffer_check(ctx->device, sizeof(float) * d_ne * split_k, vk::MemoryPropertyFlagBits::eDeviceLocal);
5024
  }
5025
  }
5026
 
5027
+ ggml_vk_buffer_write(qx_buf, 0, qx, qx_sz);
5028
+ ggml_vk_buffer_write(y_buf, 0, y, y_sz);
5029
 
5030
  vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
5031
  for (size_t i = 0; i < num_it; i++) {
5032
+ ggml_vk_ctx_begin(ctx->device, subctx);
5033
  ggml_vk_matmul(
5034
  ctx, subctx, p, ggml_vk_subbuffer(qx_buf), ggml_vk_subbuffer(y_buf), ggml_vk_subbuffer(d_buf), ggml_vk_subbuffer(ctx->prealloc_split_k),
5035
  m, n, k,
 
5048
  auto end = std::chrono::high_resolution_clock::now();
5049
 
5050
  double time_ms = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0;
5051
+ ggml_vk_buffer_read(d_buf, 0, d, d_sz);
5052
 
5053
  ggml_init_params iparams = {
5054
  /*.mem_size =*/ 1024*1024*1024,
 
5103
 
5104
  if (split_k > 1) {
5105
  float * split_k_buf = (float *) malloc(sizeof(float) * d_ne * split_k);
5106
+ ggml_vk_buffer_read(ctx->prealloc_split_k, 0, split_k_buf, sizeof(float) * d_ne * split_k);
5107
 
5108
  std::cerr << "d_buf0: " << std::endl << std::endl;
5109
  ggml_vk_print_matrix_area(split_k_buf, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b);
 
5256
 
5257
  static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
5258
  #if defined(GGML_VULKAN_RUN_TESTS)
5259
+ ctx->staging = ggml_vk_create_buffer_check(ctx->device, 100ul * 1024ul * 1024ul,
5260
  vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
5261
  vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
 
 
 
5262
  ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_F32);
5263
  ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q4_0);
5264
  ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q4_1);
 
5270
  ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q4_K);
5271
  ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q5_K);
5272
  ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q6_K);
5273
+ ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_IQ4_NL);
5274
 
5275
  ggml_vk_test_matmul<ggml_fp16_t, ggml_fp16_t>(ctx, 512, 512, 100, 32, 100, 1, 2);
5276
 
5277
  ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 1, 0);
5278
  ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 1, 1);
5279
  ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 1, 2);
5280
+ // ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 4, 0);
5281
+ // ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 4, 1);
5282
+ // ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 4, 2);
5283
 
5284
  ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q4_0);
5285
  ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q4_0);
5286
  ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q4_0);
5287
+ // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q4_0);
5288
+ // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q4_0);
5289
+ // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q4_0);
5290
 
5291
  ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q4_1);
5292
  ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q4_1);
5293
  ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q4_1);
5294
+ // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q4_1);
5295
+ // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q4_1);
5296
+ // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q4_1);
5297
 
5298
  ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q5_0);
5299
  ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q5_0);
5300
  ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q5_0);
5301
+ // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q5_0);
5302
+ // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q5_0);
5303
+ // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q5_0);
5304
 
5305
  ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q5_1);
5306
  ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q5_1);
5307
  ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q5_1);
5308
+ // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q5_1);
5309
+ // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q5_1);
5310
+ // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q5_1);
5311
 
5312
  ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q8_0);
5313
  ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q8_0);
5314
  ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q8_0);
5315
+ // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q8_0);
5316
+ // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q8_0);
5317
+ // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q8_0);
5318
 
5319
  ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q2_K);
5320
  ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q2_K);
5321
  ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q2_K);
5322
+ // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q2_K);
5323
+ // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q2_K);
5324
+ // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q2_K);
5325
 
5326
  ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q3_K);
5327
  ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q3_K);
5328
  ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q3_K);
5329
+ // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q3_K);
5330
+ // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q3_K);
5331
+ // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q3_K);
5332
 
5333
  ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q4_K);
5334
  ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q4_K);
5335
  ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q4_K);
5336
+ // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q4_K);
5337
+ // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q4_K);
5338
+ // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q4_K);
5339
 
5340
  ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q5_K);
5341
  ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q5_K);
5342
  ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q5_K);
5343
+ // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q5_K);
5344
+ // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q5_K);
5345
+ // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q5_K);
5346
 
5347
  ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q6_K);
5348
  ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q6_K);
5349
  ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q6_K);
5350
+ // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q6_K);
5351
+ // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q6_K);
5352
+ // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q6_K);
5353
+
5354
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_IQ4_NL);
5355
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_IQ4_NL);
5356
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_IQ4_NL);
5357
 
5358
  std::cerr << std::endl;
5359
 
 
5385
  ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 0);
5386
  ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 1);
5387
  ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 2);
5388
+ // ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 0);
5389
+ // ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 1);
5390
+ // ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 2);
5391
  std::cerr << std::endl;
5392
  }
5393
 
 
6219
  case GGML_TYPE_Q4_K:
6220
  case GGML_TYPE_Q5_K:
6221
  case GGML_TYPE_Q6_K:
6222
+ case GGML_TYPE_IQ4_NL:
6223
  break;
6224
  default:
6225
  return false;
 
6248
  case GGML_TYPE_Q5_0:
6249
  case GGML_TYPE_Q5_1:
6250
  case GGML_TYPE_Q8_0:
6251
+ case GGML_TYPE_IQ4_NL:
6252
  return true;
6253
  default:
6254
  return false;