Spaces:
Running
Running
Commit ·
899145d
1
Parent(s): 1d65fea
Vulkan IQ4_NL Support (llama/8613)
Browse files
* Fix Vulkan matmul tests compile errors
* Add Vulkan IQ4_NL support
* Fix Vulkan DeepSeek-Coder-V2-Lite MoE support
- ggml/src/ggml-vulkan.cpp +140 -182
ggml/src/ggml-vulkan.cpp
CHANGED
|
@@ -38,8 +38,6 @@
|
|
| 38 |
#define VK_DEVICE_DESCRIPTOR_POOL_MODE_MULTI 1
|
| 39 |
#define VK_DEVICE_DESCRIPTOR_POOL_MODE_SINGLE 2
|
| 40 |
|
| 41 |
-
#define VK_NUM_TYPES 16
|
| 42 |
-
|
| 43 |
#define GGML_VK_MAX_NODES 8192
|
| 44 |
|
| 45 |
#define MAX_VK_BUFFERS 256
|
|
@@ -162,23 +160,23 @@ struct vk_device_struct {
|
|
| 162 |
vk_matmul_pipeline pipeline_matmul_f16_f32;
|
| 163 |
vk_pipeline pipeline_matmul_split_k_reduce;
|
| 164 |
|
| 165 |
-
vk_matmul_pipeline pipeline_dequant_mul_mat_mat[
|
| 166 |
|
| 167 |
vk_matmul_pipeline pipeline_matmul_id_f32;
|
| 168 |
vk_matmul_pipeline pipeline_matmul_id_f16;
|
| 169 |
vk_matmul_pipeline pipeline_matmul_id_f16_f32;
|
| 170 |
|
| 171 |
-
vk_matmul_pipeline pipeline_dequant_mul_mat_mat_id[
|
| 172 |
|
| 173 |
-
vk_pipeline pipeline_dequant[
|
| 174 |
-
vk_pipeline pipeline_dequant_mul_mat_vec_f32_f32[
|
| 175 |
-
vk_pipeline pipeline_dequant_mul_mat_vec_f16_f32[
|
| 176 |
-
vk_pipeline pipeline_dequant_mul_mat_vec_id_f32[
|
| 177 |
|
| 178 |
vk_pipeline pipeline_mul_mat_vec_p021_f16_f32;
|
| 179 |
vk_pipeline pipeline_mul_mat_vec_nc_f16_f32;
|
| 180 |
-
vk_pipeline pipeline_get_rows[
|
| 181 |
-
vk_pipeline pipeline_get_rows_f32[
|
| 182 |
vk_pipeline pipeline_mul_f32;
|
| 183 |
vk_pipeline pipeline_div_f32;
|
| 184 |
vk_pipeline pipeline_add_f32;
|
|
@@ -1059,25 +1057,6 @@ static void ggml_vk_wait_events(vk_context * ctx, std::vector<vk::Event>&& event
|
|
| 1059 |
);
|
| 1060 |
}
|
| 1061 |
|
| 1062 |
-
static bool ggml_vk_build_shader(ggml_type type) {
|
| 1063 |
-
switch(type) {
|
| 1064 |
-
case GGML_TYPE_F16:
|
| 1065 |
-
case GGML_TYPE_Q4_0:
|
| 1066 |
-
case GGML_TYPE_Q4_1:
|
| 1067 |
-
case GGML_TYPE_Q5_0:
|
| 1068 |
-
case GGML_TYPE_Q5_1:
|
| 1069 |
-
case GGML_TYPE_Q8_0:
|
| 1070 |
-
case GGML_TYPE_Q2_K:
|
| 1071 |
-
case GGML_TYPE_Q3_K:
|
| 1072 |
-
case GGML_TYPE_Q4_K:
|
| 1073 |
-
case GGML_TYPE_Q5_K:
|
| 1074 |
-
case GGML_TYPE_Q6_K:
|
| 1075 |
-
return true;
|
| 1076 |
-
default:
|
| 1077 |
-
return false;
|
| 1078 |
-
}
|
| 1079 |
-
}
|
| 1080 |
-
|
| 1081 |
static void ggml_vk_load_shaders(vk_device& device) {
|
| 1082 |
VK_LOG_DEBUG("ggml_vk_load_shaders(" << device->name << ")");
|
| 1083 |
|
|
@@ -1112,6 +1091,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|
| 1112 |
device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K] = std::make_shared<vk_matmul_pipeline_struct>();
|
| 1113 |
device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K] = std::make_shared<vk_matmul_pipeline_struct>();
|
| 1114 |
device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K] = std::make_shared<vk_matmul_pipeline_struct>();
|
|
|
|
| 1115 |
|
| 1116 |
device->pipeline_matmul_id_f32 = std::make_shared<vk_matmul_pipeline_struct>();
|
| 1117 |
device->pipeline_matmul_id_f16_f32 = std::make_shared<vk_matmul_pipeline_struct>();
|
|
@@ -1126,6 +1106,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|
| 1126 |
device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K] = std::make_shared<vk_matmul_pipeline_struct>();
|
| 1127 |
device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K] = std::make_shared<vk_matmul_pipeline_struct>();
|
| 1128 |
device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K] = std::make_shared<vk_matmul_pipeline_struct>();
|
|
|
|
| 1129 |
|
| 1130 |
if (device->fp16) {
|
| 1131 |
ggml_vk_create_pipeline(device, device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_f32_len, matmul_f32_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
|
|
@@ -1226,6 +1207,13 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|
| 1226 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_m, "matmul_q6_k_f32_aligned_m", matmul_q6_k_f32_aligned_len, matmul_q6_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
|
| 1227 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_s, "matmul_q6_k_f32_aligned_s", matmul_q6_k_f32_aligned_len, matmul_q6_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
|
| 1228 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1229 |
ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f32->l, "matmul_id_f32_l", matmul_id_f32_f32_len, matmul_id_f32_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1);
|
| 1230 |
ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f32->m, "matmul_id_f32_m", matmul_id_f32_f32_len, matmul_id_f32_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1);
|
| 1231 |
ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f32->s, "matmul_id_f32_s", matmul_id_f32_f32_len, matmul_id_f32_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, 1);
|
|
@@ -1316,6 +1304,13 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|
| 1316 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_l, "matmul_id_q6_k_f32_aligned_l", matmul_id_q6_k_f32_aligned_len, matmul_id_q6_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
|
| 1317 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_m, "matmul_id_q6_k_f32_aligned_m", matmul_id_q6_k_f32_aligned_len, matmul_id_q6_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
|
| 1318 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_s, "matmul_id_q6_k_f32_aligned_s", matmul_id_q6_k_f32_aligned_len, matmul_id_q6_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1319 |
} else {
|
| 1320 |
ggml_vk_create_pipeline(device, device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_f32_fp32_len, matmul_f32_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
|
| 1321 |
ggml_vk_create_pipeline(device, device->pipeline_matmul_f32->m, "matmul_f32_m", matmul_f32_f32_fp32_len, matmul_f32_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
|
|
@@ -1415,6 +1410,13 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|
| 1415 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_m, "matmul_q6_k_f32_aligned_m", matmul_q6_k_f32_aligned_fp32_len, matmul_q6_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
|
| 1416 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_s, "matmul_q6_k_f32_aligned_s", matmul_q6_k_f32_aligned_fp32_len, matmul_q6_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
|
| 1417 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1418 |
ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f32->l, "matmul_id_f32_l", matmul_id_f32_f32_fp32_len, matmul_id_f32_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1);
|
| 1419 |
ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f32->m, "matmul_id_f32_m", matmul_id_f32_f32_fp32_len, matmul_id_f32_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1);
|
| 1420 |
ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f32->s, "matmul_id_f32_s", matmul_id_f32_f32_fp32_len, matmul_id_f32_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, 1);
|
|
@@ -1505,6 +1507,13 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|
| 1505 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_l, "matmul_id_q6_k_f32_aligned_l", matmul_id_q6_k_f32_aligned_fp32_len, matmul_id_q6_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
|
| 1506 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_m, "matmul_id_q6_k_f32_aligned_m", matmul_id_q6_k_f32_aligned_fp32_len, matmul_id_q6_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
|
| 1507 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_s, "matmul_id_q6_k_f32_aligned_s", matmul_id_q6_k_f32_aligned_fp32_len, matmul_id_q6_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1508 |
}
|
| 1509 |
|
| 1510 |
// mul mat vec
|
|
@@ -1520,6 +1529,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|
| 1520 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f32_f32", mul_mat_vec_q4_k_f32_f32_len, mul_mat_vec_q4_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
| 1521 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f32_f32", mul_mat_vec_q5_k_f32_f32_len, mul_mat_vec_q5_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
| 1522 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f32_f32", mul_mat_vec_q6_k_f32_f32_len, mul_mat_vec_q6_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
|
|
|
| 1523 |
|
| 1524 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F32 ], "mul_mat_vec_f32_f16_f32", mul_mat_vec_f32_f16_f32_len, mul_mat_vec_f32_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
| 1525 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f16_f32", mul_mat_vec_f16_f16_f32_len, mul_mat_vec_f16_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
|
@@ -1533,6 +1543,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|
| 1533 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f16_f32", mul_mat_vec_q4_k_f16_f32_len, mul_mat_vec_q4_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
| 1534 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f16_f32", mul_mat_vec_q5_k_f16_f32_len, mul_mat_vec_q5_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
| 1535 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f16_f32", mul_mat_vec_q6_k_f16_f32_len, mul_mat_vec_q6_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
|
|
|
| 1536 |
|
| 1537 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F32 ], "mul_mat_vec_id_f32_f32", mul_mat_vec_id_f32_f32_len, mul_mat_vec_id_f32_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
| 1538 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F16 ], "mul_mat_vec_id_f16_f32", mul_mat_vec_id_f16_f32_len, mul_mat_vec_id_f16_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
|
@@ -1546,6 +1557,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|
| 1546 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_f32", mul_mat_vec_id_q4_k_f32_len, mul_mat_vec_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
| 1547 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_f32", mul_mat_vec_id_q5_k_f32_len, mul_mat_vec_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
| 1548 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_f32", mul_mat_vec_id_q6_k_f32_len, mul_mat_vec_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
|
|
|
| 1549 |
|
| 1550 |
// dequant shaders
|
| 1551 |
ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_F32 ], "f32_to_f16", dequant_f32_len, dequant_f32_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
|
|
@@ -1559,6 +1571,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|
| 1559 |
ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q4_K], "dequant_q4_k", dequant_q4_k_len, dequant_q4_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1);
|
| 1560 |
ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q5_K], "dequant_q5_k", dequant_q5_k_len, dequant_q5_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
|
| 1561 |
ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q6_K], "dequant_q6_k", dequant_q6_k_len, dequant_q6_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
|
|
|
|
| 1562 |
|
| 1563 |
// get_rows
|
| 1564 |
ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_F32 ], "get_rows_f32", get_rows_f32_len, get_rows_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
|
|
@@ -1568,6 +1581,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|
| 1568 |
ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q5_0], "get_rows_q5_0", get_rows_q5_0_len, get_rows_q5_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
| 1569 |
ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q5_1], "get_rows_q5_1", get_rows_q5_1_len, get_rows_q5_1_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
| 1570 |
ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q8_0], "get_rows_q8_0", get_rows_q8_0_len, get_rows_q8_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
|
|
|
| 1571 |
|
| 1572 |
ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_F32 ], "get_rows_f32_f32", get_rows_f32_f32_len, get_rows_f32_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
|
| 1573 |
ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_F16 ], "get_rows_f16_f32", get_rows_f16_f32_len, get_rows_f16_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
|
|
@@ -1576,6 +1590,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|
| 1576 |
ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q5_0], "get_rows_q5_0_f32", get_rows_q5_0_f32_len, get_rows_q5_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
| 1577 |
ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q5_1], "get_rows_q5_1_f32", get_rows_q5_1_f32_len, get_rows_q5_1_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
| 1578 |
ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q8_0], "get_rows_q8_0_f32", get_rows_q8_0_f32_len, get_rows_q8_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
|
|
|
| 1579 |
|
| 1580 |
ggml_vk_create_pipeline(device, device->pipeline_matmul_split_k_reduce, "split_k_reduce", split_k_reduce_len, split_k_reduce_data, "main", 2, 2 * sizeof(uint32_t), {256, 1, 1}, {}, 1);
|
| 1581 |
|
|
@@ -2087,6 +2102,7 @@ static vk_pipeline ggml_vk_get_to_fp16(ggml_backend_vk_context * ctx, ggml_type
|
|
| 2087 |
case GGML_TYPE_Q4_K:
|
| 2088 |
case GGML_TYPE_Q5_K:
|
| 2089 |
case GGML_TYPE_Q6_K:
|
|
|
|
| 2090 |
break;
|
| 2091 |
default:
|
| 2092 |
return nullptr;
|
|
@@ -2123,6 +2139,7 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_conte
|
|
| 2123 |
case GGML_TYPE_Q4_K:
|
| 2124 |
case GGML_TYPE_Q5_K:
|
| 2125 |
case GGML_TYPE_Q6_K:
|
|
|
|
| 2126 |
break;
|
| 2127 |
default:
|
| 2128 |
return nullptr;
|
|
@@ -2148,6 +2165,7 @@ static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context *
|
|
| 2148 |
case GGML_TYPE_Q4_K:
|
| 2149 |
case GGML_TYPE_Q5_K:
|
| 2150 |
case GGML_TYPE_Q6_K:
|
|
|
|
| 2151 |
break;
|
| 2152 |
default:
|
| 2153 |
return nullptr;
|
|
@@ -2181,6 +2199,7 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_id_pipeline(ggml_backend_vk_co
|
|
| 2181 |
case GGML_TYPE_Q4_K:
|
| 2182 |
case GGML_TYPE_Q5_K:
|
| 2183 |
case GGML_TYPE_Q6_K:
|
|
|
|
| 2184 |
break;
|
| 2185 |
default:
|
| 2186 |
return nullptr;
|
|
@@ -2206,6 +2225,7 @@ static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec_id(ggml_backend_vk_context
|
|
| 2206 |
case GGML_TYPE_Q4_K:
|
| 2207 |
case GGML_TYPE_Q5_K:
|
| 2208 |
case GGML_TYPE_Q6_K:
|
|
|
|
| 2209 |
break;
|
| 2210 |
default:
|
| 2211 |
return nullptr;
|
|
@@ -3431,7 +3451,7 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context *
|
|
| 3431 |
|
| 3432 |
const uint64_t nei0 = ids->ne[0];
|
| 3433 |
const uint64_t nei1 = ids->ne[1];
|
| 3434 |
-
GGML_ASSERT(nei0 * nei1 <=
|
| 3435 |
|
| 3436 |
const uint32_t nbi1 = ids->nb[1];
|
| 3437 |
const uint32_t nbi2 = ids->nb[2];
|
|
@@ -3443,8 +3463,6 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context *
|
|
| 3443 |
|
| 3444 |
const uint64_t n_as = ne02;
|
| 3445 |
|
| 3446 |
-
GGML_ASSERT(n_as <= 8);
|
| 3447 |
-
|
| 3448 |
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
|
| 3449 |
ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
|
| 3450 |
ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra;
|
|
@@ -4623,22 +4641,22 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
|
|
| 4623 |
}
|
| 4624 |
}
|
| 4625 |
|
| 4626 |
-
ggml_pipeline_allocate_descriptor_sets(ctx, p, num_it);
|
| 4627 |
if (split_k > 1) {
|
| 4628 |
-
ggml_pipeline_allocate_descriptor_sets(ctx, ctx->device->pipeline_matmul_split_k_reduce, num_it);
|
| 4629 |
|
| 4630 |
if (ctx->prealloc_split_k == nullptr || ctx->prealloc_split_k->size < sizeof(float) * d_ne * split_k) {
|
| 4631 |
// Resize buffer
|
| 4632 |
if (ctx->prealloc_split_k != nullptr) {
|
| 4633 |
ggml_vk_destroy_buffer(ctx->prealloc_split_k);
|
| 4634 |
}
|
| 4635 |
-
ctx->prealloc_split_k = ggml_vk_create_buffer_check(ctx, sizeof(float) * d_ne * split_k, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
| 4636 |
}
|
| 4637 |
}
|
| 4638 |
|
| 4639 |
-
vk_buffer d_X = ggml_vk_create_buffer_check(ctx, sizeof(X_TYPE) * x_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
| 4640 |
-
vk_buffer d_Y = ggml_vk_create_buffer_check(ctx, sizeof(Y_TYPE) * y_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
| 4641 |
-
vk_buffer d_D = ggml_vk_create_buffer_check(ctx, sizeof(float) * d_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
| 4642 |
|
| 4643 |
X_TYPE* x = (X_TYPE *) malloc(sizeof(X_TYPE) * x_ne);
|
| 4644 |
Y_TYPE* y = (Y_TYPE *) malloc(sizeof(Y_TYPE) * y_ne);
|
|
@@ -4665,12 +4683,12 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
|
|
| 4665 |
}
|
| 4666 |
}
|
| 4667 |
|
| 4668 |
-
ggml_vk_buffer_write(
|
| 4669 |
-
ggml_vk_buffer_write(
|
| 4670 |
|
| 4671 |
vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
|
| 4672 |
for (size_t i = 0; i < num_it; i++) {
|
| 4673 |
-
ggml_vk_ctx_begin(ctx, subctx);
|
| 4674 |
ggml_vk_matmul(
|
| 4675 |
ctx, subctx, p, ggml_vk_subbuffer(d_X), ggml_vk_subbuffer(d_Y), ggml_vk_subbuffer(d_D), ggml_vk_subbuffer(ctx->prealloc_split_k),
|
| 4676 |
m, n, k,
|
|
@@ -4689,7 +4707,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
|
|
| 4689 |
double time = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0;
|
| 4690 |
|
| 4691 |
// copy dst to host
|
| 4692 |
-
ggml_vk_buffer_read(
|
| 4693 |
|
| 4694 |
float * d_chk = (float *) malloc(sizeof(float) * d_ne);
|
| 4695 |
|
|
@@ -4765,7 +4783,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
|
|
| 4765 |
|
| 4766 |
if (split_k > 1) {
|
| 4767 |
float * split_k_buf = (float *) malloc(sizeof(float) * d_ne * split_k);
|
| 4768 |
-
ggml_vk_buffer_read(ctx
|
| 4769 |
|
| 4770 |
std::cerr << "d_buf0: " << std::endl << std::endl;
|
| 4771 |
ggml_vk_print_matrix_area(split_k_buf, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b);
|
|
@@ -4785,8 +4803,8 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
|
|
| 4785 |
|
| 4786 |
free(d_chk);
|
| 4787 |
|
| 4788 |
-
ggml_vk_queue_cleanup(ctx, ctx->device->transfer_queue);
|
| 4789 |
-
ggml_vk_queue_cleanup(ctx, ctx->device->compute_queue);
|
| 4790 |
|
| 4791 |
ggml_vk_destroy_buffer(d_X);
|
| 4792 |
ggml_vk_destroy_buffer(d_Y);
|
|
@@ -4834,88 +4852,21 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, int i0, int i1
|
|
| 4834 |
}
|
| 4835 |
}
|
| 4836 |
|
| 4837 |
-
static void
|
| 4838 |
-
|
| 4839 |
-
|
| 4840 |
-
vk_buffer buffer = ggml_vk_create_buffer_check(ctx, sizeof(float) * ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
| 4841 |
-
|
| 4842 |
-
float * x;
|
| 4843 |
-
float * y;
|
| 4844 |
-
if (pinned) {
|
| 4845 |
-
x = (float *) ggml_vk_host_malloc(ctx, sizeof(float) * ne);
|
| 4846 |
-
y = (float *) ggml_vk_host_malloc(ctx, sizeof(float) * ne);
|
| 4847 |
-
} else {
|
| 4848 |
-
x = (float *) malloc(sizeof(float) * ne);
|
| 4849 |
-
y = (float *) malloc(sizeof(float) * ne);
|
| 4850 |
-
}
|
| 4851 |
-
|
| 4852 |
-
for (size_t i = 0; i < ne; i++) {
|
| 4853 |
-
x[i] = rand() / (float)RAND_MAX;
|
| 4854 |
-
}
|
| 4855 |
-
|
| 4856 |
-
vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
|
| 4857 |
-
ggml_vk_ctx_begin(ctx, subctx);
|
| 4858 |
-
|
| 4859 |
-
auto begin = std::chrono::high_resolution_clock::now();
|
| 4860 |
-
|
| 4861 |
-
ggml_vk_buffer_write_async(ctx, subctx, buffer, 0, x, sizeof(float) * ne);
|
| 4862 |
-
|
| 4863 |
-
for (auto& cpy : subctx->in_memcpys) {
|
| 4864 |
-
memcpy(cpy.dst, cpy.src, cpy.n);
|
| 4865 |
-
}
|
| 4866 |
-
subctx->in_memcpys.clear();
|
| 4867 |
-
|
| 4868 |
-
ggml_vk_ctx_end(subctx);
|
| 4869 |
-
ggml_vk_submit(subctx, ctx->fence);
|
| 4870 |
-
VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_transfer waitForFences");
|
| 4871 |
-
ctx->device->device.resetFences({ ctx->fence });
|
| 4872 |
-
|
| 4873 |
-
auto end = std::chrono::high_resolution_clock::now();
|
| 4874 |
-
|
| 4875 |
-
double ms_to_gpu = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0;
|
| 4876 |
-
|
| 4877 |
-
ggml_vk_ctx_begin(ctx, subctx);
|
| 4878 |
-
|
| 4879 |
-
begin = std::chrono::high_resolution_clock::now();
|
| 4880 |
-
|
| 4881 |
-
ggml_vk_buffer_read_async(ctx, subctx, buffer, 0, y, sizeof(float) * ne);
|
| 4882 |
-
|
| 4883 |
-
ggml_vk_ctx_end(subctx);
|
| 4884 |
-
ggml_vk_submit(subctx, ctx->fence);
|
| 4885 |
-
VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_transfer waitForFences");
|
| 4886 |
-
ctx->device->device.resetFences({ ctx->fence });
|
| 4887 |
-
|
| 4888 |
-
for (auto& cpy : subctx->out_memcpys) {
|
| 4889 |
-
memcpy(cpy.dst, cpy.src, cpy.n);
|
| 4890 |
-
}
|
| 4891 |
-
subctx->out_memcpys.clear();
|
| 4892 |
-
|
| 4893 |
-
end = std::chrono::high_resolution_clock::now();
|
| 4894 |
-
|
| 4895 |
-
double ms_from_gpu = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0;
|
| 4896 |
|
| 4897 |
-
|
| 4898 |
-
|
| 4899 |
-
|
|
|
|
| 4900 |
}
|
| 4901 |
|
| 4902 |
-
|
| 4903 |
-
|
| 4904 |
-
std::cerr << "TEST TRANSFER " << kb << " KB to_gpu " << ms_to_gpu << "ms (" << kb / ms_to_gpu * 1000.0 / 1024.0 << " MB/s) from_gpu " << ms_from_gpu << "ms (" << kb / ms_from_gpu * 1000.0 / 1024.0 << " MB/s) avg_err=" << avg_err / ne << std::endl;
|
| 4905 |
-
|
| 4906 |
-
ggml_vk_destroy_buffer(buffer);
|
| 4907 |
|
| 4908 |
-
|
| 4909 |
-
ggml_vk_host_free(ctx, x);
|
| 4910 |
-
ggml_vk_host_free(ctx, y);
|
| 4911 |
-
} else {
|
| 4912 |
-
free(x);
|
| 4913 |
-
free(y);
|
| 4914 |
-
}
|
| 4915 |
-
}
|
| 4916 |
|
| 4917 |
-
|
| 4918 |
-
ggml_quantize_chunk(quant, from, to, 0, 1, ne, nullptr);
|
| 4919 |
}
|
| 4920 |
|
| 4921 |
static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_type quant) {
|
|
@@ -4925,24 +4876,26 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_
|
|
| 4925 |
const size_t qx_sz = ne * ggml_type_size(quant)/ggml_blck_size(quant);
|
| 4926 |
float * x = (float *) malloc(x_sz);
|
| 4927 |
void * qx = malloc(qx_sz);
|
| 4928 |
-
vk_buffer qx_buf = ggml_vk_create_buffer_check(ctx, qx_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
| 4929 |
-
vk_buffer x_buf = ggml_vk_create_buffer_check(ctx, x_sz_f16, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
|
|
|
| 4930 |
ggml_fp16_t * x_chk = (ggml_fp16_t *) malloc(x_sz_f16);
|
| 4931 |
|
| 4932 |
for (size_t i = 0; i < ne; i++) {
|
| 4933 |
x[i] = rand() / (float)RAND_MAX;
|
| 4934 |
}
|
| 4935 |
|
| 4936 |
-
vk_pipeline p = ctx
|
| 4937 |
|
| 4938 |
ggml_vk_quantize_data(x, qx, ne, quant);
|
|
|
|
| 4939 |
|
| 4940 |
-
ggml_pipeline_allocate_descriptor_sets(ctx, p, 1);
|
| 4941 |
|
| 4942 |
-
ggml_vk_buffer_write(
|
| 4943 |
|
| 4944 |
vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
|
| 4945 |
-
ggml_vk_ctx_begin(ctx, subctx);
|
| 4946 |
const std::vector<uint32_t> pc = { 1, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne };
|
| 4947 |
ggml_vk_dispatch_pipeline(ctx, subctx, p, { { qx_buf, 0, qx_sz }, { x_buf, 0, x_sz_f16 } }, pc.size() * sizeof(int), pc.data(), { (uint32_t)ne, 1, 1});
|
| 4948 |
ggml_vk_ctx_end(subctx);
|
|
@@ -4956,13 +4909,13 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_
|
|
| 4956 |
auto end = std::chrono::high_resolution_clock::now();
|
| 4957 |
|
| 4958 |
double ms_dequant = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0;
|
| 4959 |
-
ggml_vk_buffer_read(
|
| 4960 |
|
| 4961 |
int first_err = -1;
|
| 4962 |
|
| 4963 |
double avg_err = 0.0;
|
| 4964 |
for (size_t i = 0; i < ne; i++) {
|
| 4965 |
-
double error = std::fabs(
|
| 4966 |
avg_err += error;
|
| 4967 |
|
| 4968 |
if (first_err < 0 && error > 0.05) {
|
|
@@ -4982,7 +4935,7 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_
|
|
| 4982 |
}
|
| 4983 |
std::cerr << std::endl << "Expected result: " << std::endl << std::endl;
|
| 4984 |
for (int i = std::max(0, first_err - 5); i < std::min((int)ne, first_err + 5); i++) {
|
| 4985 |
-
std::cerr <<
|
| 4986 |
}
|
| 4987 |
std::cerr << std::endl;
|
| 4988 |
}
|
|
@@ -4992,6 +4945,7 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_
|
|
| 4992 |
|
| 4993 |
free(x);
|
| 4994 |
free(qx);
|
|
|
|
| 4995 |
free(x_chk);
|
| 4996 |
}
|
| 4997 |
|
|
@@ -5040,9 +4994,9 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
|
|
| 5040 |
float * x = (float *) malloc(x_sz);
|
| 5041 |
float * y = (float *) malloc(y_sz);
|
| 5042 |
void * qx = malloc(qx_sz);
|
| 5043 |
-
vk_buffer qx_buf = ggml_vk_create_buffer_check(ctx, qx_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
| 5044 |
-
vk_buffer y_buf = ggml_vk_create_buffer_check(ctx, y_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
| 5045 |
-
vk_buffer d_buf = ggml_vk_create_buffer_check(ctx, d_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
| 5046 |
float * d = (float *) malloc(d_sz);
|
| 5047 |
float * d_chk = (float *) malloc(d_sz);
|
| 5048 |
|
|
@@ -5057,25 +5011,25 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
|
|
| 5057 |
y[i] = (i % k == i / k) ? 1.0f : 0.0f;
|
| 5058 |
}
|
| 5059 |
|
| 5060 |
-
ggml_pipeline_allocate_descriptor_sets(ctx, p, num_it);
|
| 5061 |
if (split_k > 1) {
|
| 5062 |
-
ggml_pipeline_allocate_descriptor_sets(ctx, ctx->device->pipeline_matmul_split_k_reduce, num_it);
|
| 5063 |
|
| 5064 |
if (ctx->prealloc_split_k == nullptr || ctx->prealloc_split_k->size < sizeof(float) * d_ne * split_k) {
|
| 5065 |
// Resize buffer
|
| 5066 |
if (ctx->prealloc_split_k != nullptr) {
|
| 5067 |
ggml_vk_destroy_buffer(ctx->prealloc_split_k);
|
| 5068 |
}
|
| 5069 |
-
ctx->prealloc_split_k = ggml_vk_create_buffer_check(ctx, sizeof(float) * d_ne * split_k, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
| 5070 |
}
|
| 5071 |
}
|
| 5072 |
|
| 5073 |
-
ggml_vk_buffer_write(
|
| 5074 |
-
ggml_vk_buffer_write(
|
| 5075 |
|
| 5076 |
vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
|
| 5077 |
for (size_t i = 0; i < num_it; i++) {
|
| 5078 |
-
ggml_vk_ctx_begin(ctx, subctx);
|
| 5079 |
ggml_vk_matmul(
|
| 5080 |
ctx, subctx, p, ggml_vk_subbuffer(qx_buf), ggml_vk_subbuffer(y_buf), ggml_vk_subbuffer(d_buf), ggml_vk_subbuffer(ctx->prealloc_split_k),
|
| 5081 |
m, n, k,
|
|
@@ -5094,7 +5048,7 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
|
|
| 5094 |
auto end = std::chrono::high_resolution_clock::now();
|
| 5095 |
|
| 5096 |
double time_ms = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0;
|
| 5097 |
-
ggml_vk_buffer_read(
|
| 5098 |
|
| 5099 |
ggml_init_params iparams = {
|
| 5100 |
/*.mem_size =*/ 1024*1024*1024,
|
|
@@ -5149,7 +5103,7 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
|
|
| 5149 |
|
| 5150 |
if (split_k > 1) {
|
| 5151 |
float * split_k_buf = (float *) malloc(sizeof(float) * d_ne * split_k);
|
| 5152 |
-
ggml_vk_buffer_read(ctx
|
| 5153 |
|
| 5154 |
std::cerr << "d_buf0: " << std::endl << std::endl;
|
| 5155 |
ggml_vk_print_matrix_area(split_k_buf, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b);
|
|
@@ -5302,12 +5256,9 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
|
|
| 5302 |
|
| 5303 |
static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
|
| 5304 |
#if defined(GGML_VULKAN_RUN_TESTS)
|
| 5305 |
-
ctx->staging = ggml_vk_create_buffer_check(ctx, 100ul * 1024ul * 1024ul,
|
| 5306 |
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
|
| 5307 |
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
|
| 5308 |
-
ggml_vk_test_transfer(ctx, 8192 * 1000, false);
|
| 5309 |
-
ggml_vk_test_transfer(ctx, 8192 * 1000, true);
|
| 5310 |
-
|
| 5311 |
ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_F32);
|
| 5312 |
ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q4_0);
|
| 5313 |
ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q4_1);
|
|
@@ -5319,85 +5270,90 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
|
|
| 5319 |
ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q4_K);
|
| 5320 |
ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q5_K);
|
| 5321 |
ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q6_K);
|
|
|
|
| 5322 |
|
| 5323 |
ggml_vk_test_matmul<ggml_fp16_t, ggml_fp16_t>(ctx, 512, 512, 100, 32, 100, 1, 2);
|
| 5324 |
|
| 5325 |
ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 1, 0);
|
| 5326 |
ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 1, 1);
|
| 5327 |
ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 1, 2);
|
| 5328 |
-
ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 4, 0);
|
| 5329 |
-
ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 4, 1);
|
| 5330 |
-
ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 4, 2);
|
| 5331 |
|
| 5332 |
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q4_0);
|
| 5333 |
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q4_0);
|
| 5334 |
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q4_0);
|
| 5335 |
-
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q4_0);
|
| 5336 |
-
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q4_0);
|
| 5337 |
-
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q4_0);
|
| 5338 |
|
| 5339 |
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q4_1);
|
| 5340 |
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q4_1);
|
| 5341 |
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q4_1);
|
| 5342 |
-
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q4_1);
|
| 5343 |
-
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q4_1);
|
| 5344 |
-
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q4_1);
|
| 5345 |
|
| 5346 |
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q5_0);
|
| 5347 |
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q5_0);
|
| 5348 |
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q5_0);
|
| 5349 |
-
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q5_0);
|
| 5350 |
-
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q5_0);
|
| 5351 |
-
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q5_0);
|
| 5352 |
|
| 5353 |
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q5_1);
|
| 5354 |
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q5_1);
|
| 5355 |
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q5_1);
|
| 5356 |
-
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q5_1);
|
| 5357 |
-
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q5_1);
|
| 5358 |
-
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q5_1);
|
| 5359 |
|
| 5360 |
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q8_0);
|
| 5361 |
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q8_0);
|
| 5362 |
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q8_0);
|
| 5363 |
-
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q8_0);
|
| 5364 |
-
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q8_0);
|
| 5365 |
-
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q8_0);
|
| 5366 |
|
| 5367 |
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q2_K);
|
| 5368 |
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q2_K);
|
| 5369 |
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q2_K);
|
| 5370 |
-
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q2_K);
|
| 5371 |
-
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q2_K);
|
| 5372 |
-
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q2_K);
|
| 5373 |
|
| 5374 |
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q3_K);
|
| 5375 |
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q3_K);
|
| 5376 |
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q3_K);
|
| 5377 |
-
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q3_K);
|
| 5378 |
-
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q3_K);
|
| 5379 |
-
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q3_K);
|
| 5380 |
|
| 5381 |
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q4_K);
|
| 5382 |
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q4_K);
|
| 5383 |
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q4_K);
|
| 5384 |
-
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q4_K);
|
| 5385 |
-
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q4_K);
|
| 5386 |
-
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q4_K);
|
| 5387 |
|
| 5388 |
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q5_K);
|
| 5389 |
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q5_K);
|
| 5390 |
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q5_K);
|
| 5391 |
-
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q5_K);
|
| 5392 |
-
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q5_K);
|
| 5393 |
-
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q5_K);
|
| 5394 |
|
| 5395 |
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q6_K);
|
| 5396 |
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q6_K);
|
| 5397 |
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q6_K);
|
| 5398 |
-
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q6_K);
|
| 5399 |
-
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q6_K);
|
| 5400 |
-
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q6_K);
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5401 |
|
| 5402 |
std::cerr << std::endl;
|
| 5403 |
|
|
@@ -5429,9 +5385,9 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
|
|
| 5429 |
ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 0);
|
| 5430 |
ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 1);
|
| 5431 |
ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 2);
|
| 5432 |
-
ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 0);
|
| 5433 |
-
ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 1);
|
| 5434 |
-
ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 2);
|
| 5435 |
std::cerr << std::endl;
|
| 5436 |
}
|
| 5437 |
|
|
@@ -6263,6 +6219,7 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
|
|
| 6263 |
case GGML_TYPE_Q4_K:
|
| 6264 |
case GGML_TYPE_Q5_K:
|
| 6265 |
case GGML_TYPE_Q6_K:
|
|
|
|
| 6266 |
break;
|
| 6267 |
default:
|
| 6268 |
return false;
|
|
@@ -6291,6 +6248,7 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
|
|
| 6291 |
case GGML_TYPE_Q5_0:
|
| 6292 |
case GGML_TYPE_Q5_1:
|
| 6293 |
case GGML_TYPE_Q8_0:
|
|
|
|
| 6294 |
return true;
|
| 6295 |
default:
|
| 6296 |
return false;
|
|
|
|
| 38 |
#define VK_DEVICE_DESCRIPTOR_POOL_MODE_MULTI 1
|
| 39 |
#define VK_DEVICE_DESCRIPTOR_POOL_MODE_SINGLE 2
|
| 40 |
|
|
|
|
|
|
|
| 41 |
#define GGML_VK_MAX_NODES 8192
|
| 42 |
|
| 43 |
#define MAX_VK_BUFFERS 256
|
|
|
|
| 160 |
vk_matmul_pipeline pipeline_matmul_f16_f32;
|
| 161 |
vk_pipeline pipeline_matmul_split_k_reduce;
|
| 162 |
|
| 163 |
+
vk_matmul_pipeline pipeline_dequant_mul_mat_mat[GGML_TYPE_COUNT];
|
| 164 |
|
| 165 |
vk_matmul_pipeline pipeline_matmul_id_f32;
|
| 166 |
vk_matmul_pipeline pipeline_matmul_id_f16;
|
| 167 |
vk_matmul_pipeline pipeline_matmul_id_f16_f32;
|
| 168 |
|
| 169 |
+
vk_matmul_pipeline pipeline_dequant_mul_mat_mat_id[GGML_TYPE_COUNT];
|
| 170 |
|
| 171 |
+
vk_pipeline pipeline_dequant[GGML_TYPE_COUNT];
|
| 172 |
+
vk_pipeline pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_COUNT];
|
| 173 |
+
vk_pipeline pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_COUNT];
|
| 174 |
+
vk_pipeline pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_COUNT];
|
| 175 |
|
| 176 |
vk_pipeline pipeline_mul_mat_vec_p021_f16_f32;
|
| 177 |
vk_pipeline pipeline_mul_mat_vec_nc_f16_f32;
|
| 178 |
+
vk_pipeline pipeline_get_rows[GGML_TYPE_COUNT];
|
| 179 |
+
vk_pipeline pipeline_get_rows_f32[GGML_TYPE_COUNT];
|
| 180 |
vk_pipeline pipeline_mul_f32;
|
| 181 |
vk_pipeline pipeline_div_f32;
|
| 182 |
vk_pipeline pipeline_add_f32;
|
|
|
|
| 1057 |
);
|
| 1058 |
}
|
| 1059 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1060 |
static void ggml_vk_load_shaders(vk_device& device) {
|
| 1061 |
VK_LOG_DEBUG("ggml_vk_load_shaders(" << device->name << ")");
|
| 1062 |
|
|
|
|
| 1091 |
device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K] = std::make_shared<vk_matmul_pipeline_struct>();
|
| 1092 |
device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K] = std::make_shared<vk_matmul_pipeline_struct>();
|
| 1093 |
device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K] = std::make_shared<vk_matmul_pipeline_struct>();
|
| 1094 |
+
device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL] = std::make_shared<vk_matmul_pipeline_struct>();
|
| 1095 |
|
| 1096 |
device->pipeline_matmul_id_f32 = std::make_shared<vk_matmul_pipeline_struct>();
|
| 1097 |
device->pipeline_matmul_id_f16_f32 = std::make_shared<vk_matmul_pipeline_struct>();
|
|
|
|
| 1106 |
device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K] = std::make_shared<vk_matmul_pipeline_struct>();
|
| 1107 |
device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K] = std::make_shared<vk_matmul_pipeline_struct>();
|
| 1108 |
device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K] = std::make_shared<vk_matmul_pipeline_struct>();
|
| 1109 |
+
device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL] = std::make_shared<vk_matmul_pipeline_struct>();
|
| 1110 |
|
| 1111 |
if (device->fp16) {
|
| 1112 |
ggml_vk_create_pipeline(device, device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_f32_len, matmul_f32_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
|
|
|
|
| 1207 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_m, "matmul_q6_k_f32_aligned_m", matmul_q6_k_f32_aligned_len, matmul_q6_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
|
| 1208 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_s, "matmul_q6_k_f32_aligned_s", matmul_q6_k_f32_aligned_len, matmul_q6_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
|
| 1209 |
|
| 1210 |
+
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->l, "matmul_iq4_nl_f32_l", matmul_iq4_nl_f32_len, matmul_iq4_nl_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
|
| 1211 |
+
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->m, "matmul_iq4_nl_f32_m", matmul_iq4_nl_f32_len, matmul_iq4_nl_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
|
| 1212 |
+
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->s, "matmul_iq4_nl_f32_s", matmul_iq4_nl_f32_len, matmul_iq4_nl_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
|
| 1213 |
+
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->a_l, "matmul_iq4_nl_f32_aligned_l", matmul_iq4_nl_f32_aligned_len, matmul_iq4_nl_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
|
| 1214 |
+
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->a_m, "matmul_iq4_nl_f32_aligned_m", matmul_iq4_nl_f32_aligned_len, matmul_iq4_nl_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
|
| 1215 |
+
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->a_s, "matmul_iq4_nl_f32_aligned_s", matmul_iq4_nl_f32_aligned_len, matmul_iq4_nl_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
|
| 1216 |
+
|
| 1217 |
ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f32->l, "matmul_id_f32_l", matmul_id_f32_f32_len, matmul_id_f32_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1);
|
| 1218 |
ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f32->m, "matmul_id_f32_m", matmul_id_f32_f32_len, matmul_id_f32_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1);
|
| 1219 |
ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f32->s, "matmul_id_f32_s", matmul_id_f32_f32_len, matmul_id_f32_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, 1);
|
|
|
|
| 1304 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_l, "matmul_id_q6_k_f32_aligned_l", matmul_id_q6_k_f32_aligned_len, matmul_id_q6_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
|
| 1305 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_m, "matmul_id_q6_k_f32_aligned_m", matmul_id_q6_k_f32_aligned_len, matmul_id_q6_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
|
| 1306 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_s, "matmul_id_q6_k_f32_aligned_s", matmul_id_q6_k_f32_aligned_len, matmul_id_q6_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
|
| 1307 |
+
|
| 1308 |
+
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->l, "matmul_id_iq4_nl_f32_l", matmul_id_iq4_nl_f32_len, matmul_id_iq4_nl_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
|
| 1309 |
+
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->m, "matmul_id_iq4_nl_f32_m", matmul_id_iq4_nl_f32_len, matmul_id_iq4_nl_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
|
| 1310 |
+
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->s, "matmul_id_iq4_nl_f32_s", matmul_id_iq4_nl_f32_len, matmul_id_iq4_nl_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
|
| 1311 |
+
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->a_l, "matmul_id_iq4_nl_f32_aligned_l", matmul_id_iq4_nl_f32_aligned_len, matmul_id_iq4_nl_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
|
| 1312 |
+
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->a_m, "matmul_id_iq4_nl_f32_aligned_m", matmul_id_iq4_nl_f32_aligned_len, matmul_id_iq4_nl_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
|
| 1313 |
+
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->a_s, "matmul_id_iq4_nl_f32_aligned_s", matmul_id_iq4_nl_f32_aligned_len, matmul_id_iq4_nl_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
|
| 1314 |
} else {
|
| 1315 |
ggml_vk_create_pipeline(device, device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_f32_fp32_len, matmul_f32_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
|
| 1316 |
ggml_vk_create_pipeline(device, device->pipeline_matmul_f32->m, "matmul_f32_m", matmul_f32_f32_fp32_len, matmul_f32_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
|
|
|
|
| 1410 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_m, "matmul_q6_k_f32_aligned_m", matmul_q6_k_f32_aligned_fp32_len, matmul_q6_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
|
| 1411 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_s, "matmul_q6_k_f32_aligned_s", matmul_q6_k_f32_aligned_fp32_len, matmul_q6_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
|
| 1412 |
|
| 1413 |
+
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->l, "matmul_iq4_nl_f32_l", matmul_iq4_nl_f32_fp32_len, matmul_iq4_nl_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
|
| 1414 |
+
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->m, "matmul_iq4_nl_f32_m", matmul_iq4_nl_f32_fp32_len, matmul_iq4_nl_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
|
| 1415 |
+
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->s, "matmul_iq4_nl_f32_s", matmul_iq4_nl_f32_fp32_len, matmul_iq4_nl_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
|
| 1416 |
+
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->a_l, "matmul_iq4_nl_f32_aligned_l", matmul_iq4_nl_f32_aligned_fp32_len, matmul_iq4_nl_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
|
| 1417 |
+
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->a_m, "matmul_iq4_nl_f32_aligned_m", matmul_iq4_nl_f32_aligned_fp32_len, matmul_iq4_nl_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
|
| 1418 |
+
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->a_s, "matmul_iq4_nl_f32_aligned_s", matmul_iq4_nl_f32_aligned_fp32_len, matmul_iq4_nl_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
|
| 1419 |
+
|
| 1420 |
ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f32->l, "matmul_id_f32_l", matmul_id_f32_f32_fp32_len, matmul_id_f32_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1);
|
| 1421 |
ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f32->m, "matmul_id_f32_m", matmul_id_f32_f32_fp32_len, matmul_id_f32_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1);
|
| 1422 |
ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f32->s, "matmul_id_f32_s", matmul_id_f32_f32_fp32_len, matmul_id_f32_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, 1);
|
|
|
|
| 1507 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_l, "matmul_id_q6_k_f32_aligned_l", matmul_id_q6_k_f32_aligned_fp32_len, matmul_id_q6_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
|
| 1508 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_m, "matmul_id_q6_k_f32_aligned_m", matmul_id_q6_k_f32_aligned_fp32_len, matmul_id_q6_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
|
| 1509 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_s, "matmul_id_q6_k_f32_aligned_s", matmul_id_q6_k_f32_aligned_fp32_len, matmul_id_q6_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
|
| 1510 |
+
|
| 1511 |
+
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->l, "matmul_id_iq4_nl_f32_l", matmul_id_iq4_nl_f32_fp32_len, matmul_id_iq4_nl_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
|
| 1512 |
+
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->m, "matmul_id_iq4_nl_f32_m", matmul_id_iq4_nl_f32_fp32_len, matmul_id_iq4_nl_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
|
| 1513 |
+
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->s, "matmul_id_iq4_nl_f32_s", matmul_id_iq4_nl_f32_fp32_len, matmul_id_iq4_nl_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
|
| 1514 |
+
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->a_l, "matmul_id_iq4_nl_f32_aligned_l", matmul_id_iq4_nl_f32_aligned_fp32_len, matmul_id_iq4_nl_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
|
| 1515 |
+
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->a_m, "matmul_id_iq4_nl_f32_aligned_m", matmul_id_iq4_nl_f32_aligned_fp32_len, matmul_id_iq4_nl_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
|
| 1516 |
+
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->a_s, "matmul_id_iq4_nl_f32_aligned_s", matmul_id_iq4_nl_f32_aligned_fp32_len, matmul_id_iq4_nl_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
|
| 1517 |
}
|
| 1518 |
|
| 1519 |
// mul mat vec
|
|
|
|
| 1529 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f32_f32", mul_mat_vec_q4_k_f32_f32_len, mul_mat_vec_q4_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
| 1530 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f32_f32", mul_mat_vec_q5_k_f32_f32_len, mul_mat_vec_q5_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
| 1531 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f32_f32", mul_mat_vec_q6_k_f32_f32_len, mul_mat_vec_q6_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
| 1532 |
+
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f32_f32", mul_mat_vec_iq4_nl_f32_f32_len, mul_mat_vec_iq4_nl_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
| 1533 |
|
| 1534 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F32 ], "mul_mat_vec_f32_f16_f32", mul_mat_vec_f32_f16_f32_len, mul_mat_vec_f32_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
| 1535 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f16_f32", mul_mat_vec_f16_f16_f32_len, mul_mat_vec_f16_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
|
|
|
| 1543 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f16_f32", mul_mat_vec_q4_k_f16_f32_len, mul_mat_vec_q4_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
| 1544 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f16_f32", mul_mat_vec_q5_k_f16_f32_len, mul_mat_vec_q5_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
| 1545 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f16_f32", mul_mat_vec_q6_k_f16_f32_len, mul_mat_vec_q6_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
| 1546 |
+
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f16_f32", mul_mat_vec_iq4_nl_f16_f32_len, mul_mat_vec_iq4_nl_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
| 1547 |
|
| 1548 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F32 ], "mul_mat_vec_id_f32_f32", mul_mat_vec_id_f32_f32_len, mul_mat_vec_id_f32_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
| 1549 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F16 ], "mul_mat_vec_id_f16_f32", mul_mat_vec_id_f16_f32_len, mul_mat_vec_id_f16_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
|
|
|
| 1557 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_f32", mul_mat_vec_id_q4_k_f32_len, mul_mat_vec_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
| 1558 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_f32", mul_mat_vec_id_q5_k_f32_len, mul_mat_vec_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
| 1559 |
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_f32", mul_mat_vec_id_q6_k_f32_len, mul_mat_vec_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
| 1560 |
+
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_id_iq4_nl_f32", mul_mat_vec_id_iq4_nl_f32_len, mul_mat_vec_id_iq4_nl_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
| 1561 |
|
| 1562 |
// dequant shaders
|
| 1563 |
ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_F32 ], "f32_to_f16", dequant_f32_len, dequant_f32_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
|
|
|
|
| 1571 |
ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q4_K], "dequant_q4_k", dequant_q4_k_len, dequant_q4_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1);
|
| 1572 |
ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q5_K], "dequant_q5_k", dequant_q5_k_len, dequant_q5_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
|
| 1573 |
ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q6_K], "dequant_q6_k", dequant_q6_k_len, dequant_q6_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
|
| 1574 |
+
ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_IQ4_NL], "dequant_iq4_nl", dequant_iq4_nl_len, dequant_iq4_nl_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
|
| 1575 |
|
| 1576 |
// get_rows
|
| 1577 |
ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_F32 ], "get_rows_f32", get_rows_f32_len, get_rows_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
|
|
|
|
| 1581 |
ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q5_0], "get_rows_q5_0", get_rows_q5_0_len, get_rows_q5_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
| 1582 |
ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q5_1], "get_rows_q5_1", get_rows_q5_1_len, get_rows_q5_1_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
| 1583 |
ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q8_0], "get_rows_q8_0", get_rows_q8_0_len, get_rows_q8_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
| 1584 |
+
ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ4_NL], "get_rows_iq4_nl", get_rows_iq4_nl_len, get_rows_iq4_nl_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
| 1585 |
|
| 1586 |
ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_F32 ], "get_rows_f32_f32", get_rows_f32_f32_len, get_rows_f32_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
|
| 1587 |
ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_F16 ], "get_rows_f16_f32", get_rows_f16_f32_len, get_rows_f16_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
|
|
|
|
| 1590 |
ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q5_0], "get_rows_q5_0_f32", get_rows_q5_0_f32_len, get_rows_q5_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
| 1591 |
ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q5_1], "get_rows_q5_1_f32", get_rows_q5_1_f32_len, get_rows_q5_1_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
| 1592 |
ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q8_0], "get_rows_q8_0_f32", get_rows_q8_0_f32_len, get_rows_q8_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
| 1593 |
+
ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_IQ4_NL], "get_rows_iq4_nl_f32", get_rows_iq4_nl_f32_len, get_rows_iq4_nl_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
| 1594 |
|
| 1595 |
ggml_vk_create_pipeline(device, device->pipeline_matmul_split_k_reduce, "split_k_reduce", split_k_reduce_len, split_k_reduce_data, "main", 2, 2 * sizeof(uint32_t), {256, 1, 1}, {}, 1);
|
| 1596 |
|
|
|
|
| 2102 |
case GGML_TYPE_Q4_K:
|
| 2103 |
case GGML_TYPE_Q5_K:
|
| 2104 |
case GGML_TYPE_Q6_K:
|
| 2105 |
+
case GGML_TYPE_IQ4_NL:
|
| 2106 |
break;
|
| 2107 |
default:
|
| 2108 |
return nullptr;
|
|
|
|
| 2139 |
case GGML_TYPE_Q4_K:
|
| 2140 |
case GGML_TYPE_Q5_K:
|
| 2141 |
case GGML_TYPE_Q6_K:
|
| 2142 |
+
case GGML_TYPE_IQ4_NL:
|
| 2143 |
break;
|
| 2144 |
default:
|
| 2145 |
return nullptr;
|
|
|
|
| 2165 |
case GGML_TYPE_Q4_K:
|
| 2166 |
case GGML_TYPE_Q5_K:
|
| 2167 |
case GGML_TYPE_Q6_K:
|
| 2168 |
+
case GGML_TYPE_IQ4_NL:
|
| 2169 |
break;
|
| 2170 |
default:
|
| 2171 |
return nullptr;
|
|
|
|
| 2199 |
case GGML_TYPE_Q4_K:
|
| 2200 |
case GGML_TYPE_Q5_K:
|
| 2201 |
case GGML_TYPE_Q6_K:
|
| 2202 |
+
case GGML_TYPE_IQ4_NL:
|
| 2203 |
break;
|
| 2204 |
default:
|
| 2205 |
return nullptr;
|
|
|
|
| 2225 |
case GGML_TYPE_Q4_K:
|
| 2226 |
case GGML_TYPE_Q5_K:
|
| 2227 |
case GGML_TYPE_Q6_K:
|
| 2228 |
+
case GGML_TYPE_IQ4_NL:
|
| 2229 |
break;
|
| 2230 |
default:
|
| 2231 |
return nullptr;
|
|
|
|
| 3451 |
|
| 3452 |
const uint64_t nei0 = ids->ne[0];
|
| 3453 |
const uint64_t nei1 = ids->ne[1];
|
| 3454 |
+
GGML_ASSERT(nei0 * nei1 <= 3072);
|
| 3455 |
|
| 3456 |
const uint32_t nbi1 = ids->nb[1];
|
| 3457 |
const uint32_t nbi2 = ids->nb[2];
|
|
|
|
| 3463 |
|
| 3464 |
const uint64_t n_as = ne02;
|
| 3465 |
|
|
|
|
|
|
|
| 3466 |
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
|
| 3467 |
ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
|
| 3468 |
ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra;
|
|
|
|
| 4641 |
}
|
| 4642 |
}
|
| 4643 |
|
| 4644 |
+
ggml_pipeline_allocate_descriptor_sets(ctx->device, p, num_it);
|
| 4645 |
if (split_k > 1) {
|
| 4646 |
+
ggml_pipeline_allocate_descriptor_sets(ctx->device, ctx->device->pipeline_matmul_split_k_reduce, num_it);
|
| 4647 |
|
| 4648 |
if (ctx->prealloc_split_k == nullptr || ctx->prealloc_split_k->size < sizeof(float) * d_ne * split_k) {
|
| 4649 |
// Resize buffer
|
| 4650 |
if (ctx->prealloc_split_k != nullptr) {
|
| 4651 |
ggml_vk_destroy_buffer(ctx->prealloc_split_k);
|
| 4652 |
}
|
| 4653 |
+
ctx->prealloc_split_k = ggml_vk_create_buffer_check(ctx->device, sizeof(float) * d_ne * split_k, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
| 4654 |
}
|
| 4655 |
}
|
| 4656 |
|
| 4657 |
+
vk_buffer d_X = ggml_vk_create_buffer_check(ctx->device, sizeof(X_TYPE) * x_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
| 4658 |
+
vk_buffer d_Y = ggml_vk_create_buffer_check(ctx->device, sizeof(Y_TYPE) * y_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
| 4659 |
+
vk_buffer d_D = ggml_vk_create_buffer_check(ctx->device, sizeof(float) * d_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
| 4660 |
|
| 4661 |
X_TYPE* x = (X_TYPE *) malloc(sizeof(X_TYPE) * x_ne);
|
| 4662 |
Y_TYPE* y = (Y_TYPE *) malloc(sizeof(Y_TYPE) * y_ne);
|
|
|
|
| 4683 |
}
|
| 4684 |
}
|
| 4685 |
|
| 4686 |
+
ggml_vk_buffer_write(d_X, 0, x, sizeof(X_TYPE) * k * m * batch);
|
| 4687 |
+
ggml_vk_buffer_write(d_Y, 0, y, sizeof(Y_TYPE) * k * n * batch);
|
| 4688 |
|
| 4689 |
vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
|
| 4690 |
for (size_t i = 0; i < num_it; i++) {
|
| 4691 |
+
ggml_vk_ctx_begin(ctx->device, subctx);
|
| 4692 |
ggml_vk_matmul(
|
| 4693 |
ctx, subctx, p, ggml_vk_subbuffer(d_X), ggml_vk_subbuffer(d_Y), ggml_vk_subbuffer(d_D), ggml_vk_subbuffer(ctx->prealloc_split_k),
|
| 4694 |
m, n, k,
|
|
|
|
| 4707 |
double time = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0;
|
| 4708 |
|
| 4709 |
// copy dst to host
|
| 4710 |
+
ggml_vk_buffer_read(d_D, 0, d, sizeof(float) * d_ne);
|
| 4711 |
|
| 4712 |
float * d_chk = (float *) malloc(sizeof(float) * d_ne);
|
| 4713 |
|
|
|
|
| 4783 |
|
| 4784 |
if (split_k > 1) {
|
| 4785 |
float * split_k_buf = (float *) malloc(sizeof(float) * d_ne * split_k);
|
| 4786 |
+
ggml_vk_buffer_read(ctx->prealloc_split_k, 0, split_k_buf, sizeof(float) * d_ne * split_k);
|
| 4787 |
|
| 4788 |
std::cerr << "d_buf0: " << std::endl << std::endl;
|
| 4789 |
ggml_vk_print_matrix_area(split_k_buf, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b);
|
|
|
|
| 4803 |
|
| 4804 |
free(d_chk);
|
| 4805 |
|
| 4806 |
+
ggml_vk_queue_cleanup(ctx->device, ctx->device->transfer_queue);
|
| 4807 |
+
ggml_vk_queue_cleanup(ctx->device, ctx->device->compute_queue);
|
| 4808 |
|
| 4809 |
ggml_vk_destroy_buffer(d_X);
|
| 4810 |
ggml_vk_destroy_buffer(d_Y);
|
|
|
|
| 4852 |
}
|
| 4853 |
}
|
| 4854 |
|
| 4855 |
+
// Test helper: quantizes `ne` float values read from `from` into the ggml
// format `quant`, writing the encoded data to `to`.
//
//   from  - source floats
//   to    - destination buffer for the quantized data; the caller sizes it
//   ne    - element count forwarded to ggml_quantize_chunk
//   quant - target ggml type
//
// NOTE(review): the literal arguments 0, 1, ne, nullptr presumably mean
// "start at offset 0, one row of `ne` elements, no importance matrix" —
// confirm against the declaration of ggml_quantize_chunk.
static void ggml_vk_quantize_data(const float * from, void * to, size_t ne, ggml_type quant) {
    // The result of ggml_quantize_chunk is intentionally not used here.
    (void) ggml_quantize_chunk(quant, from, to, 0, 1, ne, nullptr);
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4858 |
|
| 4859 |
+
static void ggml_vk_dequantize_data(const void * from, float * to, size_t ne, ggml_type quant) {
|
| 4860 |
+
if (quant == GGML_TYPE_F32) {
|
| 4861 |
+
memcpy(to, from, sizeof(float) * ne);
|
| 4862 |
+
return;
|
| 4863 |
}
|
| 4864 |
|
| 4865 |
+
ggml_type_traits_t tt = ggml_internal_get_type_traits(quant);
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4866 |
|
| 4867 |
+
ggml_to_float_t dequant_fn = tt.to_float;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4868 |
|
| 4869 |
+
dequant_fn(from, to, ne);
|
|
|
|
| 4870 |
}
|
| 4871 |
|
| 4872 |
static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_type quant) {
|
|
|
|
| 4876 |
const size_t qx_sz = ne * ggml_type_size(quant)/ggml_blck_size(quant);
|
| 4877 |
float * x = (float *) malloc(x_sz);
|
| 4878 |
void * qx = malloc(qx_sz);
|
| 4879 |
+
vk_buffer qx_buf = ggml_vk_create_buffer_check(ctx->device, qx_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
| 4880 |
+
vk_buffer x_buf = ggml_vk_create_buffer_check(ctx->device, x_sz_f16, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
| 4881 |
+
float * x_ref = (float *) malloc(x_sz);
|
| 4882 |
ggml_fp16_t * x_chk = (ggml_fp16_t *) malloc(x_sz_f16);
|
| 4883 |
|
| 4884 |
for (size_t i = 0; i < ne; i++) {
|
| 4885 |
x[i] = rand() / (float)RAND_MAX;
|
| 4886 |
}
|
| 4887 |
|
| 4888 |
+
vk_pipeline p = ggml_vk_get_to_fp16(ctx, quant);
|
| 4889 |
|
| 4890 |
ggml_vk_quantize_data(x, qx, ne, quant);
|
| 4891 |
+
ggml_vk_dequantize_data(qx, x_ref, ne, quant);
|
| 4892 |
|
| 4893 |
+
ggml_pipeline_allocate_descriptor_sets(ctx->device, p, 1);
|
| 4894 |
|
| 4895 |
+
ggml_vk_buffer_write(qx_buf, 0, qx, qx_sz);
|
| 4896 |
|
| 4897 |
vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
|
| 4898 |
+
ggml_vk_ctx_begin(ctx->device, subctx);
|
| 4899 |
const std::vector<uint32_t> pc = { 1, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne };
|
| 4900 |
ggml_vk_dispatch_pipeline(ctx, subctx, p, { { qx_buf, 0, qx_sz }, { x_buf, 0, x_sz_f16 } }, pc.size() * sizeof(int), pc.data(), { (uint32_t)ne, 1, 1});
|
| 4901 |
ggml_vk_ctx_end(subctx);
|
|
|
|
| 4909 |
auto end = std::chrono::high_resolution_clock::now();
|
| 4910 |
|
| 4911 |
double ms_dequant = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0;
|
| 4912 |
+
ggml_vk_buffer_read(x_buf, 0, x_chk, x_sz_f16);
|
| 4913 |
|
| 4914 |
int first_err = -1;
|
| 4915 |
|
| 4916 |
double avg_err = 0.0;
|
| 4917 |
for (size_t i = 0; i < ne; i++) {
|
| 4918 |
+
double error = std::fabs(x_ref[i] - ggml_fp16_to_fp32(x_chk[i]));
|
| 4919 |
avg_err += error;
|
| 4920 |
|
| 4921 |
if (first_err < 0 && error > 0.05) {
|
|
|
|
| 4935 |
}
|
| 4936 |
std::cerr << std::endl << "Expected result: " << std::endl << std::endl;
|
| 4937 |
for (int i = std::max(0, first_err - 5); i < std::min((int)ne, first_err + 5); i++) {
|
| 4938 |
+
std::cerr << x_ref[i] << ", ";
|
| 4939 |
}
|
| 4940 |
std::cerr << std::endl;
|
| 4941 |
}
|
|
|
|
| 4945 |
|
| 4946 |
free(x);
|
| 4947 |
free(qx);
|
| 4948 |
+
free(x_ref);
|
| 4949 |
free(x_chk);
|
| 4950 |
}
|
| 4951 |
|
|
|
|
| 4994 |
float * x = (float *) malloc(x_sz);
|
| 4995 |
float * y = (float *) malloc(y_sz);
|
| 4996 |
void * qx = malloc(qx_sz);
|
| 4997 |
+
vk_buffer qx_buf = ggml_vk_create_buffer_check(ctx->device, qx_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
| 4998 |
+
vk_buffer y_buf = ggml_vk_create_buffer_check(ctx->device, y_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
| 4999 |
+
vk_buffer d_buf = ggml_vk_create_buffer_check(ctx->device, d_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
| 5000 |
float * d = (float *) malloc(d_sz);
|
| 5001 |
float * d_chk = (float *) malloc(d_sz);
|
| 5002 |
|
|
|
|
| 5011 |
y[i] = (i % k == i / k) ? 1.0f : 0.0f;
|
| 5012 |
}
|
| 5013 |
|
| 5014 |
+
ggml_pipeline_allocate_descriptor_sets(ctx->device, p, num_it);
|
| 5015 |
if (split_k > 1) {
|
| 5016 |
+
ggml_pipeline_allocate_descriptor_sets(ctx->device, ctx->device->pipeline_matmul_split_k_reduce, num_it);
|
| 5017 |
|
| 5018 |
if (ctx->prealloc_split_k == nullptr || ctx->prealloc_split_k->size < sizeof(float) * d_ne * split_k) {
|
| 5019 |
// Resize buffer
|
| 5020 |
if (ctx->prealloc_split_k != nullptr) {
|
| 5021 |
ggml_vk_destroy_buffer(ctx->prealloc_split_k);
|
| 5022 |
}
|
| 5023 |
+
ctx->prealloc_split_k = ggml_vk_create_buffer_check(ctx->device, sizeof(float) * d_ne * split_k, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
| 5024 |
}
|
| 5025 |
}
|
| 5026 |
|
| 5027 |
+
ggml_vk_buffer_write(qx_buf, 0, qx, qx_sz);
|
| 5028 |
+
ggml_vk_buffer_write(y_buf, 0, y, y_sz);
|
| 5029 |
|
| 5030 |
vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
|
| 5031 |
for (size_t i = 0; i < num_it; i++) {
|
| 5032 |
+
ggml_vk_ctx_begin(ctx->device, subctx);
|
| 5033 |
ggml_vk_matmul(
|
| 5034 |
ctx, subctx, p, ggml_vk_subbuffer(qx_buf), ggml_vk_subbuffer(y_buf), ggml_vk_subbuffer(d_buf), ggml_vk_subbuffer(ctx->prealloc_split_k),
|
| 5035 |
m, n, k,
|
|
|
|
| 5048 |
auto end = std::chrono::high_resolution_clock::now();
|
| 5049 |
|
| 5050 |
double time_ms = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0;
|
| 5051 |
+
ggml_vk_buffer_read(d_buf, 0, d, d_sz);
|
| 5052 |
|
| 5053 |
ggml_init_params iparams = {
|
| 5054 |
/*.mem_size =*/ 1024*1024*1024,
|
|
|
|
| 5103 |
|
| 5104 |
if (split_k > 1) {
|
| 5105 |
float * split_k_buf = (float *) malloc(sizeof(float) * d_ne * split_k);
|
| 5106 |
+
ggml_vk_buffer_read(ctx->prealloc_split_k, 0, split_k_buf, sizeof(float) * d_ne * split_k);
|
| 5107 |
|
| 5108 |
std::cerr << "d_buf0: " << std::endl << std::endl;
|
| 5109 |
ggml_vk_print_matrix_area(split_k_buf, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b);
|
|
|
|
| 5256 |
|
| 5257 |
static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
|
| 5258 |
#if defined(GGML_VULKAN_RUN_TESTS)
|
| 5259 |
+
ctx->staging = ggml_vk_create_buffer_check(ctx->device, 100ul * 1024ul * 1024ul,
|
| 5260 |
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
|
| 5261 |
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
|
|
|
|
|
|
|
|
|
|
| 5262 |
ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_F32);
|
| 5263 |
ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q4_0);
|
| 5264 |
ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q4_1);
|
|
|
|
| 5270 |
ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q4_K);
|
| 5271 |
ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q5_K);
|
| 5272 |
ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q6_K);
|
| 5273 |
+
ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_IQ4_NL);
|
| 5274 |
|
| 5275 |
ggml_vk_test_matmul<ggml_fp16_t, ggml_fp16_t>(ctx, 512, 512, 100, 32, 100, 1, 2);
|
| 5276 |
|
| 5277 |
ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 1, 0);
|
| 5278 |
ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 1, 1);
|
| 5279 |
ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 1, 2);
|
| 5280 |
+
// ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 4, 0);
|
| 5281 |
+
// ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 4, 1);
|
| 5282 |
+
// ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 4, 2);
|
| 5283 |
|
| 5284 |
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q4_0);
|
| 5285 |
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q4_0);
|
| 5286 |
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q4_0);
|
| 5287 |
+
// ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q4_0);
|
| 5288 |
+
// ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q4_0);
|
| 5289 |
+
// ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q4_0);
|
| 5290 |
|
| 5291 |
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q4_1);
|
| 5292 |
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q4_1);
|
| 5293 |
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q4_1);
|
| 5294 |
+
// ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q4_1);
|
| 5295 |
+
// ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q4_1);
|
| 5296 |
+
// ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q4_1);
|
| 5297 |
|
| 5298 |
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q5_0);
|
| 5299 |
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q5_0);
|
| 5300 |
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q5_0);
|
| 5301 |
+
// ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q5_0);
|
| 5302 |
+
// ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q5_0);
|
| 5303 |
+
// ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q5_0);
|
| 5304 |
|
| 5305 |
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q5_1);
|
| 5306 |
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q5_1);
|
| 5307 |
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q5_1);
|
| 5308 |
+
// ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q5_1);
|
| 5309 |
+
// ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q5_1);
|
| 5310 |
+
// ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q5_1);
|
| 5311 |
|
| 5312 |
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q8_0);
|
| 5313 |
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q8_0);
|
| 5314 |
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q8_0);
|
| 5315 |
+
// ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q8_0);
|
| 5316 |
+
// ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q8_0);
|
| 5317 |
+
// ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q8_0);
|
| 5318 |
|
| 5319 |
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q2_K);
|
| 5320 |
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q2_K);
|
| 5321 |
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q2_K);
|
| 5322 |
+
// ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q2_K);
|
| 5323 |
+
// ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q2_K);
|
| 5324 |
+
// ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q2_K);
|
| 5325 |
|
| 5326 |
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q3_K);
|
| 5327 |
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q3_K);
|
| 5328 |
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q3_K);
|
| 5329 |
+
// ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q3_K);
|
| 5330 |
+
// ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q3_K);
|
| 5331 |
+
// ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q3_K);
|
| 5332 |
|
| 5333 |
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q4_K);
|
| 5334 |
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q4_K);
|
| 5335 |
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q4_K);
|
| 5336 |
+
// ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q4_K);
|
| 5337 |
+
// ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q4_K);
|
| 5338 |
+
// ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q4_K);
|
| 5339 |
|
| 5340 |
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q5_K);
|
| 5341 |
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q5_K);
|
| 5342 |
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q5_K);
|
| 5343 |
+
// ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q5_K);
|
| 5344 |
+
// ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q5_K);
|
| 5345 |
+
// ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q5_K);
|
| 5346 |
|
| 5347 |
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q6_K);
|
| 5348 |
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q6_K);
|
| 5349 |
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q6_K);
|
| 5350 |
+
// ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q6_K);
|
| 5351 |
+
// ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q6_K);
|
| 5352 |
+
// ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q6_K);
|
| 5353 |
+
|
| 5354 |
+
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_IQ4_NL);
|
| 5355 |
+
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_IQ4_NL);
|
| 5356 |
+
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_IQ4_NL);
|
| 5357 |
|
| 5358 |
std::cerr << std::endl;
|
| 5359 |
|
|
|
|
| 5385 |
ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 0);
|
| 5386 |
ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 1);
|
| 5387 |
ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 2);
|
| 5388 |
+
// ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 0);
|
| 5389 |
+
// ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 1);
|
| 5390 |
+
// ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 2);
|
| 5391 |
std::cerr << std::endl;
|
| 5392 |
}
|
| 5393 |
|
|
|
|
| 6219 |
case GGML_TYPE_Q4_K:
|
| 6220 |
case GGML_TYPE_Q5_K:
|
| 6221 |
case GGML_TYPE_Q6_K:
|
| 6222 |
+
case GGML_TYPE_IQ4_NL:
|
| 6223 |
break;
|
| 6224 |
default:
|
| 6225 |
return false;
|
|
|
|
| 6248 |
case GGML_TYPE_Q5_0:
|
| 6249 |
case GGML_TYPE_Q5_1:
|
| 6250 |
case GGML_TYPE_Q8_0:
|
| 6251 |
+
case GGML_TYPE_IQ4_NL:
|
| 6252 |
return true;
|
| 6253 |
default:
|
| 6254 |
return false;
|