ggerganov HF Staff committed on
Commit
cb8bbaa
·
unverified ·
1 Parent(s): 11a2545

ggml : fix unnecessary f32 -> f16 -> f32 casts (mmla) (llama/5951)

Browse files
Files changed (1) hide show
  1. ggml-quants.c +4 -4
ggml-quants.c CHANGED
@@ -4059,10 +4059,10 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
4059
  const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);
4060
 
4061
  // mmla into int32x4_t
4062
- float32x4_t scale = {GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
4063
- GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
4064
- GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
4065
- GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)};
4066
 
4067
  int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
4068
  int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
 
4059
  const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);
4060
 
4061
  // mmla into int32x4_t
4062
+ float32x4_t scale = {GGML_FP16_TO_FP32(b_x0->d)*b_y0->d,
4063
+ GGML_FP16_TO_FP32(b_x0->d)*b_y1->d,
4064
+ GGML_FP16_TO_FP32(b_x1->d)*b_y0->d,
4065
+ GGML_FP16_TO_FP32(b_x1->d)*b_y1->d};
4066
 
4067
  int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
4068
  int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));