Jinyang He committed · Commit b82d241 · 1 Parent(s): 7e1dbe9

ggml : optimize and build warning fix for LoongArch (llama/11709)


* ggml : optimize convert f32<->f16 for loongarch_asx

* ggml : optimize loongarch_asx extension of i16, i8, and u8 to i32 and i16

* ggml : fix warnings when running the CPU CI locally on LoongArch

ggml/src/ggml-cpu/ggml-cpu-impl.h CHANGED
@@ -360,21 +360,15 @@ inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b)
 #endif
 
 #if defined(__loongarch_asx)
-
-typedef union {
-    int32_t i;
-    float f;
-} ft_union;
-
 /* float type data load instructions */
-static __m128 __lsx_vreplfr2vr_s(float val) {
-    ft_union fi_tmpval = {.f = val};
-    return (__m128)__lsx_vreplgr2vr_w(fi_tmpval.i);
+static __m128 __lsx_vreplfr2vr_s(const float val) {
+    v4f32 res = {val, val, val, val};
+    return (__m128)res;
 }
 
-static __m256 __lasx_xvreplfr2vr_s(float val) {
-    ft_union fi_tmpval = {.f = val};
-    return (__m256)__lasx_xvreplgr2vr_w(fi_tmpval.i);
+static __m256 __lasx_xvreplfr2vr_s(const float val) {
+    v8f32 res = {val, val, val, val, val, val, val, val};
+    return (__m256)res;
 }
 #endif
 
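The change above removes the ft_union type-punning: instead of copying the float into an int, replicating the int, and reinterpreting the result, the splat helpers now build the float vector directly and let the compiler emit the replicate. The same idea can be checked without a LoongArch toolchain using GCC/Clang vector extensions; the sketch below is illustrative only (v4f32, splat_old and splat_new are local names, not ggml's):

#include <stdint.h>
#include <stdio.h>

// Illustrative 128-bit vectors, mirroring the LSX v4f32 layout.
typedef float   v4f32 __attribute__((vector_size(16)));
typedef int32_t v4i32 __attribute__((vector_size(16)));

// Old pattern: pun the float to an int, replicate the int, reinterpret.
static v4f32 splat_old(float val) {
    union { int32_t i; float f; } u = {.f = val};
    v4i32 r = {u.i, u.i, u.i, u.i};
    return (v4f32)r;
}

// New pattern: build the float vector directly; no integer detour.
static v4f32 splat_new(float val) {
    v4f32 r = {val, val, val, val};
    return r;
}

int main(void) {
    v4f32 a = splat_old(1.5f);
    v4f32 b = splat_new(1.5f);
    for (int i = 0; i < 4; i++) {
        printf("%g %g\n", a[i], b[i]);
    }
    return 0;
}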
ggml/src/ggml-cpu/ggml-cpu-quants.c CHANGED
@@ -501,30 +501,15 @@ static __m256i lasx_shuffle_b(__m256i a, __m256i b) {
 }
 
 static __m256i lasx_extu8_16(__m128i a) {
-    __m128i zero = __lsx_vldi(0);
-    __m128i vlo = __lsx_vilvl_b(zero, a);
-    __m128i vhi = __lsx_vilvh_b(zero, a);
-    return lasx_set_q(vhi, vlo);
+    return __lasx_vext2xv_hu_bu(____m256i(a));
 }
 
 static __m256i lasx_ext8_16(__m128i a) {
-    __m128i sign = __lsx_vslti_b(a, 0);
-    __m128i vlo = __lsx_vilvl_b(sign, a);
-    __m128i vhi = __lsx_vilvh_b(sign, a);
-    return lasx_set_q(vhi, vlo);
+    return __lasx_vext2xv_h_b(____m256i(a));
 }
 
 static __m256i lasx_ext16_32(__m128i a) {
-    __m256i tmp1;
-    tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 0), 0);
-    tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 1), 1);
-    tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 2), 2);
-    tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 3), 3);
-    tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 4), 4);
-    tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 5), 5);
-    tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 6), 6);
-    tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 7), 7);
-    return tmp1;
+    return __lasx_vext2xv_w_h(____m256i(a));
 }
 
 static __m128i lasx_extracti128( __m256i a, int pos) {
@@ -592,12 +577,10 @@ static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) {
 // horizontally add 8 floats
 static inline float hsum_float_8(const __m256 x) {
     __m128 res = lasx_extractf128(x, 1);
-    ft_union tmp;
     res = __lsx_vfadd_s(res, lasx_extractf128(x, 0));
     res = __lsx_vfadd_s(res, (__m128)__lsx_vpickod_d((__m128i)res, (__m128i)res));
     res = __lsx_vfadd_s(res, (__m128)__lsx_vinsgr2vr_w(__lsx_vldi(0), __lsx_vpickve2gr_w(res, 1), 0));
-    tmp.i = __lsx_vpickve2gr_w(res, 0);
-    return tmp.f;
+    return ((v4f32)res)[0];
 }
 
 // horizontally add 8 int32_t
@@ -939,7 +922,6 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k)
 
 #elif defined(__loongarch_asx)
     for (int i = 0; i < nb; i++) {
-        ft_union fi;
         __m256 v0 = (__m256)__lasx_xvld( x , 0);
         __m256 v1 = (__m256)__lasx_xvld( x , 32);
         __m256 v2 = (__m256)__lasx_xvld( x , 64);
@@ -957,8 +939,7 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k)
         max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vpickod_d((__m128i) max4, (__m128i)max4 ) );
         __m128 tmp = max4;
         max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vinsgr2vr_w(tmp, __lsx_vpickve2gr_w( max4, 1 ), 0 ));
-        fi.i = __lsx_vpickve2gr_w( (__m128i)max4, 0 );
-        const float max_scalar = fi.f;
+        const float max_scalar = ((v4f32)max4)[0];
 
         // Quantize these floats
         const float d = max_scalar / 127.f;
@@ -1263,7 +1244,6 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k)
 
 #elif defined(__loongarch_asx)
     for (int i = 0; i < nb; i++) {
-        ft_union ft;
         __m256 v0 = (__m256)__lasx_xvld( x , 0 );
         __m256 v1 = (__m256)__lasx_xvld( x , 32 );
         __m256 v2 = (__m256)__lasx_xvld( x , 64 );
@@ -1281,8 +1261,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k)
         max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vpickod_d((__m128i) max4, (__m128i)max4 ) );
         __m128 tmp = max4;
         max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vextrins_w((__m128i)tmp, (__m128i)max4, 0x10 ));
-        ft.i = __lsx_vpickve2gr_w( (__m128i)max4, 0 );
-        const float max_scalar = ft.f;
+        const float max_scalar = ((v4f32)max4)[0];
 
         // Quantize these floats
         const float d = max_scalar / 127.f;
@@ -6154,9 +6133,7 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
     acc_m = __lsx_vfadd_s(acc_m, (__m128)tmp1);
 
 
-    ft_union fi;
-    fi.i = __lsx_vpickve2gr_w(acc_m, 0);
-    *s = hsum_float_8(acc) + fi.f ;
+    *s = hsum_float_8(acc) + ((v4f32)acc_m)[0];
 #else
 
     const uint8_t * scales = (const uint8_t*)&utmp[0];
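Two changes repeat through this file. The lane-by-lane widening helpers (interleave with a zero/sign vector, or eight __lasx_xvinsgr2vr_w inserts starting from an uninitialized register, a likely source of the warnings the commit message mentions) are collapsed into single __lasx_vext2xv_hu_bu / _h_b / _w_h calls, and every scalar result is now read straight out of lane 0 with a vector subscript instead of punning through ft_union. Both patterns can be sketched portably with GCC/Clang vector extensions; everything below is an illustrative model, not ggml code:

#include <stdint.h>
#include <stdio.h>

typedef float   v4f32  __attribute__((vector_size(16)));
typedef int32_t v4i32  __attribute__((vector_size(16)));
typedef int8_t  v16i8  __attribute__((vector_size(16)));
typedef int16_t v16i16 __attribute__((vector_size(32)));

// Old pattern: move lane 0 out as an integer, then pun it back to float
// (what __lsx_vpickve2gr_w plus ft_union did).
static float lane0_old(v4f32 v) {
    union { int32_t i; float f; } u;
    u.i = ((v4i32)v)[0];
    return u.f;
}

// New pattern: subscript the float vector directly; in the diff this is
// written ((v4f32)res)[0] because res has the opaque __m128 type.
static float lane0_new(v4f32 v) {
    return v[0];
}

// Element-wise sign extension i8 -> i16, the operation __lasx_vext2xv_h_b
// performs for a whole vector in one instruction; modelled here with the
// portable __builtin_convertvector (Clang, GCC 9+).
static v16i16 ext8_16_model(v16i8 a) {
    return __builtin_convertvector(a, v16i16);
}

int main(void) {
    v4f32 v = {3.25f, 1.0f, 2.0f, 4.0f};
    printf("%g %g\n", lane0_old(v), lane0_new(v));

    v16i8 b = {-128, -1, 0, 1, 127, -5, 5, -64, 64, -2, 2, -3, 3, -7, 7, 9};
    v16i16 w = ext8_16_model(b);
    printf("%d %d %d\n", w[0], w[1], w[4]);   // -128 -1 127
    return 0;
}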
ggml/src/ggml-cpu/ggml-cpu.c CHANGED
@@ -1078,29 +1078,23 @@ do { \
 #define GGML_F16_STEP 32
 #define GGML_F16_EPR 8
 
-// F16 arithmetic is not supported by AVX, so we use F32 instead
+// F16 arithmetic is not supported by LASX, so we use F32 instead
 
 #define GGML_F32Cx8 __m256
 #define GGML_F32Cx8_ZERO (__m256)__lasx_xvldi(0)
 #define GGML_F32Cx8_SET1(x) (__m256)__lasx_xvreplgr2vr_w((x))
 
 static inline __m256 __lasx_f32cx8_load(const ggml_fp16_t * x) {
-    float tmp[8];
-
-    for (int i = 0; i < 8; i++) {
-        tmp[i] = GGML_FP16_TO_FP32(x[i]);
-    }
-
-    return (__m256)__lasx_xvld(tmp, 0);
+    __m256i a;
+    memcpy(&a, x, sizeof(ggml_fp16_t) * 8);
+    a = __lasx_xvpermi_d(a, 0 | (1 << 4));
+    return __lasx_xvfcvtl_s_h(a);
 }
-static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
-    float arr[8];
 
-    __lasx_xvst(y, arr, 0);
-
-    for (int i = 0; i < 8; i++) {
-        x[i] = GGML_FP32_TO_FP16(arr[i]);
-    }
+static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
+    __m256i a = __lasx_xvfcvt_h_s(y, y);
+    a = __lasx_xvpermi_d(a, 0 | (2 << 2));
+    memcpy(x, &a, sizeof(ggml_fp16_t) * 8);
 }
 #define GGML_F32Cx8_LOAD(x) __lasx_f32cx8_load(x)
 #define GGML_F32Cx8_STORE(x, y) __lasx_f32cx8_store(x, y)
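The rewritten converters replace the scalar loop with the vector convert instructions: the load memcpys 8 halves into the low 128 bits of a register, permutes 64-bit lanes so that double-word 0 sits in the low 128-bit lane and double-word 1 in the high lane (where __lasx_xvfcvtl_s_h reads the four low halves of each lane), and widens to 8 floats in one instruction; the store narrows with __lasx_xvfcvt_h_s(y, y) and gathers double-words 0 and 2 back into the low 128 bits (immediate 0 | (2 << 2)) before copying out the 16 bytes. Below is a small portable model of that double-word permute, assuming the documented xvpermi.d behaviour (each 2-bit field of the immediate selects the source lane for one destination lane); the function name and values are illustrative only:

#include <stdint.h>
#include <stdio.h>

// Reference model of LASX xvpermi.d: destination double-word i takes the
// source double-word selected by bits [2i+1:2i] of the 8-bit immediate.
// (A model for reasoning about the diff, not the real intrinsic.)
static void xvpermi_d_model(uint64_t dst[4], const uint64_t src[4], unsigned imm) {
    for (int i = 0; i < 4; i++) {
        dst[i] = src[(imm >> (2 * i)) & 0x3];
    }
}

int main(void) {
    // After the memcpy in __lasx_f32cx8_load, the 8 fp16 values occupy
    // double-words 0 and 1; double-words 2 and 3 are don't-care.
    uint64_t halves[4] = {0x0003000200010000ULL, 0x0007000600050004ULL, 0, 0};
    uint64_t spread[4];

    // imm = 0 | (1 << 4) gives dst = {dw0, dw0, dw1, dw0}. What matters is
    // that dw0 lands in the low half of 128-bit lane 0 and dw1 in the low
    // half of 128-bit lane 1 -- exactly where __lasx_xvfcvtl_s_h reads the
    // halves it widens to 8 floats.
    xvpermi_d_model(spread, halves, 0 | (1 << 4));
    for (int i = 0; i < 4; i++) {
        printf("dw%d = %016llx\n", i, (unsigned long long)spread[i]);
    }
    return 0;
}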