Jinyang He committed
Commit b82d241 · 1 Parent(s): 7e1dbe9
ggml : optimize and build warning fix for LoongArch (llama/11709)
* ggml : optimize convert f32<->f16 for loongarch_asx
* ggml : optimize loongarch_asx extend i16,i8,u8 to i32,i16
* ggml : Fix warnings when run cpu CI locally on LoongArch
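
The common thread in the first two items is removing the ft_union bit-punning helper in favor of the compiler's native vector types, as the diffs below show. A minimal before/after sketch in portable GNU C (not the actual LoongArch headers; the v4f32 typedef and the memcpy lane extract are stand-ins for lsxintrin.h and __lsx_vpickve2gr_w):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

typedef float v4f32 __attribute__((vector_size(16)));  // stand-in for the LSX v4f32

// Old pattern: pull a lane out as an int32, then bit-pun it back to float.
typedef union { int32_t i; float f; } ft_union;

static float first_lane_old(v4f32 v) {
    ft_union tmp;
    memcpy(&tmp.i, &v, sizeof(tmp.i));  // stands in for __lsx_vpickve2gr_w(v, 0)
    return tmp.f;
}

// New pattern: GNU C vector types allow direct element access, as in the
// diff's ((v4f32)res)[0]. No integer detour, no union.
static float first_lane_new(v4f32 v) {
    return v[0];
}

int main(void) {
    v4f32 v = {1.5f, 2.5f, 3.5f, 4.5f};
    printf("%f %f\n", (double)first_lane_old(v), (double)first_lane_new(v));
    return 0;
}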
ggml/src/ggml-cpu/ggml-cpu-impl.h
CHANGED

@@ -360,21 +360,15 @@ inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b)
 #endif
 
 #if defined(__loongarch_asx)
-
-typedef union {
-    int32_t i;
-    float f;
-} ft_union;
-
 /* float type data load instructions */
-static __m128 __lsx_vreplfr2vr_s(float val) {
-    ft_union fi_tmpval = {.f = val};
-    return (__m128)__lsx_vreplgr2vr_w(fi_tmpval.i);
+static __m128 __lsx_vreplfr2vr_s(const float val) {
+    v4f32 res = {val, val, val, val};
+    return (__m128)res;
 }
 
-static __m256 __lasx_xvreplfr2vr_s(float val) {
-    ft_union fi_tmpval = {.f = val};
-    return (__m256)__lasx_xvreplgr2vr_w(fi_tmpval.i);
+static __m256 __lasx_xvreplfr2vr_s(const float val) {
+    v8f32 res = {val, val, val, val, val, val, val, val};
+    return (__m256)res;
 }
 #endif
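
The new broadcast helpers rely on vector-literal initialization, which lets the compiler pick a native replicate instruction instead of bouncing the float through an integer register. A standalone sketch of the same idea, assuming v4f32/v8f32 are the usual vector_size typedefs as in the LoongArch intrinsic headers:

#include <stdio.h>

typedef float v4f32 __attribute__((vector_size(16)));   // assumed LSX-style typedef
typedef float v8f32 __attribute__((vector_size(32)));   // assumed LASX-style typedef

// Mirrors the new __lsx_vreplfr2vr_s: broadcast one float into all four
// lanes with a plain initializer instead of punning through an int register.
static v4f32 replfr2vr_s_sketch(const float val) {
    v4f32 res = {val, val, val, val};
    return res;
}

// Same shape for the 256-bit LASX variant.
static v8f32 xvreplfr2vr_s_sketch(const float val) {
    v8f32 res = {val, val, val, val, val, val, val, val};
    return res;
}

int main(void) {
    v4f32 a = replfr2vr_s_sketch(3.25f);
    v8f32 b = xvreplfr2vr_s_sketch(-1.0f);
    printf("%f %f\n", (double)a[3], (double)b[7]);  // 3.250000 -1.000000
    return 0;
}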
ggml/src/ggml-cpu/ggml-cpu-quants.c
CHANGED

@@ -501,30 +501,15 @@ static __m256i lasx_shuffle_b(__m256i a, __m256i b) {
 }
 
 static __m256i lasx_extu8_16(__m128i a) {
-    __m128i zero = __lsx_vldi(0);
-    __m128i vlo = __lsx_vilvl_b(zero, a);
-    __m128i vhi = __lsx_vilvh_b(zero, a);
-    return lasx_set_q(vhi, vlo);
+    return __lasx_vext2xv_hu_bu(____m256i(a));
 }
 
 static __m256i lasx_ext8_16(__m128i a) {
-    __m128i sign = __lsx_vslti_b(a, 0);
-    __m128i vlo = __lsx_vilvl_b(sign, a);
-    __m128i vhi = __lsx_vilvh_b(sign, a);
-    return lasx_set_q(vhi, vlo);
+    return __lasx_vext2xv_h_b(____m256i(a));
 }
 
 static __m256i lasx_ext16_32(__m128i a) {
-    __m256i tmp1;
-    tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 0), 0);
-    tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 1), 1);
-    tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 2), 2);
-    tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 3), 3);
-    tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 4), 4);
-    tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 5), 5);
-    tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 6), 6);
-    tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 7), 7);
-    return tmp1;
+    return __lasx_vext2xv_w_h(____m256i(a));
 }
 
 static __m128i lasx_extracti128( __m256i a, int pos) {

@@ -592,12 +577,10 @@ static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) {
 // horizontally add 8 floats
 static inline float hsum_float_8(const __m256 x) {
     __m128 res = lasx_extractf128(x, 1);
-    ft_union tmp;
     res = __lsx_vfadd_s(res, lasx_extractf128(x, 0));
     res = __lsx_vfadd_s(res, (__m128)__lsx_vpickod_d((__m128i)res, (__m128i)res));
     res = __lsx_vfadd_s(res, (__m128)__lsx_vinsgr2vr_w(__lsx_vldi(0), __lsx_vpickve2gr_w(res, 1), 0));
-    tmp.i = __lsx_vpickve2gr_w(res, 0);
-    return tmp.f;
+    return ((v4f32)res)[0];
 }
 
 // horizontally add 8 int32_t

@@ -939,7 +922,6 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k)
 
 #elif defined(__loongarch_asx)
     for (int i = 0; i < nb; i++) {
-        ft_union fi;
         __m256 v0 = (__m256)__lasx_xvld( x , 0);
         __m256 v1 = (__m256)__lasx_xvld( x , 32);
         __m256 v2 = (__m256)__lasx_xvld( x , 64);

@@ -957,8 +939,7 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k)
         max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vpickod_d((__m128i) max4, (__m128i)max4 ) );
         __m128 tmp = max4;
         max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vinsgr2vr_w(tmp, __lsx_vpickve2gr_w( max4, 1 ), 0 ));
-        fi.i = __lsx_vpickve2gr_w( max4, 0 );
-        const float max_scalar = fi.f;
+        const float max_scalar = ((v4f32)max4)[0];
 
         // Quantize these floats
         const float d = max_scalar / 127.f;

@@ -1263,7 +1244,6 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k)
 
 #elif defined(__loongarch_asx)
     for (int i = 0; i < nb; i++) {
-        ft_union ft;
         __m256 v0 = (__m256)__lasx_xvld( x , 0 );
         __m256 v1 = (__m256)__lasx_xvld( x , 32 );
         __m256 v2 = (__m256)__lasx_xvld( x , 64 );

@@ -1281,8 +1261,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k)
         max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vpickod_d((__m128i) max4, (__m128i)max4 ) );
         __m128 tmp = max4;
         max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vextrins_w((__m128i)tmp, (__m128i)max4, 0x10 ));
-        ft.i = __lsx_vpickve2gr_w( max4, 0 );
-        const float max_scalar = ft.f;
+        const float max_scalar = ((v4f32)max4)[0];
 
         // Quantize these floats
         const float d = max_scalar / 127.f;

@@ -6154,9 +6133,7 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
     acc_m = __lsx_vfadd_s(acc_m, (__m128)tmp1);
 
 
-    ft_union fi;
-    fi.i = __lsx_vpickve2gr_w(acc_m, 0);
-    *s = hsum_float_8(acc) + fi.f ;
+    *s = hsum_float_8(acc) + ((v4f32)acc_m)[0];
 #else
 
     const uint8_t * scales = (const uint8_t*)&utmp[0];
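
For reference, the widening helpers compute plain per-lane sign or zero extension: the old code assembled it from vilvl/vilvh interleaves against a sign or zero vector, and the new code issues a single vext2xv instruction. A hypothetical scalar reference (not ggml API), showing the signed case:

#include <stdint.h>
#include <stdio.h>

// Reference for lasx_ext8_16: widen 16 signed bytes to 16 signed halfwords.
// The old code interleaved `a` with a sign mask (vslti_b) via vilvl/vilvh;
// lasx_extu8_16 did the same with a zero vector for unsigned widening, and
// lasx_ext16_32 is the analogous int16 -> int32 case.
static void ext8_16_ref(const int8_t a[16], int16_t out[16]) {
    for (int i = 0; i < 16; i++) {
        out[i] = (int16_t)a[i];  // sign-extend each lane
    }
}

int main(void) {
    int8_t a[16] = {-1, 2, -3, 4, 5, -6, 7, 8, 9, 10, -11, 12, 13, 14, 15, -128};
    int16_t out[16];
    ext8_16_ref(a, out);
    for (int i = 0; i < 16; i++) printf("%d ", out[i]);  // -1 2 -3 ... -128
    printf("\n");
    return 0;
}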
ggml/src/ggml-cpu/ggml-cpu.c
CHANGED

@@ -1078,29 +1078,23 @@ do { \
 #define GGML_F16_STEP 32
 #define GGML_F16_EPR 8
 
-// F16 arithmetic is not supported by AVX, so we use F32 instead
+// F16 arithmetic is not supported by LASX, so we use F32 instead
 
 #define GGML_F32Cx8 __m256
 #define GGML_F32Cx8_ZERO (__m256)__lasx_xvldi(0)
 #define GGML_F32Cx8_SET1(x) (__m256)__lasx_xvreplgr2vr_w((x))
 
 static inline __m256 __lasx_f32cx8_load(const ggml_fp16_t * x) {
-    float tmp[8];
-
-    for (int i = 0; i < 8; i++) {
-        tmp[i] = GGML_FP16_TO_FP32(x[i]);
-    }
-
-    return (__m256)__lasx_xvld(tmp, 0);
+    __m256i a;
+    memcpy(&a, x, sizeof(ggml_fp16_t) * 8);
+    a = __lasx_xvpermi_d(a, 0 | (1 << 4));
+    return __lasx_xvfcvtl_s_h(a);
 }
-static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
-    float arr[8];
 
-    __lasx_xvst(y, arr, 0);
-
-    for (int i = 0; i < 8; i++) {
-        x[i] = GGML_FP32_TO_FP16(arr[i]);
-    }
+static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
+    __m256i a = __lasx_xvfcvt_h_s(y, y);
+    a = __lasx_xvpermi_d(a, 0 | (2 << 2));
+    memcpy(x, &a, sizeof(ggml_fp16_t) * 8);
 }
 #define GGML_F32Cx8_LOAD(x) __lasx_f32cx8_load(x)
 #define GGML_F32Cx8_STORE(x, y) __lasx_f32cx8_store(x, y)
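
The f32<->f16 change replaces a scalar loop through a float[8] staging buffer with the vector convert instructions (xvfcvtl_s_h / xvfcvt_h_s) plus a lane permute. For reference, a self-contained scalar decoder for one value, i.e. the per-lane conversion that GGML_FP16_TO_FP32 performed in the old loop; this is a sketch assuming ggml_fp16_t holds IEEE-754 binary16 bits, not ggml's actual implementation:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

// Decode one IEEE-754 binary16 value to float.
static float fp16_to_fp32_ref(uint16_t h) {
    uint32_t sign = (uint32_t)(h >> 15) << 31;
    uint32_t exp  = (h >> 10) & 0x1f;
    uint32_t man  = h & 0x3ff;
    uint32_t bits;
    if (exp == 0) {
        if (man == 0) {
            bits = sign;                                    // +-0
        } else {                                            // subnormal: renormalize
            int e = -1;
            do { man <<= 1; e++; } while ((man & 0x400) == 0);
            bits = sign | (uint32_t)(127 - 15 - e) << 23 | (man & 0x3ff) << 13;
        }
    } else if (exp == 0x1f) {
        bits = sign | 0xffu << 23 | man << 13;              // inf / NaN
    } else {
        bits = sign | (exp - 15 + 127) << 23 | man << 13;   // normal number
    }
    float f;
    memcpy(&f, &bits, sizeof(f));                           // bit-cast without UB
    return f;
}

int main(void) {
    // 0x3c00 = 1.0, 0xc000 = -2.0, 0x3555 ~= 0.33325 in binary16
    uint16_t h[3] = {0x3c00, 0xc000, 0x3555};
    for (int i = 0; i < 3; i++) printf("%g\n", fp16_to_fp32_ref(h[i]));
    return 0;
}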