| #define _POSIX_C_SOURCE 199309L |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| #include <immintrin.h> |
| #include <omp.h> |
| #include <stdint.h> |
| #include <stdlib.h> |
| #include <string.h> |
| #include <math.h> |
| #include <stdio.h> |
| #include <time.h> |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
/*
 * Log-unary encoded vector: each element is a sign bit plus a unary
 * ("thermometer") code across n_planes bit-planes.  Plane p of element i
 * is set iff |x[i]| >= base_scale * 2^(p - bias) (see lut_from_float),
 * so the set planes of an element always form a contiguous run from 0.
 */
typedef struct {
    uint64_t *sign;     /* chunks words; bit i set => element i is negative */
    uint64_t *planes;   /* n_planes * chunks words, plane-major layout */
    int dim;            /* number of logical elements */
    int chunks;         /* ceil(dim / 64): 64-bit words per plane */
    int n_planes;       /* number of magnitude bit-planes */
    int bias;           /* exponent bias applied to plane thresholds */
    float base_scale;   /* per-tensor scale, set by lut_from_float */
} LogUnaryTensor;
|
|
| |
/*
 * Log-unary encoded matrix: a collection of log-unary rows sharing the
 * same plane count but with independent per-row scales.  planes is
 * indexed as planes[((size_t)p * rows + r) * chunks + c] — plane-major,
 * then row, then 64-bit column chunk (see lum_from_float).
 */
typedef struct {
    uint64_t *sign;       /* rows * chunks words; per-row sign bitmaps */
    uint64_t *planes;     /* n_planes * rows * chunks words */
    float *row_scales;    /* rows entries; per-row base scale */
    int rows;
    int cols;
    int chunks;           /* ceil(cols / 64) */
    int n_planes;
    int bias;
} LogUnaryMatrix;
|
|
| |
| |
| |
| LogUnaryTensor* lut_alloc(int dim, int n_planes, int bias) { |
| LogUnaryTensor *t = (LogUnaryTensor *)calloc(1, sizeof(LogUnaryTensor)); |
| t->dim = dim; |
| t->n_planes = n_planes; |
| t->bias = bias; |
| t->chunks = (dim + 63) / 64; |
| t->base_scale = 1.0f; |
| t->sign = (uint64_t *)aligned_alloc(64, t->chunks * sizeof(uint64_t)); |
| t->planes = (uint64_t *)aligned_alloc(64, (size_t)n_planes * t->chunks * sizeof(uint64_t)); |
| memset(t->sign, 0, t->chunks * sizeof(uint64_t)); |
| memset(t->planes, 0, (size_t)n_planes * t->chunks * sizeof(uint64_t)); |
| return t; |
| } |
|
|
| LogUnaryMatrix* lum_alloc(int rows, int cols, int n_planes, int bias) { |
| LogUnaryMatrix *m = (LogUnaryMatrix *)calloc(1, sizeof(LogUnaryMatrix)); |
| m->rows = rows; |
| m->cols = cols; |
| m->n_planes = n_planes; |
| m->bias = bias; |
| m->chunks = (cols + 63) / 64; |
| m->sign = (uint64_t *)aligned_alloc(64, (size_t)rows * m->chunks * sizeof(uint64_t)); |
| m->planes = (uint64_t *)aligned_alloc(64, (size_t)n_planes * rows * m->chunks * sizeof(uint64_t)); |
| m->row_scales = (float *)aligned_alloc(64, rows * sizeof(float)); |
| memset(m->sign, 0, (size_t)rows * m->chunks * sizeof(uint64_t)); |
| memset(m->planes, 0, (size_t)n_planes * rows * m->chunks * sizeof(uint64_t)); |
| for (int i = 0; i < rows; i++) m->row_scales[i] = 1.0f; |
| return m; |
| } |
|
|
| void lut_free(LogUnaryTensor *t) { |
| if (t) { free(t->sign); free(t->planes); free(t); } |
| } |
| void lum_free(LogUnaryMatrix *m) { |
| if (m) { free(m->sign); free(m->planes); free(m->row_scales); free(m); } |
| } |
|
|
| |
| |
| |
| |
| void lut_from_float(LogUnaryTensor *t, const float *x) { |
| int dim = t->dim; |
| int np = t->n_planes; |
| int bias = t->bias; |
| int chunks = t->chunks; |
|
|
| memset(t->sign, 0, chunks * sizeof(uint64_t)); |
| memset(t->planes, 0, (size_t)np * chunks * sizeof(uint64_t)); |
|
|
| |
| float amax = 0.0f; |
| for (int i = 0; i < dim; i++) { |
| float a = fabsf(x[i]); |
| if (a > amax) amax = a; |
| } |
| if (amax == 0.0f) { t->base_scale = 1.0f; return; } |
|
|
| |
| |
| t->base_scale = amax / ldexpf(1.0f, np - 1 - bias); |
|
|
| for (int i = 0; i < dim; i++) { |
| int c = i / 64; |
| uint64_t bit = 1ULL << (i % 64); |
|
|
| if (x[i] < 0.0f) t->sign[c] |= bit; |
|
|
| float mag = fabsf(x[i]); |
| |
| for (int p = 0; p < np; p++) { |
| float thresh = t->base_scale * ldexpf(1.0f, p - bias); |
| if (mag >= thresh) |
| t->planes[(size_t)p * chunks + c] |= bit; |
| else |
| break; |
| } |
| } |
| } |
|
|
| void lut_to_float(const LogUnaryTensor *t, float *out) { |
| int dim = t->dim; |
| int np = t->n_planes; |
| int bias = t->bias; |
| int chunks = t->chunks; |
|
|
| memset(out, 0, dim * sizeof(float)); |
|
|
| for (int i = 0; i < dim; i++) { |
| int c = i / 64; |
| uint64_t bit = 1ULL << (i % 64); |
|
|
| |
| int highest = -1; |
| for (int p = np - 1; p >= 0; p--) { |
| if (t->planes[(size_t)p * chunks + c] & bit) { |
| highest = p; |
| break; |
| } |
| } |
|
|
| if (highest < 0) { |
| out[i] = 0.0f; |
| } else { |
| |
| |
| float val = t->base_scale * ldexpf(1.0f, highest - bias); |
| if (highest < np - 1) { |
| float next = t->base_scale * ldexpf(1.0f, highest + 1 - bias); |
| val = (val + next) * 0.5f; |
| } |
| out[i] = (t->sign[c] & bit) ? -val : val; |
| } |
| } |
| } |
|
|
| |
| void lum_from_float(LogUnaryMatrix *m, const float *data) { |
| int rows = m->rows, cols = m->cols; |
| int np = m->n_planes, bias = m->bias; |
| int chunks = m->chunks; |
|
|
| memset(m->sign, 0, (size_t)rows * chunks * sizeof(uint64_t)); |
| memset(m->planes, 0, (size_t)np * rows * chunks * sizeof(uint64_t)); |
|
|
| for (int r = 0; r < rows; r++) { |
| const float *row = data + (size_t)r * cols; |
|
|
| |
| float amax = 0.0f; |
| for (int j = 0; j < cols; j++) { |
| float a = fabsf(row[j]); |
| if (a > amax) amax = a; |
| } |
| if (amax == 0.0f) { m->row_scales[r] = 1.0f; continue; } |
| m->row_scales[r] = amax / ldexpf(1.0f, np - 1 - bias); |
|
|
| uint64_t *row_sign = m->sign + (size_t)r * chunks; |
|
|
| for (int j = 0; j < cols; j++) { |
| int c = j / 64; |
| uint64_t bit = 1ULL << (j % 64); |
|
|
| if (row[j] < 0.0f) row_sign[c] |= bit; |
|
|
| float mag = fabsf(row[j]); |
| for (int p = 0; p < np; p++) { |
| float thresh = m->row_scales[r] * ldexpf(1.0f, p - bias); |
| if (mag >= thresh) |
| m->planes[((size_t)p * rows + r) * chunks + c] |= bit; |
| else |
| break; |
| } |
| } |
| } |
| } |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| void lum_matvec( |
| const LogUnaryMatrix *M, |
| const LogUnaryTensor *x, |
| LogUnaryTensor *y_out |
| ) { |
| int out_dim = M->rows; |
| int chunks = M->chunks; |
| int wp = M->n_planes; |
| int xp = x->n_planes; |
| int w_bias = M->bias; |
| int x_bias = x->bias; |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| int base_shift = w_bias + x_bias; |
|
|
| |
| |
|
|
| float *y_float = (float *)aligned_alloc(64, out_dim * sizeof(float)); |
|
|
| #pragma omp parallel for schedule(dynamic, 32) |
| for (int i = 0; i < out_dim; i++) { |
| const uint64_t *w_sign_row = M->sign + (size_t)i * chunks; |
| long long acc = 0; |
|
|
| for (int c = 0; c < chunks; c++) { |
| uint64_t ws = w_sign_row[c]; |
| uint64_t xs = x->sign[c]; |
| uint64_t same = ~(ws ^ xs); |
| uint64_t diff = ws ^ xs; |
|
|
| for (int p = 0; p < wp; p++) { |
| uint64_t w_plane = M->planes[((size_t)p * out_dim + i) * chunks + c]; |
|
|
| for (int q = 0; q < xp; q++) { |
| uint64_t x_plane = x->planes[(size_t)q * chunks + c]; |
| uint64_t active = w_plane & x_plane; |
| uint64_t pos = active & same; |
| uint64_t neg = active & diff; |
|
|
| int count = __builtin_popcountll(pos) - __builtin_popcountll(neg); |
|
|
| |
| int shift = p + q; |
| if (count != 0) |
| acc += (long long)count << shift; |
| } |
| } |
| } |
|
|
| |
| y_float[i] = (float)acc * M->row_scales[i] * x->base_scale |
| * ldexpf(1.0f, -base_shift); |
| } |
|
|
| |
| lut_from_float(y_out, y_float); |
| free(y_float); |
| } |
|
|
| |
| |
| |
| |
| |
| |
| |
| void lut_add(const LogUnaryTensor *a, const LogUnaryTensor *b, LogUnaryTensor *out) { |
| int dim = a->dim; |
| float *fa = (float *)aligned_alloc(64, dim * sizeof(float)); |
| float *fb = (float *)aligned_alloc(64, dim * sizeof(float)); |
|
|
| lut_to_float(a, fa); |
| lut_to_float(b, fb); |
|
|
| for (int i = 0; i < dim; i++) fa[i] += fb[i]; |
|
|
| lut_from_float(out, fa); |
| free(fa); free(fb); |
| } |
|
|
| |
| void lut_add_float(LogUnaryTensor *a, const float *b) { |
| int dim = a->dim; |
| float *fa = (float *)aligned_alloc(64, dim * sizeof(float)); |
| lut_to_float(a, fa); |
| for (int i = 0; i < dim; i++) fa[i] += b[i]; |
| lut_from_float(a, fa); |
| free(fa); |
| } |
|
|
| |
| |
| |
| |
| |
| |
| void lut_rmsnorm( |
| const LogUnaryTensor *x, |
| const float *weight, |
| LogUnaryTensor *out, |
| float eps |
| ) { |
| int dim = x->dim; |
| float *xf = (float *)aligned_alloc(64, dim * sizeof(float)); |
| lut_to_float(x, xf); |
|
|
| float ss = 0.0f; |
| for (int i = 0; i < dim; i++) ss += xf[i] * xf[i]; |
| float rms = 1.0f / sqrtf(ss / dim + eps); |
|
|
| for (int i = 0; i < dim; i++) xf[i] = xf[i] * rms * weight[i]; |
|
|
| lut_from_float(out, xf); |
| free(xf); |
| } |
|
|
| |
| |
| |
| |
| |
| void lut_silu_mul( |
| const LogUnaryTensor *gate, |
| const LogUnaryTensor *up, |
| LogUnaryTensor *out |
| ) { |
| int dim = gate->dim; |
| float *gf = (float *)aligned_alloc(64, dim * sizeof(float)); |
| float *uf = (float *)aligned_alloc(64, dim * sizeof(float)); |
|
|
| lut_to_float(gate, gf); |
| lut_to_float(up, uf); |
|
|
| for (int i = 0; i < dim; i++) |
| gf[i] = (gf[i] / (1.0f + expf(-gf[i]))) * uf[i]; |
|
|
| lut_from_float(out, gf); |
| free(gf); free(uf); |
| } |
|
|
| |
| |
| |
| |
| |
| void lut_rope(LogUnaryTensor *t, int offset, int start, int head_dim, float theta) { |
| |
| float *f = (float *)aligned_alloc(64, head_dim * sizeof(float)); |
|
|
| |
| float *full = (float *)aligned_alloc(64, t->dim * sizeof(float)); |
| lut_to_float(t, full); |
| memcpy(f, full + start, head_dim * sizeof(float)); |
|
|
| for (int i = 0; i < head_dim; i += 2) { |
| float freq = 1.0f / powf(theta, (float)i / head_dim); |
| float angle = offset * freq; |
| float c = cosf(angle), s = sinf(angle); |
| float v0 = f[i], v1 = f[i + 1]; |
| f[i] = v0 * c - v1 * s; |
| f[i + 1] = v0 * s + v1 * c; |
| } |
|
|
| memcpy(full + start, f, head_dim * sizeof(float)); |
| lut_from_float(t, full); |
| free(f); free(full); |
| } |
|
|
| |
| |
| |
| |
| void lut_to_float_slice(const LogUnaryTensor *t, int start, int len, float *out) { |
| float *full = (float *)aligned_alloc(64, t->dim * sizeof(float)); |
| lut_to_float(t, full); |
| memcpy(out, full + start, len * sizeof(float)); |
| free(full); |
| } |
|
|
| |
| |
| |
/* Throughput figures reported by lum_bench_matvec. */
typedef struct {
    double total_and_ops;       /* approx AND ops per single matvec call (despite the name, not summed over iters) */
    double total_popcount_ops;  /* approx popcount ops per single matvec call */
    double wall_time_s;         /* mean seconds per call over the timed loop */
    double elements_per_sec;    /* rows * cols * iters / elapsed seconds */
    double gops;                /* billions of counted ops per second */
} BenchResult;
|
|
/*
 * Micro-benchmark for lum_matvec: fill a rows x cols matrix and a
 * cols-vector with random bit patterns, run one untimed warm-up call,
 * then time `iters` repetitions and report per-call cost and throughput.
 * NOTE(review): the random plane bits are not valid unary/thermometer
 * codes, so only the timing is meaningful, not the numeric result.
 * Assumes iters > 0 and a nonzero elapsed time.
 */
BenchResult lum_bench_matvec(int rows, int cols, int w_planes, int x_planes, int bias, int iters) {
    LogUnaryMatrix *M = lum_alloc(rows, cols, w_planes, bias);
    LogUnaryTensor *x = lut_alloc(cols, x_planes, bias);
    LogUnaryTensor *y = lut_alloc(rows, x_planes, bias);

    /* rand() yields at most 31 bits; splice two calls into each word. */
    for (size_t i = 0; i < (size_t)rows * M->chunks; i++)
        M->sign[i] = ((uint64_t)rand() << 32) | rand();
    for (size_t i = 0; i < (size_t)w_planes * rows * M->chunks; i++)
        M->planes[i] = ((uint64_t)rand() << 32) | rand();
    for (int i = 0; i < rows; i++) M->row_scales[i] = 1.0f;
    for (size_t i = 0; i < (size_t)x->chunks; i++)
        x->sign[i] = ((uint64_t)rand() << 32) | rand();
    for (size_t i = 0; i < (size_t)x_planes * x->chunks; i++)
        x->planes[i] = ((uint64_t)rand() << 32) | rand();
    x->base_scale = 1.0f;

    /* Warm-up call: touch pages and caches before timing starts. */
    lum_matvec(M, x, y);

    struct timespec t0, t1;
    clock_gettime(CLOCK_MONOTONIC, &t0);
    for (int i = 0; i < iters; i++)
        lum_matvec(M, x, y);
    clock_gettime(CLOCK_MONOTONIC, &t1);

    double dt = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) * 1e-9;
    int chunks = M->chunks;
    /* "2 ops" per plane-pair per chunk per row is an approximation; the
       kernel actually issues a few mask ANDs plus two popcounts per pair. */
    double ops_per_call = (double)rows * chunks * w_planes * x_planes * 2;

    BenchResult r;
    r.wall_time_s = dt / iters;          /* mean seconds per matvec call */
    r.total_and_ops = ops_per_call;      /* per call, not per benchmark run */
    r.total_popcount_ops = ops_per_call;
    r.elements_per_sec = (double)rows * cols * iters / dt;
    r.gops = ops_per_call * iters / dt / 1e9;

    lum_free(M); lut_free(x); lut_free(y);
    return r;
}
|
|
| |
| |
| |
/* Round-trip quantization quality metrics from lut_accuracy_test. */
typedef struct {
    float max_error;    /* max |original - recovered| over all elements */
    float mean_error;   /* mean absolute reconstruction error */
    float cosine_sim;   /* cosine similarity of original vs recovered */
    float snr_db;       /* 10*log10(signal power / error power) */
} AccuracyResult;
|
|
| AccuracyResult lut_accuracy_test(int dim, int n_planes, int bias) { |
| float *original = (float *)aligned_alloc(64, dim * sizeof(float)); |
| float *recovered = (float *)aligned_alloc(64, dim * sizeof(float)); |
|
|
| |
| for (int i = 0; i < dim; i++) { |
| float u1 = (float)(rand() + 1) / (RAND_MAX + 1.0f); |
| float u2 = (float)(rand() + 1) / (RAND_MAX + 1.0f); |
| original[i] = sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2); |
| } |
|
|
| LogUnaryTensor *t = lut_alloc(dim, n_planes, bias); |
| lut_from_float(t, original); |
| lut_to_float(t, recovered); |
|
|
| float max_err = 0, sum_err = 0; |
| float dot = 0, na = 0, nb = 0; |
| for (int i = 0; i < dim; i++) { |
| float err = fabsf(original[i] - recovered[i]); |
| if (err > max_err) max_err = err; |
| sum_err += err; |
| dot += original[i] * recovered[i]; |
| na += original[i] * original[i]; |
| nb += recovered[i] * recovered[i]; |
| } |
|
|
| float noise_power = 0; |
| for (int i = 0; i < dim; i++) { |
| float e = original[i] - recovered[i]; |
| noise_power += e * e; |
| } |
|
|
| AccuracyResult r; |
| r.max_error = max_err; |
| r.mean_error = sum_err / dim; |
| r.cosine_sim = dot / (sqrtf(na) * sqrtf(nb) + 1e-10f); |
| r.snr_db = 10.0f * log10f(na / (noise_power + 1e-10f)); |
|
|
| lut_free(t); |
| free(original); free(recovered); |
| return r; |
| } |
|
|