| #define _POSIX_C_SOURCE 199309L |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| #include <immintrin.h> |
| #include <omp.h> |
| #include <stdint.h> |
| #include <stdlib.h> |
| #include <string.h> |
| #include <math.h> |
| #include <stdio.h> |
| #include <time.h> |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
/*
 * Log-unary encoded vector: each element is a sign bit plus a unary
 * ("thermometer") code across n_planes bit-planes.  Plane p of element i
 * is set iff |x[i]| >= base_scale * 2^(p - bias) (see lut_from_float),
 * so the set planes of an element always form a contiguous run from 0.
 */
typedef struct {
    uint64_t *sign;     /* chunks words; bit i set => element i is negative */
    uint64_t *planes;   /* n_planes * chunks words, plane-major layout */
    int dim;            /* number of logical elements */
    int chunks;         /* ceil(dim / 64): 64-bit words per plane */
    int n_planes;       /* number of magnitude bit-planes */
    int bias;           /* exponent bias applied to plane thresholds */
    float base_scale;   /* per-tensor scale, set by lut_from_float */
} LogUnaryTensor;
|
|
| |
/*
 * Log-unary encoded matrix: a collection of log-unary rows sharing the
 * same plane count but with independent per-row scales.  planes is
 * indexed as planes[((size_t)p * rows + r) * chunks + c] — plane-major,
 * then row, then 64-bit column chunk (see lum_from_float).
 */
typedef struct {
    uint64_t *sign;       /* rows * chunks words; per-row sign bitmaps */
    uint64_t *planes;     /* n_planes * rows * chunks words */
    float *row_scales;    /* rows entries; per-row base scale */
    int rows;
    int cols;
    int chunks;           /* ceil(cols / 64) */
    int n_planes;
    int bias;
} LogUnaryMatrix;
|
|
| |
| |
| |
| LogUnaryTensor* lut_alloc(int dim, int n_planes, int bias) { |
| LogUnaryTensor *t = (LogUnaryTensor *)calloc(1, sizeof(LogUnaryTensor)); |
| t->dim = dim; |
| t->n_planes = n_planes; |
| t->bias = bias; |
| t->chunks = (dim + 63) / 64; |
| t->base_scale = 1.0f; |
| t->sign = (uint64_t *)aligned_alloc(64, t->chunks * sizeof(uint64_t)); |
| t->planes = (uint64_t *)aligned_alloc(64, (size_t)n_planes * t->chunks * sizeof(uint64_t)); |
| memset(t->sign, 0, t->chunks * sizeof(uint64_t)); |
| memset(t->planes, 0, (size_t)n_planes * t->chunks * sizeof(uint64_t)); |
| return t; |
| } |
|
|
| LogUnaryMatrix* lum_alloc(int rows, int cols, int n_planes, int bias) { |
| LogUnaryMatrix *m = (LogUnaryMatrix *)calloc(1, sizeof(LogUnaryMatrix)); |
| m->rows = rows; |
| m->cols = cols; |
| m->n_planes = n_planes; |
| m->bias = bias; |
| m->chunks = (cols + 63) / 64; |
| m->sign = (uint64_t *)aligned_alloc(64, (size_t)rows * m->chunks * sizeof(uint64_t)); |
| m->planes = (uint64_t *)aligned_alloc(64, (size_t)n_planes * rows * m->chunks * sizeof(uint64_t)); |
| m->row_scales = (float *)aligned_alloc(64, rows * sizeof(float)); |
| memset(m->sign, 0, (size_t)rows * m->chunks * sizeof(uint64_t)); |
| memset(m->planes, 0, (size_t)n_planes * rows * m->chunks * sizeof(uint64_t)); |
| for (int i = 0; i < rows; i++) m->row_scales[i] = 1.0f; |
| return m; |
| } |
|
|
| void lut_free(LogUnaryTensor *t) { |
| if (t) { free(t->sign); free(t->planes); free(t); } |
| } |
| void lum_free(LogUnaryMatrix *m) { |
| if (m) { free(m->sign); free(m->planes); free(m->row_scales); free(m); } |
| } |
|
|
| |
| |
| |
| |
| void lut_from_float(LogUnaryTensor *t, const float *x) { |
| int dim = t->dim; |
| int np = t->n_planes; |
| int bias = t->bias; |
| int chunks = t->chunks; |
|
|
| memset(t->sign, 0, chunks * sizeof(uint64_t)); |
| memset(t->planes, 0, (size_t)np * chunks * sizeof(uint64_t)); |
|
|
| |
| float amax = 0.0f; |
| for (int i = 0; i < dim; i++) { |
| float a = fabsf(x[i]); |
| if (a > amax) amax = a; |
| } |
| if (amax == 0.0f) { t->base_scale = 1.0f; return; } |
|
|
| |
| |
| t->base_scale = amax / ldexpf(1.0f, np - 1 - bias); |
|
|
| for (int i = 0; i < dim; i++) { |
| int c = i / 64; |
| uint64_t bit = 1ULL << (i % 64); |
|
|
| if (x[i] < 0.0f) t->sign[c] |= bit; |
|
|
| float mag = fabsf(x[i]); |
| |
| for (int p = 0; p < np; p++) { |
| float thresh = t->base_scale * ldexpf(1.0f, p - bias); |
| if (mag >= thresh) |
| t->planes[(size_t)p * chunks + c] |= bit; |
| else |
| break; |
| } |
| } |
| } |
|
|
| void lut_to_float(const LogUnaryTensor *t, float *out) { |
| int dim = t->dim; |
| int np = t->n_planes; |
| int bias = t->bias; |
| int chunks = t->chunks; |
|
|
| memset(out, 0, dim * sizeof(float)); |
|
|
| for (int i = 0; i < dim; i++) { |
| int c = i / 64; |
| uint64_t bit = 1ULL << (i % 64); |
|
|
| |
| int highest = -1; |
| for (int p = np - 1; p >= 0; p--) { |
| if (t->planes[(size_t)p * chunks + c] & bit) { |
| highest = p; |
| break; |
| } |
| } |
|
|
| if (highest < 0) { |
| out[i] = 0.0f; |
| } else { |
| |
| |
| float val = t->base_scale * ldexpf(1.0f, highest - bias); |
| if (highest < np - 1) { |
| float next = t->base_scale * ldexpf(1.0f, highest + 1 - bias); |
| val = (val + next) * 0.5f; |
| } |
| out[i] = (t->sign[c] & bit) ? -val : val; |
| } |
| } |
| } |
|
|
| |
| void lum_from_float(LogUnaryMatrix *m, const float *data) { |
| int rows = m->rows, cols = m->cols; |
| int np = m->n_planes, bias = m->bias; |
| int chunks = m->chunks; |
|
|
| memset(m->sign, 0, (size_t)rows * chunks * sizeof(uint64_t)); |
| memset(m->planes, 0, (size_t)np * rows * chunks * sizeof(uint64_t)); |
|
|
| for (int r = 0; r < rows; r++) { |
| const float *row = data + (size_t)r * cols; |
|
|
| |
| float amax = 0.0f; |
| for (int j = 0; j < cols; j++) { |
| float a = fabsf(row[j]); |
| if (a > amax) amax = a; |
| } |
| if (amax == 0.0f) { m->row_scales[r] = 1.0f; continue; } |
| m->row_scales[r] = amax / ldexpf(1.0f, np - 1 - bias); |
|
|
| uint64_t *row_sign = m->sign + (size_t)r * chunks; |
|
|
| for (int j = 0; j < cols; j++) { |
| int c = j / 64; |
| uint64_t bit = 1ULL << (j % 64); |
|
|
| if (row[j] < 0.0f) row_sign[c] |= bit; |
|
|
| float mag = fabsf(row[j]); |
| for (int p = 0; p < np; p++) { |
| float thresh = m->row_scales[r] * ldexpf(1.0f, p - bias); |
| if (mag >= thresh) |
| m->planes[((size_t)p * rows + r) * chunks + c] |= bit; |
| else |
| break; |
| } |
| } |
| } |
| } |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| void lum_matvec( |
| const LogUnaryMatrix *M, |
| const LogUnaryTensor *x, |
| LogUnaryTensor *y_out |
| ) { |
| int out_dim = M->rows; |
| int chunks = M->chunks; |
| int wp = M->n_planes; |
| int xp = x->n_planes; |
| int w_bias = M->bias; |
| int x_bias = x->bias; |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| int base_shift = w_bias + x_bias; |
|
|
| |
| |
|
|
| float *y_float = (float *)aligned_alloc(64, out_dim * sizeof(float)); |
|
|
| #pragma omp parallel for schedule(dynamic, 32) |
| for (int i = 0; i < out_dim; i++) { |
| const uint64_t *w_sign_row = M->sign + (size_t)i * chunks; |
| long long acc = 0; |
|
|
| for (int c = 0; c < chunks; c++) { |
| uint64_t ws = w_sign_row[c]; |
| uint64_t xs = x->sign[c]; |
| uint64_t same = ~(ws ^ xs); |
| uint64_t diff = ws ^ xs; |
|
|
| for (int p = 0; p < wp; p++) { |
| uint64_t w_plane = M->planes[((size_t)p * out_dim + i) * chunks + c]; |
|
|
| for (int q = 0; q < xp; q++) { |
| uint64_t x_plane = x->planes[(size_t)q * chunks + c]; |
| uint64_t active = w_plane & x_plane; |
| uint64_t pos = active & same; |
| uint64_t neg = active & diff; |
|
|
| int count = __builtin_popcountll(pos) - __builtin_popcountll(neg); |
|
|
| |
| int shift = p + q; |
| if (count != 0) |
| acc += (long long)count << shift; |
| } |
| } |
| } |
|
|
| |
| y_float[i] = (float)acc * M->row_scales[i] * x->base_scale |
| * ldexpf(1.0f, -base_shift); |
| } |
|
|
| |
| lut_from_float(y_out, y_float); |
| free(y_float); |
| } |
|
|
| |
| |
| |
| |
| |
| |
| |
| void lut_add(const LogUnaryTensor *a, const LogUnaryTensor *b, LogUnaryTensor *out) { |
| int dim = a->dim; |
| float *fa = (float *)aligned_alloc(64, dim * sizeof(float)); |
| float *fb = (float *)aligned_alloc(64, dim * sizeof(float)); |
|
|
| lut_to_float(a, fa); |
| lut_to_float(b, fb); |
|
|
| for (int i = 0; i < dim; i++) fa[i] += fb[i]; |
|
|
| lut_from_float(out, fa); |
| free(fa); free(fb); |
| } |
|
|
| |
| void lut_add_float(LogUnaryTensor *a, const float *b) { |
| int dim = a->dim; |
| float *fa = (float *)aligned_alloc(64, dim * sizeof(float)); |
| lut_to_float(a, fa); |
| for (int i = 0; i < dim; i++) fa[i] += b[i]; |
| lut_from_float(a, fa); |
| free(fa); |
| } |
|
|
| |
| |
| |
| |
| |
| |
| void lut_rmsnorm( |
| const LogUnaryTensor *x, |
| const float *weight, |
| LogUnaryTensor *out, |
| float eps |
| ) { |
| int dim = x->dim; |
| float *xf = (float *)aligned_alloc(64, dim * sizeof(float)); |
| lut_to_float(x, xf); |
|
|
| float ss = 0.0f; |
| for (int i = 0; i < dim; i++) ss += xf[i] * xf[i]; |
| float rms = 1.0f / sqrtf(ss / dim + eps); |
|
|
| for (int i = 0; i < dim; i++) xf[i] = xf[i] * rms * weight[i]; |
|
|
| lut_from_float(out, xf); |
| free(xf); |
| } |
|
|
| |
| |
| |
| |
| |
| void lut_silu_mul( |
| const LogUnaryTensor *gate, |
| const LogUnaryTensor *up, |
| LogUnaryTensor *out |
| ) { |
| int dim = gate->dim; |
| float *gf = (float *)aligned_alloc(64, dim * sizeof(float)); |
| float *uf = (float *)aligned_alloc(64, dim * sizeof(float)); |
|
|
| lut_to_float(gate, gf); |
| lut_to_float(up, uf); |
|
|
| for (int i = 0; i < dim; i++) |
| gf[i] = (gf[i] / (1.0f + expf(-gf[i]))) * uf[i]; |
|
|
| lut_from_float(out, gf); |
| free(gf); free(uf); |
| } |
|
|
| |
| |
| |
| |
| |
| void lut_rope(LogUnaryTensor *t, int offset, int start, int head_dim, float theta) { |
| |
| float *f = (float *)aligned_alloc(64, head_dim * sizeof(float)); |
|
|
| |
| float *full = (float *)aligned_alloc(64, t->dim * sizeof(float)); |
| lut_to_float(t, full); |
| memcpy(f, full + start, head_dim * sizeof(float)); |
|
|
| for (int i = 0; i < head_dim; i += 2) { |
| float freq = 1.0f / powf(theta, (float)i / head_dim); |
| float angle = offset * freq; |
| float c = cosf(angle), s = sinf(angle); |
| float v0 = f[i], v1 = f[i + 1]; |
| f[i] = v0 * c - v1 * s; |
| f[i + 1] = v0 * s + v1 * c; |
| } |
|
|
| memcpy(full + start, f, head_dim * sizeof(float)); |
| lut_from_float(t, full); |
| free(f); free(full); |
| } |
|
|
| |
| |
| |
| |
| void lut_to_float_slice(const LogUnaryTensor *t, int start, int len, float *out) { |
| float *full = (float *)aligned_alloc(64, t->dim * sizeof(float)); |
| lut_to_float(t, full); |
| memcpy(out, full + start, len * sizeof(float)); |
| free(full); |
| } |
|
|
| |
| |
| |
/* Throughput figures reported by lum_bench_matvec. */
typedef struct {
    double total_and_ops;       /* approx AND ops per single matvec call (despite the name, not summed over iters) */
    double total_popcount_ops;  /* approx popcount ops per single matvec call */
    double wall_time_s;         /* mean seconds per call over the timed loop */
    double elements_per_sec;    /* rows * cols * iters / elapsed seconds */
    double gops;                /* billions of counted ops per second */
} BenchResult;
|
|
/*
 * Micro-benchmark for lum_matvec: fill a rows x cols matrix and a
 * cols-vector with random bit patterns, run one untimed warm-up call,
 * then time `iters` repetitions and report per-call cost and throughput.
 * NOTE(review): the random plane bits are not valid unary/thermometer
 * codes, so only the timing is meaningful, not the numeric result.
 * Assumes iters > 0 and a nonzero elapsed time.
 */
BenchResult lum_bench_matvec(int rows, int cols, int w_planes, int x_planes, int bias, int iters) {
    LogUnaryMatrix *M = lum_alloc(rows, cols, w_planes, bias);
    LogUnaryTensor *x = lut_alloc(cols, x_planes, bias);
    LogUnaryTensor *y = lut_alloc(rows, x_planes, bias);

    /* rand() yields at most 31 bits; splice two calls into each word. */
    for (size_t i = 0; i < (size_t)rows * M->chunks; i++)
        M->sign[i] = ((uint64_t)rand() << 32) | rand();
    for (size_t i = 0; i < (size_t)w_planes * rows * M->chunks; i++)
        M->planes[i] = ((uint64_t)rand() << 32) | rand();
    for (int i = 0; i < rows; i++) M->row_scales[i] = 1.0f;
    for (size_t i = 0; i < (size_t)x->chunks; i++)
        x->sign[i] = ((uint64_t)rand() << 32) | rand();
    for (size_t i = 0; i < (size_t)x_planes * x->chunks; i++)
        x->planes[i] = ((uint64_t)rand() << 32) | rand();
    x->base_scale = 1.0f;

    /* Warm-up call: touch pages and caches before timing starts. */
    lum_matvec(M, x, y);

    struct timespec t0, t1;
    clock_gettime(CLOCK_MONOTONIC, &t0);
    for (int i = 0; i < iters; i++)
        lum_matvec(M, x, y);
    clock_gettime(CLOCK_MONOTONIC, &t1);

    double dt = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) * 1e-9;
    int chunks = M->chunks;
    /* "2 ops" per plane-pair per chunk per row is an approximation; the
       kernel actually issues a few mask ANDs plus two popcounts per pair. */
    double ops_per_call = (double)rows * chunks * w_planes * x_planes * 2;

    BenchResult r;
    r.wall_time_s = dt / iters;          /* mean seconds per matvec call */
    r.total_and_ops = ops_per_call;      /* per call, not per benchmark run */
    r.total_popcount_ops = ops_per_call;
    r.elements_per_sec = (double)rows * cols * iters / dt;
    r.gops = ops_per_call * iters / dt / 1e9;

    lum_free(M); lut_free(x); lut_free(y);
    return r;
}
|
|
| |
| |
| |
/* Round-trip quantization quality metrics from lut_accuracy_test. */
typedef struct {
    float max_error;    /* max |original - recovered| over all elements */
    float mean_error;   /* mean absolute reconstruction error */
    float cosine_sim;   /* cosine similarity of original vs recovered */
    float snr_db;       /* 10*log10(signal power / error power) */
} AccuracyResult;
|
|
| AccuracyResult lut_accuracy_test(int dim, int n_planes, int bias) { |
| float *original = (float *)aligned_alloc(64, dim * sizeof(float)); |
| float *recovered = (float *)aligned_alloc(64, dim * sizeof(float)); |
|
|
| |
| for (int i = 0; i < dim; i++) { |
| float u1 = (float)(rand() + 1) / (RAND_MAX + 1.0f); |
| float u2 = (float)(rand() + 1) / (RAND_MAX + 1.0f); |
| original[i] = sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2); |
| } |
|
|
| LogUnaryTensor *t = lut_alloc(dim, n_planes, bias); |
| lut_from_float(t, original); |
| lut_to_float(t, recovered); |
|
|
| float max_err = 0, sum_err = 0; |
| float dot = 0, na = 0, nb = 0; |
| for (int i = 0; i < dim; i++) { |
| float err = fabsf(original[i] - recovered[i]); |
| if (err > max_err) max_err = err; |
| sum_err += err; |
| dot += original[i] * recovered[i]; |
| na += original[i] * original[i]; |
| nb += recovered[i] * recovered[i]; |
| } |
|
|
| float noise_power = 0; |
| for (int i = 0; i < dim; i++) { |
| float e = original[i] - recovered[i]; |
| noise_power += e * e; |
| } |
|
|
| AccuracyResult r; |
| r.max_error = max_err; |
| r.mean_error = sum_err / dim; |
| r.cosine_sim = dot / (sqrtf(na) * sqrtf(nb) + 1e-10f); |
| r.snr_db = 10.0f * log10f(na / (noise_power + 1e-10f)); |
|
|
| lut_free(t); |
| free(original); free(recovered); |
| return r; |
| } |
|
|