kernels-community
/

rwkv

Model card Files Files and versions

rwkv / benchmarks /benchmark.py

danieldk's picture

danieldk HF Staff

Benchmarks uploaded using `kernels`.

adc211c verified 16 days ago

history blame contribute delete

2.66 kB

	import torch

	from kernels.benchmark import Benchmark


	def rwkv_wkv_reference(
	w: torch.Tensor, u: torch.Tensor, k: torch.Tensor, v: torch.Tensor
	) -> torch.Tensor:
	B, T, C = k.shape
	device = k.device
	dtype = k.dtype

	y = torch.zeros(B, T, C, device=device, dtype=dtype)

	# State: accumulated numerator, denominator, and max exponent
	aa = torch.zeros(B, C, device=device, dtype=torch.float32)
	bb = torch.zeros(B, C, device=device, dtype=torch.float32)
	pp = torch.full((B, C), -1e38, device=device, dtype=torch.float32)

	w = w.float()
	u = u.float()

	for t in range(T):
	kt = k[:, t, :].float() # [B, C]
	vt = v[:, t, :].float() # [B, C]

	# Output computation
	ww = u + kt
	p = torch.maximum(pp, ww)
	e1 = torch.exp(pp - p)
	e2 = torch.exp(ww - p)
	y[:, t, :] = ((e1 * aa + e2 * vt) / (e1 * bb + e2)).to(dtype)

	# State update (note: w + pp, not pp - w)
	ww = w + pp
	p = torch.maximum(ww, kt)
	e1 = torch.exp(ww - p)
	e2 = torch.exp(kt - p)
	aa = e1 * aa + e2 * vt
	bb = e1 * bb + e2
	pp = p

	return y


	class RwkvBenchmark(Benchmark):
	seed: int = 42

	def setup(self):
	B, T, C = 2, 64, 256

	self.w = torch.randn(
	C, device=self.device, dtype=torch.float32
	).abs() # Decay should be positive
	self.u = torch.randn(C, device=self.device, dtype=torch.float32)
	self.k = torch.randn(B, T, C, device=self.device, dtype=torch.float32) * 0.1
	self.v = torch.randn(B, T, C, device=self.device, dtype=torch.float32) * 0.1
	self.out = torch.zeros(B, T, C, device=self.device, dtype=torch.float32)

	def benchmark_base(self):
	self.out.zero_()
	self.kernel.forward(self.w, self.u, self.k, self.v, self.out)

	def verify_base(self) -> torch.Tensor:
	return rwkv_wkv_reference(self.w, self.u, self.k, self.v)

	def setup_large(self):
	B, T, C = 8, 256, 512

	self.w = torch.randn(C, device=self.device, dtype=torch.float32).abs()
	self.u = torch.randn(C, device=self.device, dtype=torch.float32)
	self.k = torch.randn(B, T, C, device=self.device, dtype=torch.float32) * 0.1
	self.v = torch.randn(B, T, C, device=self.device, dtype=torch.float32) * 0.1
	self.out = torch.zeros(B, T, C, device=self.device, dtype=torch.float32)

	def benchmark_large(self):
	self.out.zero_()
	self.kernel.forward(self.w, self.u, self.k, self.v, self.out)

	def verify_large(self) -> torch.Tensor:
	return rwkv_wkv_reference(self.w, self.u, self.k, self.v)