# llama2.mojo
from algorithm import sum
from algorithm import vectorize, parallelize
from builtin import string
from math import round
from memory import memset_zero, memcpy
from memory.buffer import Buffer
from memory.unsafe import DTypePointer
from python import Python
from random import rand
from read import BufReader, File
from runtime.llcl import num_cores, Runtime
from sys import argv
from tensor import Tensor, TensorShape, TensorSpec
# The SIMD vector width.
from sys.info import simdwidthof
import math
import os
import random
import time
var workers = 0
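# Number of float32 elements processed per vectorized step: twice the hardware
# SIMD width (e.g. 16 on a machine whose native float32 width is 8 lanes).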
alias nelts = (2 * simdwidthof[DType.float32]())
alias PointerString = Pointer[UInt8]
alias BufferPtrType = DTypePointer[DType.uint8]
alias BufferPtrFloat32 = DTypePointer[DType.float32]
alias PointerStrings = Pointer[PointerString]
alias TensorF32 = Tensor[DType.float32]
struct TensorSlice:
    # A view into a Tensor, representing a slice along its first one or two dimensions.
    # Mirrors Tensor's accessors but does not own the underlying data.
var _data: BufferPtrFloat32
var _shape: TensorShape
fn __init__(inout self, t: TensorF32, layer: Int) raises:
let elements_per_layer = t.num_elements() // t.dim(0)
self._data = t.data().offset(layer * elements_per_layer)
if t.rank() == 2:
self._shape = TensorShape(t.dim(1))
elif t.rank() == 3:
self._shape = TensorShape(t.dim(1), t.dim(2))
else:
# Compiler complains if _shape not defined
self._shape = TensorShape(1)
raise Error("TensorSlice: rank greater than 3 not implemented.")
fn __init__(inout self, t: TensorF32, layer: Int, row: Int) raises:
let elements_per_layer = t.num_elements() // t.dim(0)
let elements_per_row = elements_per_layer // t.dim(1)
self._data = t.data().offset(
layer * elements_per_layer + row * elements_per_row
)
if t.rank() == 3:
self._shape = TensorShape(t.dim(2))
elif t.rank() == 1:
# Compiler complains if _shape not defined
self._shape = TensorShape(1)
raise Error(
"Trying to slice a 1D Tensor by layer and row. This requires a 3D"
" Tensor."
)
else:
# Compiler complains if _shape not defined
self._shape = TensorShape(1)
raise Error("TensorSlice: rank greater than 3 not implemented.")
fn data(self) -> BufferPtrFloat32:
return self._data
fn shape(self) -> TensorShape:
return self._shape
fn num_elements(self) -> Int:
return self._shape.num_elements()
fn dim(self, idx: Int) -> Int:
return self._shape[idx]
fn rank(self) -> Int:
return self._shape.rank()
fn simd_load[nelts: Int](self, idx: Int) -> SIMD[DType.float32, nelts]:
return self._data.simd_load[nelts](idx)
fn simd_load[nelts: Int](self, *indices: Int) -> SIMD[DType.float32, nelts]:
if len(VariadicList(indices)) > 2:
print(
"Warning: TensorSlice only supports 1D and 2D indexing. Results are"
" unlikely to be correct."
)
return self.simd_load[nelts](indices[0] * self._shape[1] + indices[1])
fn simd_load[
nelts: Int
](self, indices: StaticIntTuple[2]) -> SIMD[DType.float32, nelts]:
return self._data.simd_load[nelts](indices[0] * self._shape[1] + indices[1])
fn __getitem__(self, idx: Int) -> SIMD[DType.float32, 1]:
return self._data.simd_load[1](idx)
fn simd_store[nelts: Int](self, idx: Int, val: SIMD[DType.float32, nelts]):
return self._data.simd_store[nelts](idx, val)
fn __setitem__(self, idx: Int, val: SIMD[DType.float32, 1]):
return self.simd_store[1](idx, val)
fn read_val_int(inout buf: FileBuf) raises -> Int:
let data = buf.data.offset(buf.get_offset()).bitcast[DType.int32]()
let result = data.load(0)
buf.move_offset(4)
return result.to_int()
fn read_val_float32(inout buf: FileBuf) raises -> Float32:
let val = buf.data.offset(buf.get_offset()).bitcast[DType.float32]().load(0)
buf.move_offset(4)
return val
fn read_val_str(inout buf: FileBuf, slen: Int) raises -> PointerString:
let str = PointerString.alloc(slen + 1)
for i in range(slen):
str.store(i, buf.data.load(buf.get_offset()))
buf.move_offset(1)
str.store(slen, 0)
return str
fn str_len(s: PointerString) -> Int:
var len = 0
while s[len] != 0:
len += 1
return len
# Not an optimal concat: allocates a fresh buffer and copies both strings.
fn str_concat(s1: PointerString, s2: PointerString) -> PointerString:
let l1 = str_len(s1)
let l2 = str_len(s2)
let str = PointerString.alloc(l1 + l2 + 1)
memcpy[UInt8](str, s1, l1)
memcpy[UInt8](str.offset(l1), s2, l2)
str.store(l1 + l2, 0)
return str
fn str_to_ptr(s: String) -> PointerString:
let ret = PointerString.alloc(len(s) + 1)
for i in range(len(s)):
ret.store(i, ord(s[i]))
ret.store(len(s), 0)
return ret
fn string_compare(a: PointerString, b: PointerString) -> Int:
var index = 0
while a[index] != 0 and b[index] != 0:
if a[index] < b[index]:
return -1
if a[index] > b[index]:
return 1
index += 1
if a[index] != 0 and b[index] == 0:
return 1
if a[index] == 0 and b[index] != 0:
return -1
return 0
# Quicksort helper function to find the partition position
fn partition(
inout array: PointerStrings, inout indices: DynamicVector[Int], low: Int, high: Int
) -> Int:
let pivot = array[high]
var ii = low - 1
for jj in range(low, high):
if string_compare(pivot, array[jj]) == 1:
# If element smaller than pivot, swap
ii = ii + 1
let tmp = array[ii]
let tmp_idx = indices[ii]
array.store(ii, array[jj])
indices[ii] = indices[jj]
array.store(jj, tmp)
indices[jj] = tmp_idx
# Swap the pivot element
let tmp = array[ii + 1]
let tmp_idx = indices[ii + 1]
array.store(ii + 1, array[high])
indices[ii + 1] = indices[high]
array.store(high, tmp)
indices[high] = tmp_idx
return ii + 1
fn quicksort(
inout array: PointerStrings, inout indices: DynamicVector[Int], low: Int, high: Int
):
if low < high:
let pi = partition(array, indices, low, high)
quicksort(array, indices, low, pi - 1)
quicksort(array, indices, pi + 1, high)
struct FileBuf:
var data: BufferPtrType
var offset: Int
var size: Int
fn __init__(inout self):
self.data = BufferPtrType()
self.offset = 0
self.size = 0
fn __del__(owned self):
self.data.free()
fn move_offset(inout self, size: Int) raises:
let new_offset = self.offset + size
if new_offset > self.size:
raise Error("Resulting offset will be past the end of the FileBuf")
if new_offset < 0:
raise Error("Resulting offset will be before the beginning of the FileBuf")
self.offset = new_offset
fn bitcast_offset_f32(inout self, size: Int) raises -> BufferPtrFloat32:
let ret = self.data.offset(self.offset).bitcast[DType.float32]()
self.move_offset(size * sizeof[DType.float32]())
return ret
fn get_offset(self) raises -> Int:
if self.offset > self.size:
raise Error("Offset is past the end of the FileBuf")
if self.offset < 0:
raise Error("Offset is before the beginning of the FileBuf")
return self.offset
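# Map tokens typed as escape sequences or bare quotes to the <0xXX> byte-token
# spellings used in the vocab, e.g. a literal "\n" becomes "<0x0A>".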
fn wrap(token: PointerString) -> PointerString:
if string_compare(token, str_to_ptr("\\n")) == 0:
return str_to_ptr("<0x0A>")
if string_compare(token, str_to_ptr("\\t")) == 0:
return str_to_ptr("<0x09>")
if string_compare(token, str_to_ptr("'")) == 0:
return str_to_ptr("<0x27>")
elif string_compare(token, str_to_ptr('"')) == 0:
return str_to_ptr("<0x22>")
return token
struct Tokenizer:
var vocab: PointerStrings
var vocab_scores: BufferPtrFloat32
var max_token_length: Int
var vocab_size: Int
var sorted_vocab: PointerStrings
var sorted_indices: DynamicVector[Int]
fn __init__(inout self, vocab_size: Int, inout buf: FileBuf) raises -> None:
self.vocab_size = vocab_size
self.max_token_length = read_val_int(buf)
self.vocab_scores = BufferPtrFloat32.alloc(self.vocab_size)
self.vocab = PointerStrings.alloc(self.vocab_size)
        # The sorted vocab is built lazily on the first call to find()
self.sorted_vocab = PointerStrings.alloc(0)
self.sorted_indices = DynamicVector[Int](0)
# read vocab_scores & vocab values (tokens)
for i in range(0, self.vocab_size):
self.vocab_scores.store(i, read_val_float32(buf))
let slen = read_val_int(buf)
self.vocab.store(i, read_val_str(buf, slen))
return None
# sort vocab by string_compare
fn sort(inout self) -> None:
if len(self.sorted_indices) < self.vocab_size:
self.sorted_indices = DynamicVector[Int](self.vocab_size)
self.sorted_vocab = PointerStrings.alloc(self.vocab_size)
for ii in range(self.vocab_size):
self.sorted_vocab.store(ii, self.vocab[ii])
self.sorted_indices.push_back(ii)
let n = self.vocab_size
quicksort(self.sorted_vocab, self.sorted_indices, 0, n - 1)
return None
# Binary search that returns -1 if string is not found
fn find(inout self, token_o: PointerString) -> Int:
let token = wrap(token_o)
let n = self.vocab_size
if len(self.sorted_indices) < n:
self.sort()
var left = 0
var right = n - 1
while left <= right:
let mid = left + (right - left) // 2
let comparison = string_compare(self.sorted_vocab[mid], token)
if comparison == 0:
return self.sorted_indices[mid]
if comparison < 0:
left = mid + 1
else:
right = mid - 1
return -1
struct Config:
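    # Transformer hyperparameters. The first seven fields are read from the
    # checkpoint header by config_init; kv_dim, kv_mul and head_size are derived.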
var dim: Int
var kv_dim: Int
var hidden_dim: Int
var n_layers: Int
var n_heads: Int
var n_kv_heads: Int
var kv_mul: Int
var vocab_size: Int
var seq_len: Int
var head_size: Int
fn __init__(inout self):
self.dim = 0
self.hidden_dim = 0
self.n_layers = 0
self.n_heads = 0
self.n_kv_heads = 0
self.vocab_size = 0
self.seq_len = 0
self.kv_dim = 0
self.kv_mul = 0
self.head_size = 0
struct RunState:
var x: TensorF32 # activation at current time stamp (dim,)
var xb: TensorF32 # same, but inside a residual branch (dim,)
var xb2: TensorF32 # an additional buffer just for convenience (dim,)
var hb: TensorF32 # buffer for hidden dimension in the ffn (hidden_dim,)
var hb2: TensorF32 # buffer for hidden dimension in the ffn (hidden_dim,)
var q: TensorF32 # query (dim,)
var k: TensorSlice # key (kv_dim,)
var v: TensorSlice # value (kv_dim,)
var att: TensorF32 # buffer for scores/attention values (n_heads, seq_len)
var logits: TensorF32 # output logits
var key_cache: TensorF32 # (layer, seq_len, dim)
var value_cache: TensorF32 # (layer, seq_len, dim)
fn __init__(inout self, config: Config) raises:
self.x = TensorF32(config.dim)
self.xb = TensorF32(config.dim)
self.xb2 = TensorF32(config.dim)
self.hb = TensorF32(config.hidden_dim)
self.hb2 = TensorF32(config.hidden_dim)
self.q = TensorF32(config.dim)
self.att = TensorF32(config.n_heads, config.seq_len)
self.logits = TensorF32(config.vocab_size)
self.key_cache = TensorF32(config.n_layers, config.seq_len, config.kv_dim)
self.value_cache = TensorF32(config.n_layers, config.seq_len, config.kv_dim)
        # k and v are slices that share memory with the caches, so writes to them
        # land directly in key_cache/value_cache. Initialize with placeholders;
        # the forward pass re-points them at the current layer and position.
self.k = TensorSlice(TensorF32(TensorShape(1, config.kv_dim)), 1)
self.v = TensorSlice(TensorF32(TensorShape(1, config.kv_dim)), 1)
struct TransformerWeights:
var token_embedding_table: TensorF32
var freq_cis_real: TensorF32
var freq_cis_imag: TensorF32
var rms_att_weight: TensorF32
var wq: TensorF32
var wk: TensorF32
var wv: TensorF32
var wo: TensorF32
var rms_ffn_weight: TensorF32
var w1: TensorF32
var w3: TensorF32
var w2: TensorF32
var rms_final_weight: TensorF32
var wcls: TensorF32
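    # Weights are read sequentially from the checkpoint in the order loaded
    # below (the llama2.c export layout). When shared_weights is set, wcls
    # aliases the token embedding table instead of being read from the file.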
fn __init__(
inout self, config: Config, shared_weights: Int, inout buf: FileBuf
) raises:
fn load_weights(inout buf: FileBuf, *dims: Int) raises -> TensorF32:
# Ensure returned Tensor doesn't share a pointer with FileBuf
let shape = TensorShape(dims)
let result_data = BufferPtrFloat32.alloc(shape.num_elements())
memcpy(
result_data,
buf.bitcast_offset_f32(shape.num_elements()),
shape.num_elements(),
)
return TensorF32(result_data, shape)
self.token_embedding_table = load_weights(buf, config.vocab_size, config.dim)
self.rms_att_weight = load_weights(buf, config.n_layers, config.dim)
self.wq = load_weights(buf, config.n_layers, config.dim, config.dim)
self.wk = load_weights(buf, config.n_layers, config.kv_dim, config.dim)
self.wv = load_weights(buf, config.n_layers, config.kv_dim, config.dim)
self.wo = load_weights(buf, config.n_layers, config.dim, config.dim)
self.rms_ffn_weight = load_weights(buf, config.n_layers, config.dim)
self.w1 = load_weights(buf, config.n_layers, config.hidden_dim, config.dim)
self.w2 = load_weights(buf, config.n_layers, config.dim, config.hidden_dim)
self.w3 = load_weights(buf, config.n_layers, config.hidden_dim, config.dim)
self.rms_final_weight = load_weights(buf, config.dim)
        # May need modifying for other models; head_size // 2 holds for the
        # stories and tinyllama-1.1 checkpoints.
self.freq_cis_real = load_weights(buf, config.seq_len, config.head_size // 2)
self.freq_cis_imag = load_weights(buf, config.seq_len, config.head_size // 2)
if shared_weights:
self.wcls = self.token_embedding_table
else:
self.wcls = load_weights(buf, config.vocab_size, config.dim)
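# Read an entire file into a FileBuf, streaming through a fixed-size
# 4096-byte reader window.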
fn read_file(file_name: String, inout buf: FileBuf) raises:
let _os = Python.import_module("os")
let ff_size = _os.path.getsize(file_name)
let cp_size = string.atol(ff_size.to_string())
let cp_buf: BufferPtrType = BufferPtrType.alloc(cp_size)
    # Window buffer for reading binary data from the file in 4096-byte chunks
let f = File(file_name)
var reader = BufReader[4096](f ^)
var bytes_read = 1
var offset = 0
while bytes_read > 0:
let buf = Buffer[4096, DType.uint8](cp_buf.offset(offset))
bytes_read = reader.read(buf)
offset += bytes_read
reader.do_nothing() # keeps lifetimes working
buf.data = cp_buf
buf.size = cp_size
buf.offset = 0
return None
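# config_init reads the checkpoint header, then derives the grouped-query
# attention quantities: each of the n_kv_heads key/value heads serves
# kv_mul = n_heads // n_kv_heads query heads, and the kv projections have
# kv_dim = n_kv_heads * head_size rows per layer.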
fn config_init(inout config: Config, inout buf: FileBuf) raises:
config.dim = read_val_int(buf)
config.hidden_dim = read_val_int(buf)
config.n_layers = read_val_int(buf)
config.n_heads = read_val_int(buf)
config.n_kv_heads = read_val_int(buf)
config.vocab_size = read_val_int(buf)
config.seq_len = read_val_int(buf)
config.head_size = config.dim // config.n_heads
config.kv_dim = (config.n_kv_heads * config.dim) // config.n_heads
config.kv_mul = config.n_heads // config.n_kv_heads
return None
@always_inline
fn accum(inout a: TensorF32, b: TensorF32) -> None:
let size = a.dim(0)
@parameter
fn _acc[_nelts: Int](j: Int):
a.simd_store[_nelts](j, a.simd_load[_nelts](j) + b.simd_load[_nelts](j))
vectorize[nelts, _acc](size)
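# RMSNorm: o[i] = weight[i] * x[i] / sqrt(mean(x^2) + 1e-5). The sum of
# squares is accumulated SIMD-wide; the tail (fewer than nelts elements)
# is reduced into lane 0.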
@always_inline
fn rmsnorm(
inout o: BufferPtrFloat32, x: BufferPtrFloat32, weight: BufferPtrFloat32, size: Int
) -> None:
# Calculate sum of squares
var tmp = SIMD[DType.float32, nelts](0)
@parameter
fn _sum2[_nelts: Int](j: Int):
if _nelts < nelts:
tmp[0] += (x.offset(j).simd_load[_nelts](0) ** 2).reduce_add()
else:
tmp += x.offset(j).simd_load[nelts](0) ** 2
vectorize[nelts, _sum2](size)
var ss: Float32 = tmp.reduce_add()
ss = ss / size + 1e-5
ss = 1.0 / math.sqrt(ss)
# Normalize and scale
@parameter
fn _norm[_nelts: Int](j: Int):
let val = weight.simd_load[_nelts](j) * ss * x.simd_load[_nelts](j)
o.offset(j).simd_store[_nelts](0, val)
vectorize[nelts, _norm](size)
@always_inline
fn rmsnorm(inout o: TensorF32, x: TensorF32, weight: TensorF32):
rmsnorm(o._ptr, x.data(), weight.data(), weight.dim(weight.rank() - 1))
@always_inline
fn rmsnorm(inout o: TensorF32, x: TensorF32, weight: TensorSlice):
rmsnorm(o._ptr, x.data(), weight.data(), weight.dim(weight.rank() - 1))
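# Numerically stable softmax over x[start:end], computed in place:
# x[i] = exp(x[i] - max(x)) / sum_j exp(x[j] - max(x)).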
@always_inline
fn softmax(inout x: TensorF32) -> None:
softmax(x, 0, x.dim(0))
@always_inline
fn softmax(inout x: TensorF32, start: Int, end: Int):
var max_val: Float32 = -1e9
@parameter
fn _max[_nelts: Int](ii: Int):
let val = x.simd_load[_nelts](start + ii).reduce_max()
if val > max_val:
max_val = val
vectorize[nelts, _max](end - start)
var ssum: Float32 = 0.0
@parameter
fn _exp[_nelts: Int](ii: Int):
x.simd_store[_nelts](
start + ii, math.exp(x.simd_load[_nelts](start + ii) - max_val)
)
ssum += x.simd_load[_nelts](start + ii).reduce_add()
vectorize[nelts, _exp](end - start)
@parameter
fn _norm[_nelts: Int](ii: Int):
x.simd_store[_nelts](start + ii, x.simd_load[_nelts](start + ii) / ssum)
vectorize[nelts, _norm](end - start)
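# Matrix-vector product C = B @ A with B stored row-major as (rows, cols).
# Each output C[i] = dot(B[i, :], A) is an independent dot product, vectorized
# over nelts lanes and parallelized across `workers` threads, one row per task.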
@always_inline
fn matmul_parallelized(
    C: BufferPtrFloat32,
    A: BufferPtrFloat32,
    B: BufferPtrFloat32,
    rows: Int,
    cols: Int,
):
@parameter
fn compute_row(i: Int):
var tmp = SIMD[DType.float32, nelts](0)
@parameter
fn dot[_nelts: Int](j: Int):
if _nelts < nelts: # take care of tail array elements with length < nelts
tmp[0] += (
A.simd_load[_nelts](j) * B.simd_load[_nelts](i * cols + j)
).reduce_add()
else:
tmp += A.simd_load[nelts](j) * B.simd_load[nelts](i * cols + j)
vectorize[nelts, dot](cols)
C.store(i, tmp.reduce_add())
parallelize[compute_row](rows, workers)
@always_inline
fn matmul(C: TensorF32, A: TensorF32, B: TensorF32) raises:
# B (d,n) @ A (n,) -> C (d,)
matmul_dimension_checks(A.shape(), B.shape())
matmul_parallelized(C.data(), A.data(), B.data(), B.dim(0), B.dim(1))
@always_inline
fn matmul(C: TensorF32, A: TensorF32, B: TensorSlice) raises:
# B (d,n) @ A (n,) -> C (d,)
matmul_dimension_checks(A.shape(), B.shape())
matmul_parallelized(C.data(), A.data(), B.data(), B.dim(0), B.dim(1))
@always_inline
fn matmul(C: TensorSlice, A: TensorF32, B: TensorSlice) raises:
# B (d,n) @ A (n,) -> C (d,)
matmul_dimension_checks(A.shape(), B.shape())
matmul_parallelized(C.data(), A.data(), B.data(), B.dim(0), B.dim(1))
fn matmul_dimension_checks(a: TensorShape, b: TensorShape) raises:
if a[0] != b[1]:
raise Error(
"matmul dimension mismatch. A rows (dim 0) not equal to B columns (dim 1)"
)
if b.rank() != 2:
raise Error("matmul expects B to be a 2D matrix")
# Apply RoPE rotation to the q and k vectors for each head,
# rotating consecutive (even, odd) pairs of dimensions.
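# Each (even, odd) pair is treated as a complex number and rotated by a
# position-dependent angle: with fcr = cos(theta) and fci = sin(theta),
#   (q0, q1) -> (q0 * fcr - q1 * fci, q0 * fci + q1 * fcr)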
@always_inline
fn rope_rotation_llama(
inout state: RunState,
freq_cis_real_row: TensorSlice,
freq_cis_imag_row: TensorSlice,
config: Config,
) -> None:
    # RoPE variant used by the stories and llama2 checkpoints
let head_size = config.head_size
@parameter
    fn head_loop(i: Int):
        # Simple vectorization with (head_size // 2) steps gave junk transformer
        # output, possibly because the nelts ranges end up overlapping between steps.
for j in range(0, config.head_size, 2):
let fcr = freq_cis_real_row[j // 2]
let fci = freq_cis_imag_row[j // 2]
let q0 = state.q[i * head_size + j]
let q1 = state.q[i * head_size + j + 1]
state.q[i * head_size + j] = q0 * fcr - q1 * fci
state.q[i * head_size + j + 1] = q0 * fci + q1 * fcr
if i < config.n_kv_heads:
let k0 = state.k[i * head_size + j]
let k1 = state.k[i * head_size + j + 1]
state.k[i * head_size + j] = k0 * fcr - k1 * fci
state.k[i * head_size + j + 1] = k0 * fci + k1 * fcr
parallelize[head_loop](config.n_heads, workers)
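# One forward pass for a single token at position `pos`: embedding lookup,
# then per layer: attention rmsnorm, QKV projections, RoPE, multihead
# attention against the kv cache, output projection plus residual, and a
# SwiGLU FFN (w1/w3 with SiLU gating) plus residual; finally a rmsnorm and
# the classifier matmul into logits.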
@always_inline
fn transformer(
token: Int,
pos: Int,
config: Config,
inout state: RunState,
weights: TransformerWeights,
) raises -> None:
# A few convenience variables
let dim = config.dim
let hidden_dim = config.hidden_dim
let head_size = config.head_size
let kv_dim = config.kv_dim
let kv_mul = config.kv_mul
# Copy the token embedding into x
let content_row = weights.token_embedding_table.data().offset(token * dim)
memcpy[DType.float32](state.x.data(), content_row, dim)
# Pluck out the "pos" row of freq_cis_real and freq_cis_imag
let freq_cis_real_row = TensorSlice(weights.freq_cis_real, pos)
let freq_cis_imag_row = TensorSlice(weights.freq_cis_imag, pos)
# Forward all the layers
for l in range(config.n_layers):
# Attention rmsnorm
rmsnorm(state.xb, state.x, TensorSlice(weights.rms_att_weight, l))
# QKV matmuls for this position
matmul(state.q, state.xb, TensorSlice(weights.wq, l))
let loff = l * config.seq_len * config.kv_dim
state.k = TensorSlice(state.key_cache, l, pos)
matmul(state.k, state.xb, TensorSlice(weights.wk, l))
state.v = TensorSlice(state.value_cache, l, pos)
matmul(state.v, state.xb, TensorSlice(weights.wv, l))
# Apply RoPE rotation to the q and k vectors for each head
rope_rotation_llama(state, freq_cis_real_row, freq_cis_imag_row, config)
memset_zero(state.xb.data(), state.xb.num_elements())
# Multihead attention. Iterate over all heads in parallel.
@parameter
    fn loop_over_heads(h: Int):
# Get the query vector for this head
let q_offset = h * head_size
# Index of attention scores for this head
let att_offset = h * config.seq_len
# Iterate over all timesteps, including the current one
for t in range(pos + 1):
# Starting index of the key vector for this head and at this timestep
let k_offset = loff + t * kv_dim + (h // kv_mul) * head_size
# Calculate the attention score as the dot product of q and k
var score: Float32 = 0.0
@parameter
fn score_fn[_nelts: Int](i: Int):
score += (
state.q.simd_load[_nelts](q_offset + i)
* state.key_cache.simd_load[_nelts](k_offset + i)
).reduce_add()
vectorize[nelts, score_fn](head_size)
score /= math.sqrt[DType.float32, 1](head_size)
# Save the score to the attention buffer
state.att[att_offset + t] = score
# Softmax the scores to get attention weights, from 0..pos inclusively
softmax(state.att, att_offset, att_offset + pos + 1)
# Weighted sum of the values, store back into xb
let xb_offset = h * head_size
for t in range(pos + 1):
# Starting index of the value vector for this head and at this timestep
let v_offset = loff + t * kv_dim + (h // kv_mul) * head_size
# Get the attention weight for this timestep
let a = state.att[att_offset + t]
# Accumulate the weighted value into xb
@parameter
fn xb_accumulate[_nelts: Int](i: Int):
let xbi = state.xb.simd_load[_nelts](
xb_offset + i
) + a * state.value_cache.simd_load[_nelts](v_offset + i)
state.xb.simd_store[_nelts](xb_offset + i, xbi)
vectorize[nelts, xb_accumulate](head_size)
parallelize[loop_over_heads](config.n_heads, workers)
# Final matrix multiplication to get the output of the attention
matmul(state.xb2, state.xb, TensorSlice(weights.wo, l))
# Residual connection back into x
accum(state.x, state.xb2)
# FFN rmsnorm
rmsnorm(state.xb, state.x, TensorSlice(weights.rms_ffn_weight, l))
# Calculate self.w1(x) and self.w3(x) for FFN
matmul(state.hb, state.xb, TensorSlice(weights.w1, l))
matmul(state.hb2, state.xb, TensorSlice(weights.w3, l))
@parameter
fn silu[_nelts: Int](i: Int):
let initial_hb = state.hb.simd_load[_nelts](i)
# Apply SiLU activation function (silu(x) = x * sigmoid(x))
let hbi = initial_hb * (1.0 / (1.0 + math.exp(-initial_hb)))
# Elementwise multiply with w3(x)
state.hb.simd_store[_nelts](i, hbi * state.hb2.simd_load[_nelts](i))
vectorize[nelts, silu](hidden_dim)
# Final matrix multiplication to get the output of the FFN
matmul(state.xb, state.hb, TensorSlice(weights.w2, l))
# Residual connection
accum(state.x, state.xb)
# Final rmsnorm
rmsnorm(state.x, state.x, weights.rms_final_weight)
# Classifier into logits
matmul(state.logits, state.x, weights.wcls)
fn argmax(v: TensorF32) -> Int:
# return argmax of v
var max_i: Int = 0
var max_p: Float32 = v[0]
for i in range(v.dim(0)):
if v[i] > max_p:
max_i = i
max_p = v[i]
return max_i
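# Inverse-CDF sampling from a categorical distribution: draw u ~ U[0, 1) and
# return the first index whose cumulative probability exceeds u.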
fn sample(probabilities: TensorF32) -> Int:
let n = probabilities.dim(0)
    # Sample an index from the probability distribution (entries must sum to 1)
    # by drawing a uniform random value in [0, 1)
let r = DTypePointer[DType.float32].alloc(1)
rand[DType.float32](r, 1)
var cdf: Float32 = 0.0
for i in range(n):
cdf += probabilities[i]
if r.load(0) < cdf:
return i
return n - 1 # In case of rounding errors
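# Greedy BPE: start from single-character tokens, then repeatedly merge the
# adjacent pair whose concatenation exists in the vocab with the highest
# score, until no mergeable pair remains.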
fn bpe_encode(inout tokens: DynamicVector[Int], text: String, inout tok: Tokenizer):
for pos in range(len(text)):
let char = str_to_ptr(text[pos])
let id = tok.find(char)
if id == -1:
print("Not a good prompt token at pos ", pos)
return
tokens.push_back(id)
while True:
var best_score = Float32(-1e10)
var best_id = -1
var best_idx = -1
for i in range(len(tokens) - 1):
# Check if we can merge the pair (tokens[i], tokens[i+1])
let str = str_concat(tok.vocab[tokens[i]], tok.vocab[tokens[i + 1]])
let id = tok.find(str)
if id != -1 and tok.vocab_scores.load(id) > best_score:
best_score = tok.vocab_scores.load(id)
best_id = id
best_idx = i
if best_idx == -1:
# We couldn't find any more pairs to merge, so we're done
break
# Merge the consecutive pair (best_idx, best_idx+1) into new token best_id
tokens[best_idx] = best_id
# Delete token at position best_idx+1, shift the entire sequence back 1
var _tokens = DynamicVector[Int]()
for i in range(0, best_idx + 1):
_tokens.push_back(tokens[i])
for i in range(best_idx + 2, len(tokens)):
_tokens.push_back(tokens[i])
tokens = _tokens
fn str2num(d: Int) -> Int:
    # Convert an uppercase hex digit character code to its numeric value
if d >= ord("A"):
return d - ord("A") + 10
return d - ord("0")
fn print_str(s: PointerString):
    # Tokens of the form <0xXX> encode a raw byte: decode and print that byte
if (s[1].to_int() == ord("0")) and (s[2].to_int() == ord("x")):
let d1: Int = s[3].to_int()
let d2: Int = s[4].to_int()
print_no_newline(chr(str2num(d1) * 16 + str2num(d2)))
return
# print all chars till null character
var p: Int = 0
while s[p].to_int() != 0:
print_no_newline(chr(s[p].to_int()))
p += 1
fn time_in_ms() -> Int:
# Returns time in milliseconds for benchmarking the model speed
return time.now() // 1_000_000
fn print_usage():
print("Usage: mojo llama2.mojo <checkpoint> [options]")
print(
'Example: mojo llama2.mojo stories15M.bin -s 99 -n 256 -t 0.5 -i "Llama is an'
' animal"'
)
print("Options:")
print(" -s <int> random seed, default time.now()")
print(" -t <float> temperature in [0,1.0], default 1.0")
print(" -n <int> number of steps to run for, default 256. 0 = max_seq_len")
print(" -i <string> input prompt")
print(" -z tokenizer path")
print(" -j number of workers to use, default num_cores()")
fn main() raises:
workers = num_cores()
var tokenizer = StringRef("tokenizer.bin")
var checkpoint = StringRef("stories15M.bin")
var temperature = 0.9
var steps = 256
var prompt = String("")
var rng_seed: Int = time.now()
@parameter
fn argparse() raises -> Int:
let args = argv()
if len(args) < 2:
return 0
checkpoint = args[1]
for i in range(2, len(args), 2):
if args[i] == "-p":
print("Option not supported: ", args[i])
if args[i] == "-n":
steps = atol(args[i + 1])
if args[i] == "-z":
tokenizer = args[i + 1]
if args[i] == "-s":
rng_seed = atol(args[i + 1])
if args[i] == "-i":
prompt = args[i + 1]
if args[i] == "-j":
workers = atol(args[i + 1])
if args[i] == "-t":
let val = args[i + 1]
temperature = 0.0
                # Hacky float parse: keeps only one digit after the decimal point
for c in range(0, len(val)):
if val[c] == ".":
temperature += atol(val[c + 1]) * 0.1
break
else:
temperature = atol(val[c])
                if temperature < -1e-9 or temperature > (1 + 1e-9):  # [0, 1] with fp tolerance
print("Wrong temperature value", temperature)
return 0
return 1
let res = argparse()
if res == 0:
print_usage()
return
print("num parallel workers:", workers, " SIMD width:", nelts)
random.seed(rng_seed)
var fbuf: FileBuf = FileBuf()
var tbuf: FileBuf = FileBuf()
var config: Config = Config()
read_file(checkpoint, fbuf)
config_init(config, fbuf)
    # A negative vocab size is a hacky way of signaling unshared weights (a bit yikes)
let shared_weights = 1 if config.vocab_size > 0 else 0
config.vocab_size = (
-config.vocab_size if config.vocab_size < 0 else config.vocab_size
)
let weights: TransformerWeights = TransformerWeights(config, shared_weights, fbuf)
if steps <= 0 or steps > config.seq_len:
steps = config.seq_len
# Read in the tokenizer.bin file
read_file(tokenizer, tbuf)
var tok = Tokenizer(config.vocab_size, tbuf)
    # Print the checkpoint size, layer count, and vocab size
print("checkpoint size: ", fbuf.size, "[", fbuf.size // 1024 // 1024, "MB ]",
"| n layers:", config.n_layers, "| vocab size:", tok.vocab_size)
# Create and initialize the application RunState
var state = RunState(config)
# Process the prompt, if any
var prompt_tokens = DynamicVector[Int]()
if prompt:
bpe_encode(prompt_tokens, prompt, tok)
# Start the main loop
var start = 0 # Used to time our code, only initialized after the first iteration
var next_token = 0 # Will store the next token in the sequence
# Initialize with token 1 (=BOS), as done in Llama-2 sentencepiece tokenizer
var token = 1
# Position in the sequence
var pos = 0
while pos < steps:
# Forward the transformer to get logits for the next token
transformer(token, pos, config, state, weights)
if pos < len(prompt_tokens):
next_token = prompt_tokens[pos]
else:
# Sample the next token
if temperature == 0.0:
# Greedy argmax sampling: take the token with the highest probability
next_token = argmax(state.logits)
else:
# Apply the temperature to the logits
for q in range(config.vocab_size):
state.logits[q] = state.logits[q] / temperature
# Apply softmax to the logits to get the probabilities for the next token
softmax(state.logits)
# Sample from this distribution to get the next token
next_token = sample(state.logits)
        # Stop generating when the BOS (=1) or EOS (=2) token appears
if next_token == 1 or next_token == 2:
break
var token_str: PointerString = tok.vocab[next_token]
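        # Following BOS (token 1), the sentencepiece decoder strips a leading space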
if token == 1 and token_str[0] == ord(" "):
token_str = token_str.offset(1)
print_str(token_str)
# Advance forward
token = next_token
pos += 1
if start == 0:
start = time_in_ms()
let end = time_in_ms()
print("\nachieved tok/s: ", (pos - 1) / (end - start) * 1000)