# llama2.mojo
from algorithm import sum
from algorithm import vectorize, parallelize
from builtin import string
from math import round
from memory import memset_zero, memcpy
from memory.buffer import Buffer
from memory.unsafe import DTypePointer
from python import Python
from random import rand
from read import BufReader, File
from runtime.llcl import num_cores, Runtime
from sys import argv
# The SIMD vector width.
from sys.info import simdwidthof
import math
import os
import random
import time
alias nelts = (2 * simdwidthof[DType.float32]())
alias PointerString = Pointer[UInt8]
alias BufferPtrType = DTypePointer[DType.uint8]
alias BufferPtrFloat32 = DTypePointer[DType.float32]
alias PointerStrings = Pointer[PointerString]
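# Matrix is a thin view over a flat float32 buffer and is used for 1D vectors,
# 2D matrices and 3D tensors alike. Storage is row-major, so element (z, y, x)
# of a (layers, rows, cols) tensor lives at offset z*rows*cols + y*cols + x.
# A minimal usage sketch (illustrative only, not called anywhere in this file):
#   var m = Matrix(2, 3)   # 2 rows x 3 cols
#   m.alloc_zero()
#   m[1, 2] = 42.0         # stored at flat offset 1*3 + 2 = 5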
struct Matrix:
var data: BufferPtrFloat32
var rows: Int
var cols: Int
var layers: Int
var allocated: Int
fn __init__(inout self, rows: Int, cols: Int):
self.data = BufferPtrFloat32.alloc(0)
self.rows = rows
self.cols = cols
self.layers = 1
self.allocated = 0
fn __init__(inout self, cols: Int):
self.data = BufferPtrFloat32.alloc(0)
self.rows = 1
self.layers = 1
self.cols = cols
self.allocated = 0
fn __init__(inout self, layers: Int, rows: Int, cols: Int):
self.__init__(rows, cols)
self.layers = layers
fn __del__(owned self):
if self.allocated == 1:
self.data.free()
@always_inline
fn alloc(inout self, fill: Int = 0):
self.data = BufferPtrFloat32.alloc(self.size())
self.allocated = 1
if fill == 1:
self.zero()
@always_inline
fn alloc_zero(inout self):
self.alloc(1)
@always_inline
fn zero(inout self):
memset_zero(self.data, self.size())
@always_inline
fn set_buf_ptr(inout self, ptr: BufferPtrFloat32):
self.data = ptr
    # set the buffer pointer and redefine rows, cols
fn set_buf_ptr(inout self, ptr: BufferPtrFloat32, rows: Int, cols: Int):
self.data = ptr
self.rows = rows
self.cols = cols
@always_inline
fn size(inout self) -> Int:
return self.cols * self.rows * self.layers
@always_inline
fn __getitem__(self, y: Int, x: Int) -> Float32:
return self.load[1](y, x)
@always_inline
fn __getitem__(self, x: Int) -> Float32:
return self.load[1](0, x)
@always_inline
fn load[nelts: Int](self, y: Int, x: Int) -> SIMD[DType.float32, nelts]:
return self.data.simd_load[nelts](y * self.cols + x)
@always_inline
fn __setitem__(self, y: Int, x: Int, val: Float32):
return self.store[1](y, x, val)
@always_inline
fn __setitem__(self, x: Int, val: Float32):
return self.store[1](0, x, val)
@always_inline
fn store[nelts: Int](self, y: Int, x: Int, val: SIMD[DType.float32, nelts]):
self.data.simd_store[nelts](y * self.cols + x, val)
@always_inline
fn load[nelts: Int](self, x: Int) -> SIMD[DType.float32, nelts]:
return self.data.simd_load[nelts](x)
@always_inline
fn store[nelts: Int](self, x: Int, val: SIMD[DType.float32, nelts]):
self.data.simd_store[nelts](x, val)
@always_inline
fn __getitem__(self, z: Int, y: Int, x: Int) -> Float32:
return self.load[1](z, y, x)
@always_inline
fn load[nelts: Int](self, z: Int, y: Int, x: Int) -> SIMD[DType.float32, nelts]:
        return self.data.simd_load[nelts](z * self.rows * self.cols + y * self.cols + x)
@always_inline
fn __setitem__(self, z: Int, y: Int, x: Int, val: Float32):
return self.store[1](z, y, x, val)
@always_inline
fn store[nelts: Int](self, z: Int, y: Int, x: Int, val: SIMD[DType.float32, nelts]):
        self.data.simd_store[nelts](z * self.rows * self.cols + y * self.cols + x, val)
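# The helpers below pull raw values out of a FileBuf and advance its offset:
# 4-byte ints and floats are read via a bitcast (so the checkpoint is assumed
# to be in the host byte order), and token strings are read as length-prefixed
# byte sequences copied into freshly allocated, null-terminated PointerStrings.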
fn read_val_int(inout buf: FileBuf) -> Int:
    # bitcast the next 4 bytes at the current offset to a 32-bit int
let data = buf.data.offset(buf.offset).bitcast[DType.uint32]()
let result = data.simd_load[1](0)
buf.offset += 4
return result.to_int()
fn read_val_float32(inout buf: FileBuf) -> Float32:
    # bitcast the next 4 bytes at the current offset to a float32
let val = buf.data.offset(buf.offset).bitcast[DType.float32]().simd_load[1](0)
buf.offset += 4
return val
fn read_val_str(inout buf: FileBuf, slen: Int) -> PointerString:
let str = PointerString.alloc(slen + 1)
for i in range(slen):
str.store(i, buf.data.simd_load[1](buf.offset))
buf.offset += 1
str.store(slen, 0)
return str
# simple (not optimal) concatenation of two null-terminated strings
fn str_concat(s1: PointerString, s2: PointerString) -> PointerString:
var l1 = 0
var l2 = 0
while s1[l1] != 0:
l1 += 1
while s2[l2] != 0:
l2 += 1
    let str = PointerString.alloc(l1 + l2 + 1)  # +1 for the null terminator stored below
memcpy[UInt8](str, s1, l1)
memcpy[UInt8](str.offset(l1), s2, l2)
str.store(l1 + l2, 0)
return str
fn str_to_ptr(s: String) -> PointerString:
let ret = PointerString.alloc(len(s) + 1)
for i in range(len(s)):
ret.store(i, ord(s[i]))
ret.store(len(s), 0)
return ret
struct FileBuf:
var data: BufferPtrType
var offset: Int
var size: Int
fn __init__(inout self):
self.data = BufferPtrType()
self.offset = 0
self.size = 0
fn move_offset(inout self, size: Int):
self.offset += size
fn bitcast_offset_float32(inout self, size: Int) -> BufferPtrFloat32:
let ret = self.data.offset(self.offset).bitcast[DType.float32]()
self.offset += size * sizeof[DType.float32]()
return ret
struct Tokenizer:
var vocab: PointerStrings
var vocab_scores: BufferPtrFloat32
var max_token_length: Int
var vocab_size: Int
fn __init__(inout self, vocab_size: Int):
self.vocab_size = vocab_size
self.vocab = PointerStrings.alloc(vocab_size)
self.vocab_scores = BufferPtrFloat32.alloc(vocab_size)
self.max_token_length = 0
struct Config:
var dim: Int
var hidden_dim: Int
var n_layers: Int
var n_heads: Int
var n_kv_heads: Int
var vocab_size: Int
var seq_len: Int
fn __init__(inout self):
self.dim = 0
self.hidden_dim = 0
self.n_layers = 0
self.n_heads = 0
self.n_kv_heads = 0
self.vocab_size = 0
self.seq_len = 0
struct RunState:
var x: Matrix # activation at current time stamp (dim,)
var xb: Matrix # same, but inside a residual branch (dim,)
var xb2: Matrix # an additional buffer just for convenience (dim,)
var hb: Matrix # buffer for hidden dimension in the ffn (hidden_dim,)
var hb2: Matrix # buffer for hidden dimension in the ffn (hidden_dim,)
var q: Matrix # query (dim,)
var k: Matrix # key (dim,)
var v: Matrix # value (dim,)
var att: Matrix # buffer for scores/attention values (n_heads, seq_len)
var logits: Matrix # output logits
var key_cache: Matrix # (layer, seq_len, dim)
var value_cache: Matrix # (layer, seq_len, dim)
var rt: Runtime
fn __init__(inout self, config: Config):
self.x = Matrix(config.dim)
self.x.alloc_zero()
self.xb = Matrix(config.dim)
self.xb.alloc_zero()
self.xb2 = Matrix(config.dim)
self.xb2.alloc_zero()
self.hb = Matrix(config.hidden_dim)
self.hb.alloc_zero()
self.hb2 = Matrix(config.hidden_dim)
self.hb2.alloc_zero()
self.q = Matrix(config.dim)
self.q.alloc_zero()
self.k = Matrix(config.dim)
self.k.alloc_zero()
self.v = Matrix(config.dim)
self.v.alloc_zero()
self.att = Matrix(config.n_heads, config.seq_len)
self.att.alloc_zero()
self.logits = Matrix(config.vocab_size)
self.logits.alloc_zero()
self.key_cache = Matrix(config.n_layers, config.seq_len, config.dim)
self.key_cache.alloc_zero()
self.value_cache = Matrix(config.n_layers, config.seq_len, config.dim)
self.value_cache.alloc_zero()
self.rt = Runtime(num_cores() // 2)
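# TransformerWeights never copies weight data: each Matrix is pointed straight
# at the next slice of the checkpoint buffer via bitcast_offset_float32, so the
# mapping order below must match the llama2.c export order (token embeddings,
# attention rmsnorm, wq/wk/wv/wo, ffn rmsnorm, w1/w2/w3, final rmsnorm, then
# the RoPE freq_cis tables). wcls reuses the token embedding table when the
# classifier weights are shared.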
struct TransformerWeights:
var token_embedding_table: Matrix
var freq_cis_real: Matrix
var freq_cis_imag: Matrix
var rms_att_weight: Matrix
var wq: Matrix
var wk: Matrix
var wv: Matrix
var wo: Matrix
var rms_ffn_weight: Matrix
var w1: Matrix
var w3: Matrix
var w2: Matrix
var rms_final_weight: Matrix
var wcls: Matrix
fn __init__(inout self, config: Config, shared_weights: Int, inout buf: FileBuf):
self.token_embedding_table = Matrix(config.vocab_size, config.dim)
# set buf ptr to buf data from file
self.token_embedding_table.set_buf_ptr(
buf.bitcast_offset_float32(self.token_embedding_table.size())
)
self.rms_att_weight = Matrix(config.n_layers, config.dim)
self.rms_att_weight.set_buf_ptr(
buf.bitcast_offset_float32(self.rms_att_weight.size())
)
self.wq = Matrix(config.n_layers, config.dim, config.dim)
self.wq.set_buf_ptr(buf.bitcast_offset_float32(self.wq.size()))
self.wk = Matrix(config.n_layers, config.dim, config.dim)
self.wk.set_buf_ptr(buf.bitcast_offset_float32(self.wk.size()))
self.wv = Matrix(config.n_layers, config.dim, config.dim)
self.wv.set_buf_ptr(buf.bitcast_offset_float32(self.wv.size()))
self.wo = Matrix(config.n_layers, config.dim, config.dim)
self.wo.set_buf_ptr(buf.bitcast_offset_float32(self.wo.size()))
self.rms_ffn_weight = Matrix(config.n_layers, config.dim)
self.rms_ffn_weight.set_buf_ptr(
buf.bitcast_offset_float32(self.rms_ffn_weight.size())
)
self.w1 = Matrix(config.n_layers, config.dim, config.hidden_dim)
self.w1.set_buf_ptr(buf.bitcast_offset_float32(self.w1.size()))
self.w2 = Matrix(config.n_layers, config.dim, config.hidden_dim)
self.w2.set_buf_ptr(buf.bitcast_offset_float32(self.w2.size()))
self.w3 = Matrix(config.n_layers, config.dim, config.hidden_dim)
self.w3.set_buf_ptr(buf.bitcast_offset_float32(self.w3.size()))
self.rms_final_weight = Matrix(config.dim)
self.rms_final_weight.set_buf_ptr(
buf.bitcast_offset_float32(self.rms_final_weight.size())
)
self.freq_cis_real = Matrix(config.seq_len, (config.dim // config.n_heads) // 2)
self.freq_cis_real.set_buf_ptr(
buf.bitcast_offset_float32(self.freq_cis_real.size())
)
self.freq_cis_imag = Matrix(config.seq_len, (config.dim // config.n_heads) // 2)
self.freq_cis_imag.set_buf_ptr(
buf.bitcast_offset_float32(self.freq_cis_imag.size())
)
self.wcls = Matrix(
config.vocab_size, config.dim
) # if shared_weights else rest_floats
self.wcls.set_buf_ptr(self.token_embedding_table.data)
fn read_file(file_name: String, inout buf: FileBuf) raises:
let _os = Python.import_module("os")
let ff_size = _os.path.getsize(file_name)
let cp_size = string.atol(ff_size.to_string())
let cp_buf: BufferPtrType = BufferPtrType.alloc(cp_size)
    # stream the file contents into cp_buf through a 4096-byte buffered reader
let f = File(file_name)
var reader = BufReader[4096](f ^)
var bytes_read = 1
var offset = 0
while bytes_read > 0:
let buf = Buffer[4096, DType.uint8](cp_buf.offset(offset))
bytes_read = reader.read(buf)
offset += bytes_read
reader.do_nothing() # keeps lifetimes working
buf.data = cp_buf
buf.size = cp_size
buf.offset = 0
return None
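# The checkpoint begins with a fixed header of seven consecutive 32-bit ints,
# read below in order: dim, hidden_dim, n_layers, n_heads, n_kv_heads,
# vocab_size, seq_len. The float32 weights follow immediately after.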
fn config_init(inout config: Config, inout buf: FileBuf) raises:
config.dim = read_val_int(buf)
config.hidden_dim = read_val_int(buf)
config.n_layers = read_val_int(buf)
config.n_heads = read_val_int(buf)
config.n_kv_heads = read_val_int(buf)
config.vocab_size = read_val_int(buf)
config.seq_len = read_val_int(buf)
return None
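# tokenizer.bin layout as consumed below: one int with the maximum token
# length, then for each of vocab_size entries a float32 score, an int byte
# length, and that many raw bytes of token text.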
fn tokenizer_init(inout tok: Tokenizer, inout buf: FileBuf) -> None:
tok.max_token_length = read_val_int(buf)
tok.vocab_scores = BufferPtrFloat32.alloc(tok.vocab_size)
tok.vocab = PointerStrings.alloc(tok.vocab_size)
# read vocab_scores & vocab values (tokens)
for i in range(0, tok.vocab_size):
tok.vocab_scores.simd_store[1](i, read_val_float32(buf))
let slen = read_val_int(buf)
tok.vocab.store(i, read_val_str(buf, slen))
return None
fn accum(inout a: BufferPtrFloat32, b: BufferPtrFloat32, size: Int) -> None:
for i in range(size):
let val = a.offset(i).simd_load[1](0) + b.offset(i).simd_load[1](0)
a.offset(i).simd_store[1](0, val)
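# RMSNorm as implemented below: o[j] = weight[j] * x[j] / sqrt(mean(x^2) + 1e-5).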
fn rmsnorm(
inout o: BufferPtrFloat32, x: BufferPtrFloat32, weight: BufferPtrFloat32, size: Int
) -> None:
# Calculate sum of squares
var ss: Float32 = 0.0
for i in range(size):
let xx = x.offset(i).simd_load[1](0) ** 2
ss += xx
ss = ss / size + 1e-5
ss = 1.0 / math.sqrt(ss)
# Normalize and scale
for j in range(size):
let val = weight.offset(j).simd_load[1](0) * (ss * x.offset(j).simd_load[1](0))
o.offset(j).simd_store[1](0, val)
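# Numerically stable softmax: the maximum is subtracted before exponentiating,
# so x[i] becomes exp(x[i] - max(x)) / sum_j exp(x[j] - max(x)).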
fn softmax(inout x: BufferPtrFloat32, size: Int) -> None:
# Find max value (for numerical stability)
var max_val: Float32 = x.offset(0).simd_load[1](0)
for i in range(size):
let xi = x.offset(i).simd_load[1](0)
if xi > max_val:
max_val = xi
# Exp and sum
var ssum: Float32 = 0.0
for i in range(size):
let xi = x.offset(i).simd_load[1](0)
x.offset(i).simd_store[1](0, math.exp(xi - max_val))
ssum += x.offset(i).simd_load[1](0)
# Normalize
for i in range(size):
let xi = x.offset(i).simd_load[1](0)
x.offset(i).simd_store[1](0, xi / ssum)
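# Matrix-vector product C[i] = dot(A, B[i, :]). Rows of B are distributed
# across cores with parallelize, and each dot product runs in nelts-wide SIMD
# chunks via vectorize; the _nelts < nelts branch handles the scalar tail when
# B.cols is not a multiple of nelts.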
fn matmul_parallelized(C: Matrix, A: Matrix, B: Matrix, rt: Runtime):
@parameter
fn compute_row(i: Int):
var tmp = SIMD[DType.float32, nelts](0)
@parameter
fn dot[_nelts: Int](j: Int):
if _nelts < nelts: # take care of tail array elements with length < nelts
tmp[0] += (A.load[_nelts](j) * B.load[_nelts](i, j)).reduce_add()
else:
tmp += A.load[nelts](j) * B.load[nelts](i, j)
vectorize[nelts, dot](B.cols)
C[i] = tmp.reduce_add()
parallelize[compute_row](rt, B.rows, rt.parallelism_level())
fn matmul(inout C: Matrix, A: Matrix, B: Matrix, rt: Runtime) -> None:
# B (d,n) @ A (n,) -> C (d,)
matmul_parallelized(C, A, B, rt)
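# One decoder step for a single token at position `pos`. Per layer:
#   1. rmsnorm(x) -> xb, then q/k/v = wq/wk/wv @ xb
#   2. RoPE: each (even, odd) pair of q and k is rotated as
#        q[2i]   = q0*cos - q1*sin
#        q[2i+1] = q0*sin + q1*cos
#      with cos/sin taken from freq_cis_real/freq_cis_imag at this position
#   3. k and v are written into the per-layer kv cache at row `pos`
#   4. causal attention over timesteps 0..pos: scores are q.k / sqrt(head_size),
#      softmaxed, then used to accumulate the cached values into xb
#   5. wo projection plus residual, followed by the SwiGLU feed-forward:
#        x += w2 @ (silu(w1 @ xb) * (w3 @ xb))
# A final rmsnorm and the wcls matmul then produce the vocabulary logits.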
fn transformer(
token: Int,
pos: Int,
config: Config,
inout state: RunState,
weights: TransformerWeights,
) -> None:
# A few convenience variables
var x = state.x.data
let dim = config.dim
let hidden_dim = config.hidden_dim
let head_size = dim // config.n_heads
# tmp matrix for matmul operations
var tmpw = Matrix(0, 0)
# Copy the token embedding into x
let content_row = weights.token_embedding_table.data.offset(token * dim)
memcpy[DType.float32](x, content_row, config.dim)
# Pluck out the "pos" row of freq_cis_real and freq_cis_imag
let freq_cis_real_row = weights.freq_cis_real.data.offset(pos * head_size // 2)
let freq_cis_imag_row = weights.freq_cis_imag.data.offset(pos * head_size // 2)
# Forward all the layers
for l in range(config.n_layers):
# Attention rmsnorm
rmsnorm(state.xb.data, x, weights.rms_att_weight.data.offset(l * dim), dim)
# QKV matmuls for this position
tmpw.set_buf_ptr(weights.wq.data.offset(l * dim * dim), dim, dim)
matmul(state.q, state.xb, tmpw, state.rt)
tmpw.set_buf_ptr(weights.wk.data.offset(l * dim * dim), dim, dim)
matmul(state.k, state.xb, tmpw, state.rt)
tmpw.set_buf_ptr(weights.wv.data.offset(l * dim * dim), dim, dim)
matmul(state.v, state.xb, tmpw, state.rt)
# Apply RoPE rotation to the q and k vectors for each head
for h in range(config.n_heads):
# Get the q and k vectors for this head
let q = state.q.data.offset(h * head_size)
let k = state.k.data.offset(h * head_size)
# Rotate q and k by the freq_cis_real and freq_cis_imag
for i in range(0, head_size, 2):
let q0 = q.offset(i).simd_load[1](0)
let q1 = q.offset(i + 1).simd_load[1](0)
let k0 = k.offset(i).simd_load[1](0)
let k1 = k.offset(i + 1).simd_load[1](0)
let fcr = freq_cis_real_row.offset(i // 2).simd_load[1](0)
let fci = freq_cis_imag_row.offset(i // 2).simd_load[1](0)
q.offset(i).simd_store[1](0, q0 * fcr - q1 * fci)
q.offset(i + 1).simd_store[1](0, q0 * fci + q1 * fcr)
k.offset(i).simd_store[1](0, k0 * fcr - k1 * fci)
k.offset(i + 1).simd_store[1](0, k0 * fci + k1 * fcr)
# Save key,value at this time step (pos) to our kv cache
let loff = l * config.seq_len * dim # kv cache layer offset for convenience
let key_cache_row = state.key_cache.data.offset(loff + pos * dim)
let value_cache_row = state.value_cache.data.offset(loff + pos * dim)
memcpy[DType.float32](key_cache_row, state.k.data, config.dim)
memcpy[DType.float32](value_cache_row, state.v.data, config.dim)
# Multihead attention. Iterate over all heads
for h in range(config.n_heads):
# Get the query vector for this head
let q = state.q.data.offset(h * head_size)
# Attention scores for this head
var att = state.att.data.offset(h * config.seq_len)
# Iterate over all timesteps, including the current one
for t in range(pos + 1):
# Get the key vector for this head and at this timestep
let k = state.key_cache.data.offset(loff + t * dim + h * head_size)
# Calculate the attention score as the dot product of q and k
var score: Float32 = 0.0
for i in range(head_size):
score += q.offset(i).simd_load[1](0) * k.offset(i).simd_load[1](0)
score /= math.sqrt[DType.float32, 1](head_size)
# Save the score to the attention buffer
att.offset(t).simd_store[1](0, score)
# Softmax the scores to get attention weights, from 0..pos inclusively
softmax(att, pos + 1)
# Weighted sum of the values, store back into xb
let xb = state.xb.data.offset(h * head_size)
memset_zero(xb, head_size)
for t in range(pos + 1):
# Get the value vector for this head and at this timestep
let v = state.value_cache.data.offset(loff + t * dim + h * head_size)
# Get the attention weight for this timestep
let a = att.offset(t).simd_load[1](0)
# Accumulate the weighted value into xb
for i in range(head_size):
let xbi = xb.offset(i).simd_load[1](0) + a * v.offset(i).simd_load[
1
](0)
xb.offset(i).simd_store[1](0, xbi)
# Final matrix multiplication to get the output of the attention
tmpw.set_buf_ptr(weights.wo.data.offset(l * dim * dim), dim, dim)
matmul(state.xb2, state.xb, tmpw, state.rt)
# Residual connection back into x
accum(x, state.xb2.data, dim)
# FFN rmsnorm
rmsnorm(state.xb.data, x, weights.rms_ffn_weight.data.offset(l * dim), dim)
# Calculate self.w1(x) and self.w3(x) for FFN
tmpw.set_buf_ptr(weights.w1.data.offset(l * dim * hidden_dim), hidden_dim, dim)
matmul(state.hb, state.xb, tmpw, state.rt)
tmpw.set_buf_ptr(weights.w3.data.offset(l * dim * hidden_dim), hidden_dim, dim)
matmul(state.hb2, state.xb, tmpw, state.rt)
# Apply SiLU activation function (silu(x) = x * sigmoid(x))
for i in range(hidden_dim):
let hbi = state.hb[i]
state.hb[i] = hbi * (1.0 / (1.0 + math.exp(-hbi)))
# Elementwise multiply with w3(x)
for i in range(hidden_dim):
state.hb[i] = state.hb[i] * state.hb2[i]
# Final matrix multiplication to get the output of the FFN
tmpw.set_buf_ptr(weights.w2.data.offset(l * dim * hidden_dim), dim, hidden_dim)
matmul(state.xb, state.hb, tmpw, state.rt)
# Residual connection
accum(x, state.xb.data, dim)
# Final rmsnorm
rmsnorm(x, x, weights.rms_final_weight.data, dim)
# Classifier into logits
tmpw.set_buf_ptr(weights.wcls.data, config.vocab_size, dim)
matmul(state.logits, state.x, tmpw, state.rt)
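# Next-token selection: with temperature 0 the caller takes the argmax of the
# logits (greedy decoding); otherwise the logits are divided by the temperature,
# softmaxed into probabilities, and sample() draws an index by walking the CDF
# with a single uniform random value.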
fn argmax(v: Matrix) -> Int:
# return argmax of v
var max_i: Int = 0
var max_p: Float32 = v[0]
for i in range(v.cols):
if v[i] > max_p:
max_i = i
max_p = v[i]
return max_i
fn sample(probabilities: Matrix) -> Int:
let n = probabilities.cols
# Sample index from probabilities, they must sum to 1
    # draw a single uniform random float32 in [0, 1)
let r = DTypePointer[DType.float32].alloc(1)
rand[DType.float32](r, 1)
var cdf: Float32 = 0.0
for i in range(n):
cdf += probabilities[i]
if r.simd_load[1](0) < cdf:
return i
return n - 1 # In case of rounding errors
fn str_lookup(str: PointerString, tok: Tokenizer) -> Int:
for pos in range(tok.vocab_size):
let s1 = tok.vocab[pos]
var p1 = 0
while s1[p1] != 0 and str[p1] != 0:
if s1[p1] != str[p1]:
break
p1 += 1
if s1[p1] != 0 or str[p1] != 0:
continue
return pos
return -1
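# Greedy BPE encoding: the prompt starts as one vocab id per character, then the
# loop repeatedly merges the adjacent pair whose concatenation exists in the
# vocab with the highest score, until no adjacent pair can be merged.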
fn bpe_encode(inout tokens: DynamicVector[Int], text: String, tok: Tokenizer):
for pos in range(len(text)):
let char = str_to_ptr(text[pos])
let id = str_lookup(char, tok)
if id == -1:
print("Not a good prompt token at pos ", pos)
return
tokens.push_back(id)
while True:
var best_score = Float32(-1e10)
var best_id = -1
var best_idx = -1
for i in range(len(tokens) - 1):
# Check if we can merge the pair (tokens[i], tokens[i+1])
let str = str_concat(tok.vocab[tokens[i]], tok.vocab[tokens[i + 1]])
let id = str_lookup(str, tok)
if id != -1 and tok.vocab_scores.load(id) > best_score:
best_score = tok.vocab_scores.load(id)
best_id = id
best_idx = i
if best_idx == -1:
# We couldn't find any more pairs to merge, so we're done
break
# Merge the consecutive pair (best_idx, best_idx+1) into new token best_id
tokens[best_idx] = best_id
# Delete token at position best_idx+1, shift the entire sequence back 1
var _tokens = DynamicVector[Int]()
for i in range(0, best_idx + 1):
_tokens.push_back(tokens[i])
for i in range(best_idx + 2, len(tokens)):
_tokens.push_back(tokens[i])
tokens = _tokens
fn print_str(s: PointerString):
# print all chars till null character
var p: Int = 0
while s[p].to_int() != 0:
print_no_newline(chr(s[p].to_int()))
p += 1
fn time_in_ms() -> Int:
# Returns time in milliseconds for benchmarking the model speed
return time.now() // 1_000_000
fn print_usage():
print("Usage: mojo llama2.mojo <checkpoint> [options]")
print("Example: mojo llama2.mojo stories15M.bin -s 99 -n 256 -t 0.5 -i \"Llama is an animal\"")
print("Options:")
print(" -s <int> random seed, default time.now()")
    print(" -t <float> temperature in [0,1.0], default 0.9")
print(" -n <int> number of steps to run for, default 256. 0 = max_seq_len")
print(" -i <string> input prompt")
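# Driver: parse CLI flags, load the checkpoint and tokenizer, then run the
# generation loop. Prompt tokens (if any) are forced for the first positions;
# afterwards each step feeds the previous token back in and prints the sampled
# continuation, with tok/s measured from the end of the first iteration.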
fn main() raises:
print("num hardware threads: ", num_cores())
print("SIMD vector width: ", nelts)
var tokenizer = StringRef("tokenizer.bin")
var checkpoint = StringRef("stories15M.bin")
var temperature = 0.9
var steps = 256
var prompt = String("")
var rng_seed: Int = time.now()
@parameter
fn argparse() raises -> Int:
let args = argv()
if len(args) < 2:
return 0
checkpoint = args[1]
for i in range(2, len(args), 2):
if args[i] == "-p":
print("Option not supported: ", args[i])
if args[i] == "-n":
steps = atol(args[i + 1])
if args[i] == "-s":
rng_seed = atol(args[i + 1])
if args[i] == "-i":
prompt = args[i + 1]
if args[i] == "-t":
let val = args[i + 1]
temperature = 0.0
# hacky parse float, keep only 1 digit
for c in range(0, len(val)):
if val[c] == ".":
temperature += atol(val[c + 1]) * 0.1
break
else:
temperature = atol(val[c])
                if temperature < -1e-9 or temperature > (1 + 1e-9):
print("Wrong temperature value", temperature)
return 0
return 1
let res = argparse()
if res == 0:
print_usage()
return
random.seed(rng_seed)
var fbuf: FileBuf = FileBuf()
var tbuf: FileBuf = FileBuf()
var config: Config = Config()
read_file(checkpoint, fbuf)
print("checkpoint size: ", fbuf.size)
config_init(config, fbuf)
# negative vocab size is hacky way of signaling unshared weights. bit yikes.
let shared_weights = 1 if config.vocab_size > 0 else 0
config.vocab_size = (
-config.vocab_size if config.vocab_size < 0 else config.vocab_size
)
let weights: TransformerWeights = TransformerWeights(config, shared_weights, fbuf)
var tok: Tokenizer = Tokenizer(config.vocab_size)
if steps <= 0 or steps > config.seq_len:
steps = config.seq_len
# Read in the tokenizer.bin file
read_file(tokenizer, tbuf)
tokenizer_init(tok, tbuf)
# Create and initialize the application RunState
var state = RunState(config)
# Process the prompt, if any
var prompt_tokens = DynamicVector[Int]()
if prompt:
bpe_encode(prompt_tokens, prompt, tok)
# Start the main loop
var start = 0 # Used to time our code, only initialized after the first iteration
var next_token = 0 # Will store the next token in the sequence
# Initialize with token 1 (=BOS), as done in Llama-2 sentencepiece tokenizer
var token = 1
# Position in the sequence
var pos = 0
while pos < steps:
# Forward the transformer to get logits for the next token
transformer(token, pos, config, state, weights)
if pos < len(prompt_tokens):
next_token = prompt_tokens[pos]
else:
# Sample the next token
if temperature == 0.0:
# Greedy argmax sampling: take the token with the highest probability
next_token = argmax(state.logits)
else:
# Apply the temperature to the logits
for q in range(config.vocab_size):
state.logits[q] = state.logits[q] / temperature
# Apply softmax to the logits to get the probabilities for the next token
softmax(state.logits.data, config.vocab_size)
# Sample from this distribution to get the next token
next_token = sample(state.logits)
var token_str: PointerString = tok.vocab[next_token]
if token == 1 and token_str[0] == ord(" "):
token_str = token_str.offset(1)
print_str(token_str)
# Advance forward
token = next_token
pos += 1
if start == 0:
start = time_in_ms()
let end = time_in_ms()
print("\nachieved tok/s: ", (steps - 1) / (end - start) * 1000)