# llama2.mojo
from algorithm import sum
from algorithm import vectorize, parallelize
from builtin import string
from math import round
from memory import memset_zero, memcpy
from memory.buffer import Buffer
from memory.unsafe import DTypePointer
from python import Python
from random import rand
from read import BufReader, File
from runtime.llcl import num_cores, Runtime
from sys import argv
from tensor import Tensor, TensorShape, TensorSpec
# The SIMD vector width.
from sys.info import simdwidthof
import math
import os
import random
import time
var workers = 0
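# Number of float32 elements processed per vectorized step: twice the hardware
# SIMD width (e.g. 16 on a machine whose native float32 width is 8 lanes).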
alias nelts = (2 * simdwidthof[DType.float32]())
alias PointerString = Pointer[UInt8]
alias BufferPtrType = DTypePointer[DType.uint8]
alias BufferPtrFloat32 = DTypePointer[DType.float32]
alias PointerStrings = Pointer[PointerString]
alias TensorF32 = Tensor[DType.float32]
struct TensorSlice:
    # A view into a Tensor, representing a slice along its first one or two dimensions.
    # Mirrors Tensor's accessors but does not own the underlying data.
var _data: BufferPtrFloat32
var _shape: TensorShape
fn __init__(inout self, t: TensorF32, layer: Int) raises:
let elements_per_layer = t.num_elements() // t.dim(0)
self._data = t.data().offset(layer * elements_per_layer)
if t.rank() == 2:
self._shape = TensorShape(t.dim(1))
elif t.rank() == 3:
self._shape = TensorShape(t.dim(1), t.dim(2))
else:
# Compiler complains if _shape not defined
self._shape = TensorShape(1)
raise Error("TensorSlice: rank greater than 3 not implemented.")
fn __init__(inout self, t: TensorF32, layer: Int, row: Int) raises:
let elements_per_layer = t.num_elements() // t.dim(0)
let elements_per_row = elements_per_layer // t.dim(1)
self._data = t.data().offset(
layer * elements_per_layer + row * elements_per_row
)
if t.rank() == 3:
self._shape = TensorShape(t.dim(2))
elif t.rank() == 1:
# Compiler complains if _shape not defined
self._shape = TensorShape(1)
raise Error(
"Trying to slice a 1D Tensor by layer and row. This requires a 3D"
" Tensor."
)
else:
# Compiler complains if _shape not defined
self._shape = TensorShape(1)
raise Error("TensorSlice: rank greater than 3 not implemented.")
fn data(self) -> BufferPtrFloat32:
return self._data
fn shape(self) -> TensorShape:
return self._shape
fn num_elements(self) -> Int:
return self._shape.num_elements()
fn dim(self, idx: Int) -> Int:
return self._shape[idx]
fn rank(self) -> Int:
return self._shape.rank()
fn simd_load[nelts: Int](self, idx: Int) -> SIMD[DType.float32, nelts]:
return self._data.simd_load[nelts](idx)
fn simd_load[nelts: Int](self, *indices: Int) -> SIMD[DType.float32, nelts]:
if len(VariadicList(indices)) > 2:
print(
"Warning: TensorSlice only supports 1D and 2D indexing. Results are"
" unlikely to be correct."
)
return self.simd_load[nelts](indices[0] * self._shape[1] + indices[1])
fn simd_load[
nelts: Int
](self, indices: StaticIntTuple[2]) -> SIMD[DType.float32, nelts]:
return self._data.simd_load[nelts](indices[0] * self._shape[1] + indices[1])
fn __getitem__(self, idx: Int) -> SIMD[DType.float32, 1]:
return self._data.simd_load[1](idx)
fn simd_store[nelts: Int](self, idx: Int, val: SIMD[DType.float32, nelts]):
return self._data.simd_store[nelts](idx, val)
fn __setitem__(self, idx: Int, val: SIMD[DType.float32, 1]):
return self.simd_store[1](idx, val)
fn read_val_int(inout buf: FileBuf) raises -> Int:
let data = buf.data.offset(buf.get_offset()).bitcast[DType.int32]()
let result = data.load(0)
buf.move_offset(4)
return result.to_int()
fn read_val_float32(inout buf: FileBuf) raises -> Float32:
let val = buf.data.offset(buf.get_offset()).bitcast[DType.float32]().load(0)
buf.move_offset(4)
return val
fn read_val_str(inout buf: FileBuf, slen: Int) raises -> PointerString:
let str = PointerString.alloc(slen + 1)
for i in range(slen):
str.store(i, buf.data.load(buf.get_offset()))
buf.move_offset(1)
str.store(slen, 0)
return str
fn str_len(s: PointerString) -> Int:
var len = 0
while s[len] != 0:
len += 1
return len
# Not an optimal concat: allocates a fresh buffer and copies both strings.
fn str_concat(s1: PointerString, s2: PointerString) -> PointerString:
let l1 = str_len(s1)
let l2 = str_len(s2)
let str = PointerString.alloc(l1 + l2 + 1)
memcpy[UInt8](str, s1, l1)
memcpy[UInt8](str.offset(l1), s2, l2)
str.store(l1 + l2, 0)
return str
fn str_to_ptr(s: String) -> PointerString:
let ret = PointerString.alloc(len(s) + 1)
for i in range(len(s)):
ret.store(i, ord(s[i]))
ret.store(len(s), 0)
return ret
fn string_compare(a: PointerString, b: PointerString) -> Int:
var index = 0
while a[index] != 0 and b[index] != 0:
if a[index] < b[index]:
return -1
if a[index] > b[index]:
return 1
index += 1
if a[index] != 0 and b[index] == 0:
return 1
if a[index] == 0 and b[index] != 0:
return -1
return 0
# Quicksort helper function to find the partition position
fn partition(
inout array: PointerStrings, inout indices: DynamicVector[Int], low: Int, high: Int
) -> Int:
let pivot = array[high]
var ii = low - 1
for jj in range(low, high):
if string_compare(pivot, array[jj]) == 1:
# If element smaller than pivot, swap
ii = ii + 1
let tmp = array[ii]
let tmp_idx = indices[ii]
array.store(ii, array[jj])
indices[ii] = indices[jj]
array.store(jj, tmp)
indices[jj] = tmp_idx
# Swap the pivot element
let tmp = array[ii + 1]
let tmp_idx = indices[ii + 1]
array.store(ii + 1, array[high])
indices[ii + 1] = indices[high]
array.store(high, tmp)
indices[high] = tmp_idx
return ii + 1
fn quicksort(
inout array: PointerStrings, inout indices: DynamicVector[Int], low: Int, high: Int
):
if low < high:
let pi = partition(array, indices, low, high)
quicksort(array, indices, low, pi - 1)
quicksort(array, indices, pi + 1, high)
struct FileBuf:
var data: BufferPtrType
var offset: Int
var size: Int
fn __init__(inout self):
self.data = BufferPtrType()
self.offset = 0
self.size = 0
fn __del__(owned self):
self.data.free()
fn move_offset(inout self, size: Int) raises:
let new_offset = self.offset + size
if new_offset > self.size:
raise Error("Resulting offset will be past the end of the FileBuf")
if new_offset < 0:
raise Error("Resulting offset will be before the beginning of the FileBuf")
self.offset = new_offset
fn bitcast_offset_f32(inout self, size: Int) raises -> BufferPtrFloat32:
let ret = self.data.offset(self.offset).bitcast[DType.float32]()
self.move_offset(size * sizeof[DType.float32]())
return ret
fn get_offset(self) raises -> Int:
if self.offset > self.size:
raise Error("Offset is past the end of the FileBuf")
if self.offset < 0:
raise Error("Offset is before the beginning of the FileBuf")
return self.offset
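# Map tokens typed as escape sequences or bare quotes to the <0xXX> byte-token
# spellings used in the vocab, e.g. a literal "\n" becomes "<0x0A>".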
fn wrap(token: PointerString) -> PointerString:
if string_compare(token, str_to_ptr("\\n")) == 0:
return str_to_ptr("<0x0A>")
if string_compare(token, str_to_ptr("\\t")) == 0:
return str_to_ptr("<0x09>")
if string_compare(token, str_to_ptr("'")) == 0:
return str_to_ptr("<0x27>")
elif string_compare(token, str_to_ptr('"')) == 0:
return str_to_ptr("<0x22>")
return token
struct Tokenizer:
var vocab: PointerStrings
var vocab_scores: BufferPtrFloat32
var max_token_length: Int
var vocab_size: Int
var sorted_vocab: PointerStrings
var sorted_indices: DynamicVector[Int]
fn __init__(inout self, vocab_size: Int, inout buf: FileBuf) raises -> None:
self.vocab_size = vocab_size
self.max_token_length = read_val_int(buf)
self.vocab_scores = BufferPtrFloat32.alloc(self.vocab_size)
self.vocab = PointerStrings.alloc(self.vocab_size)
        # The sorted vocab is built lazily on the first call to find()
self.sorted_vocab = PointerStrings.alloc(0)
self.sorted_indices = DynamicVector[Int](0)
# read vocab_scores & vocab values (tokens)
for i in range(0, self.vocab_size):
self.vocab_scores.store(i, read_val_float32(buf))
let slen = read_val_int(buf)
self.vocab.store(i, read_val_str(buf, slen))
return None
# sort vocab by string_compare
fn sort(inout self) -> None:
if len(self.sorted_indices) < self.vocab_size:
self.sorted_indices = DynamicVector[Int](self.vocab_size)
self.sorted_vocab = PointerStrings.alloc(self.vocab_size)
for ii in range(self.vocab_size):
self.sorted_vocab.store(ii, self.vocab[ii])
self.sorted_indices.push_back(ii)
let n = self.vocab_size
quicksort(self.sorted_vocab, self.sorted_indices, 0, n - 1)
return None
# Binary search that returns -1 if string is not found
fn find(inout self, token_o: PointerString) -> Int:
let token = wrap(token_o)
let n = self.vocab_size
if len(self.sorted_indices) < n:
self.sort()
var left = 0
var right = n - 1
while left <= right:
let mid = left + (right - left) // 2
let comparison = string_compare(self.sorted_vocab[mid], token)
if comparison == 0:
return self.sorted_indices[mid]
if comparison < 0:
left = mid + 1
else:
right = mid - 1
return -1
struct Config:
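    # Transformer hyperparameters. The first seven fields are read from the
    # checkpoint header by config_init; kv_dim, kv_mul and head_size are derived.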
var dim: Int
var kv_dim: Int
var hidden_dim: Int
var n_layers: Int
var n_heads: Int
var n_kv_heads: Int
var kv_mul: Int
var vocab_size: Int
var seq_len: Int
var head_size: Int
fn __init__(inout self):
self.dim = 0
self.hidden_dim = 0
self.n_layers = 0
self.n_heads = 0
self.n_kv_heads = 0
self.vocab_size = 0
self.seq_len = 0
self.kv_dim = 0
self.kv_mul = 0
self.head_size = 0
struct RunState:
var x: TensorF32 # activation at current time stamp (dim,)
var xb: TensorF32 # same, but inside a residual branch (dim,)
var xb2: TensorF32 # an additional buffer just for convenience (dim,)
var hb: TensorF32 # buffer for hidden dimension in the ffn (hidden_dim,)
var hb2: TensorF32 # buffer for hidden dimension in the ffn (hidden_dim,)
var q: TensorF32 # query (dim,)
var k: TensorSlice # key (kv_dim,)
var v: TensorSlice # value (kv_dim,)
var att: TensorF32 # buffer for scores/attention values (n_heads, seq_len)
var logits: TensorF32 # output logits
var key_cache: TensorF32 # (layer, seq_len, dim)
var value_cache: TensorF32 # (layer, seq_len, dim)
fn __init__(inout self, config: Config) raises:
self.x = TensorF32(config.dim)
self.xb = TensorF32(config.dim)
self.xb2 = TensorF32(config.dim)
self.hb = TensorF32(config.hidden_dim)
self.hb2 = TensorF32(config.hidden_dim)
self.q = TensorF32(config.dim)
self.att = TensorF32(config.n_heads, config.seq_len)
self.logits = TensorF32(config.vocab_size)
self.key_cache = TensorF32(config.n_layers, config.seq_len, config.kv_dim)
self.value_cache = TensorF32(config.n_layers, config.seq_len, config.kv_dim)
        # k and v are slices that share memory with the caches, so writes to them
        # land directly in key_cache/value_cache. Initialize with placeholders;
        # the forward pass re-points them at the current layer and position.
self.k = TensorSlice(TensorF32(TensorShape(1, config.kv_dim)), 1)
self.v = TensorSlice(TensorF32(TensorShape(1, config.kv_dim)), 1)
struct TransformerWeights:
var token_embedding_table: TensorF32
var freq_cis_real: TensorF32
var freq_cis_imag: TensorF32
var rms_att_weight: TensorF32
var wq: TensorF32
var wk: TensorF32
var wv: TensorF32
var wo: TensorF32
var rms_ffn_weight: TensorF32
var w1: TensorF32
var w3: TensorF32
var w2: TensorF32
var rms_final_weight: TensorF32
var wcls: TensorF32
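    # Weights are read sequentially from the checkpoint in the order loaded
    # below (the llama2.c export layout). When shared_weights is set, wcls
    # aliases the token embedding table instead of being read from the file.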
fn __init__(
inout self, config: Config, shared_weights: Int, inout buf: FileBuf
) raises:
fn load_weights(inout buf: FileBuf, *dims: Int) raises -> TensorF32:
# Ensure returned Tensor doesn't share a pointer with FileBuf
let shape = TensorShape(dims)
let result_data = BufferPtrFloat32.alloc(shape.num_elements())
memcpy(
result_data,
buf.bitcast_offset_f32(shape.num_elements()),
shape.num_elements(),
)
return TensorF32(result_data, shape)
self.token_embedding_table = load_weights(buf, config.vocab_size, config.dim)
self.rms_att_weight = load_weights(buf, config.n_layers, config.dim)
self.wq = load_weights(buf, config.n_layers, config.dim, config.dim)
self.wk = load_weights(buf, config.n_layers, config.kv_dim, config.dim)
self.wv = load_weights(buf, config.n_layers, config.kv_dim, config.dim)
self.wo = load_weights(buf, config.n_layers, config.dim, config.dim)
self.rms_ffn_weight = load_weights(buf, config.n_layers, config.dim)
self.w1 = load_weights(buf, config.n_layers, config.hidden_dim, config.dim)
self.w2 = load_weights(buf, config.n_layers, config.dim, config.hidden_dim)
self.w3 = load_weights(buf, config.n_layers, config.hidden_dim, config.dim)
self.rms_final_weight = load_weights(buf, config.dim)
        # May need modifying for other models; head_size // 2 holds for the
        # stories and tinyllama-1.1 checkpoints.
self.freq_cis_real = load_weights(buf, config.seq_len, config.head_size // 2)
self.freq_cis_imag = load_weights(buf, config.seq_len, config.head_size // 2)
if shared_weights:
self.wcls = self.token_embedding_table
else:
self.wcls = load_weights(buf, config.vocab_size, config.dim)
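# Read an entire file into a FileBuf, streaming through a fixed-size
# 4096-byte reader window.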
fn read_file(file_name: String, inout buf: FileBuf) raises:
let _os = Python.import_module("os")
let ff_size = _os.path.getsize(file_name)
let cp_size = string.atol(ff_size.to_string())
let cp_buf: BufferPtrType = BufferPtrType.alloc(cp_size)
    # Window buffer for reading binary data from the file in 4096-byte chunks
let f = File(file_name)
var reader = BufReader[4096](f ^)
var bytes_read = 1
var offset = 0
while bytes_read > 0:
let buf = Buffer[4096, DType.uint8](cp_buf.offset(offset))
bytes_read = reader.read(buf)
offset += bytes_read
reader.do_nothing() # keeps lifetimes working
buf.data = cp_buf
buf.size = cp_size
buf.offset = 0
return None
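# config_init reads the checkpoint header, then derives the grouped-query
# attention quantities: each of the n_kv_heads key/value heads serves
# kv_mul = n_heads // n_kv_heads query heads, and the kv projections have
# kv_dim = n_kv_heads * head_size rows per layer.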
fn config_init(inout config: Config, inout buf: FileBuf) raises:
config.dim = read_val_int(buf)
config.hidden_dim = read_val_int(buf)
config.n_layers = read_val_int(buf)
config.n_heads = read_val_int(buf)
config.n_kv_heads = read_val_int(buf)
config.vocab_size = read_val_int(buf)
config.seq_len = read_val_int(buf)
config.head_size = config.dim // config.n_heads
config.kv_dim = (config.n_kv_heads * config.dim) // config.n_heads
config.kv_mul = config.n_heads // config.n_kv_heads
return None
@always_inline
fn accum(inout a: TensorF32, b: TensorF32) -> None:
let size = a.dim(0)
@parameter
fn _acc[_nelts: Int](j: Int):
a.simd_store[_nelts](j, a.simd_load[_nelts](j) + b.simd_load[_nelts](j))
vectorize[nelts, _acc](size)
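# RMSNorm: o[i] = weight[i] * x[i] / sqrt(mean(x^2) + 1e-5). The sum of
# squares is accumulated SIMD-wide; the tail (fewer than nelts elements)
# is reduced into lane 0.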
@always_inline
fn rmsnorm(
inout o: BufferPtrFloat32, x: BufferPtrFloat32, weight: BufferPtrFloat32, size: Int
) -> None:
# Calculate sum of squares
var tmp = SIMD[DType.float32, nelts](0)
@parameter
fn _sum2[_nelts: Int](j: Int):
if _nelts < nelts:
tmp[0] += (x.offset(j).simd_load[_nelts](0) ** 2).reduce_add()
else:
tmp += x.offset(j).simd_load[nelts](0) ** 2
vectorize[nelts, _sum2](size)
var ss: Float32 = tmp.reduce_add()
ss = ss / size + 1e-5
ss = 1.0 / math.sqrt(ss)
# Normalize and scale
@parameter
fn _norm[_nelts: Int](j: Int):
let val = weight.simd_load[_nelts](j) * ss * x.simd_load[_nelts](j)
o.offset(j).simd_store[_nelts](0, val)
vectorize[nelts, _norm](size)
@always_inline
fn rmsnorm(inout o: TensorF32, x: TensorF32, weight: TensorF32):
rmsnorm(o._ptr, x.data(), weight.data(), weight.dim(weight.rank() - 1))
@always_inline
fn rmsnorm(inout o: TensorF32, x: TensorF32, weight: TensorSlice):
rmsnorm(o._ptr, x.data(), weight.data(), weight.dim(weight.rank() - 1))
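# Numerically stable softmax over x[start:end], computed in place:
# x[i] = exp(x[i] - max(x)) / sum_j exp(x[j] - max(x)).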
@always_inline
fn softmax(inout x: TensorF32) -> None:
softmax(x, 0, x.dim(0))
@always_inline
fn softmax(inout x: TensorF32, start: Int, end: Int):
var max_val: Float32 = -1e9
@parameter
fn _max[_nelts: Int](ii: Int):
let val = x.simd_load[_nelts](start + ii).reduce_max()
if val > max_val:
max_val = val
vectorize[nelts, _max](end - start)
var ssum: Float32 = 0.0
@parameter
fn _exp[_nelts: Int](ii: Int):
x.simd_store[_nelts](
start + ii, math.exp(x.simd_load[_nelts](start + ii) - max_val)
)
ssum += x.simd_load[_nelts](start + ii).reduce_add()
vectorize[nelts, _exp](end - start)
@parameter
fn _norm[_nelts: Int](ii: Int):
x.simd_store[_nelts](start + ii, x.simd_load[_nelts](start + ii) / ssum)
vectorize[nelts, _norm](end - start)
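# Matrix-vector product C = B @ A with B stored row-major as (rows, cols).
# Each output C[i] = dot(B[i, :], A) is an independent dot product, vectorized
# over nelts lanes and parallelized across `workers` threads, one row per task.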
@always_inline
fn matmul_parallelized(
    C: BufferPtrFloat32,
    A: BufferPtrFloat32,
    B: BufferPtrFloat32,
    rows: Int,
    cols: Int,
):
@parameter
fn compute_row(i: Int):
var tmp = SIMD[DType.float32, nelts](0)
@parameter
fn dot[_nelts: Int](j: Int):
if _nelts < nelts: # take care of tail array elements with length < nelts
tmp[0] += (
A.simd_load[_nelts](j) * B.simd_load[_nelts](i * cols + j)
).reduce_add()
else:
tmp += A.simd_load[nelts](j) * B.simd_load[nelts](i * cols + j)
vectorize[nelts, dot](cols)
C.store(i, tmp.reduce_add())
parallelize[compute_row](rows, workers)
@always_inline
fn matmul(C: TensorF32, A: TensorF32, B: TensorF32) raises:
# B (d,n) @ A (n,) -> C (d,)
matmul_dimension_checks(A.shape(), B.shape())
matmul_parallelized(C.data(), A.data(), B.data(), B.dim(0), B.dim(1))
@always_inline
fn matmul(C: TensorF32, A: TensorF32, B: TensorSlice) raises:
# B (d,n) @ A (n,) -> C (d,)
matmul_dimension_checks(A.shape(), B.shape())
matmul_parallelized(C.data(), A.data(), B.data(), B.dim(0), B.dim(1))
@always_inline
fn matmul(C: TensorSlice, A: TensorF32, B: TensorSlice) raises:
# B (d,n) @ A (n,) -> C (d,)
matmul_dimension_checks(A.shape(), B.shape())
matmul_parallelized(C.data(), A.data(), B.data(), B.dim(0), B.dim(1))
fn matmul_dimension_checks(a: TensorShape, b: TensorShape) raises:
if a[0] != b[1]:
raise Error(
"matmul dimension mismatch. A rows (dim 0) not equal to B columns (dim 1)"
)
if b.rank() != 2:
raise Error("matmul expects B to be a 2D matrix")
# Apply RoPE rotation to the q and k vectors for each head,
# rotating consecutive (even, odd) pairs of dimensions.
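# Each (even, odd) pair is treated as a complex number and rotated by a
# position-dependent angle: with fcr = cos(theta) and fci = sin(theta),
#   (q0, q1) -> (q0 * fcr - q1 * fci, q0 * fci + q1 * fcr)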
@always_inline
fn rope_rotation_llama(
inout state: RunState,
freq_cis_real_row: TensorSlice,
freq_cis_imag_row: TensorSlice,
config: Config,
) -> None:
    # RoPE variant used by the stories and llama2 checkpoints
let head_size = config.head_size
@parameter
    fn head_loop(i: Int):
        # Simple vectorization with (head_size // 2) steps gave junk transformer
        # output, possibly because the nelts ranges end up overlapping between steps.
for j in range(0, config.head_size, 2):
let fcr = freq_cis_real_row[j // 2]
let fci = freq_cis_imag_row[j // 2]
let q0 = state.q[i * head_size + j]
let q1 = state.q[i * head_size + j + 1]
state.q[i * head_size + j] = q0 * fcr - q1 * fci
state.q[i * head_size + j + 1] = q0 * fci + q1 * fcr
if i < config.n_kv_heads:
let k0 = state.k[i * head_size + j]
let k1 = state.k[i * head_size + j + 1]
state.k[i * head_size + j] = k0 * fcr - k1 * fci
state.k[i * head_size + j + 1] = k0 * fci + k1 * fcr
parallelize[head_loop](config.n_heads, workers)
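# One forward pass for a single token at position `pos`: embedding lookup,
# then per layer: attention rmsnorm, QKV projections, RoPE, multihead
# attention against the kv cache, output projection plus residual, and a
# SwiGLU FFN (w1/w3 with SiLU gating) plus residual; finally a rmsnorm and
# the classifier matmul into logits.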
@always_inline
fn transformer(
token: Int,
pos: Int,
config: Config,
inout state: RunState,
weights: TransformerWeights,
) raises -> None:
# A few convenience variables
let dim = config.dim
let hidden_dim = config.hidden_dim
let head_size = config.head_size
let kv_dim = config.kv_dim
let kv_mul = config.kv_mul
# Copy the token embedding into x
let content_row = weights.token_embedding_table.data().offset(token * dim)
memcpy[DType.float32](state.x.data(), content_row, dim)
# Pluck out the "pos" row of freq_cis_real and freq_cis_imag
let freq_cis_real_row = TensorSlice(weights.freq_cis_real, pos)
let freq_cis_imag_row = TensorSlice(weights.freq_cis_imag, pos)
# Forward all the layers
for l in range(config.n_layers):
# Attention rmsnorm
rmsnorm(state.xb, state.x, TensorSlice(weights.rms_att_weight, l))
# QKV matmuls for this position
matmul(state.q, state.xb, TensorSlice(weights.wq, l))
let loff = l * config.seq_len * config.kv_dim
state.k = TensorSlice(state.key_cache, l, pos)
matmul(state.k, state.xb, TensorSlice(weights.wk, l))
state.v = TensorSlice(state.value_cache, l, pos)
matmul(state.v, state.xb, TensorSlice(weights.wv, l))
# Apply RoPE rotation to the q and k vectors for each head
rope_rotation_llama(state, freq_cis_real_row, freq_cis_imag_row, config)
memset_zero(state.xb.data(), state.xb.num_elements())
# Multihead attention. Iterate over all heads in parallel.
@parameter
    fn loop_over_heads(h: Int):
# Get the query vector for this head
let q_offset = h * head_size
# Index of attention scores for this head
let att_offset = h * config.seq_len
# Iterate over all timesteps, including the current one
for t in range(pos + 1):
# Starting index of the key vector for this head and at this timestep
let k_offset = loff + t * kv_dim + (h // kv_mul) * head_size
# Calculate the attention score as the dot product of q and k
var score: Float32 = 0.0
@parameter
fn score_fn[_nelts: Int](i: Int):
score += (
state.q.simd_load[_nelts](q_offset + i)
* state.key_cache.simd_load[_nelts](k_offset + i)
).reduce_add()
vectorize[nelts, score_fn](head_size)
score /= math.sqrt[DType.float32, 1](head_size)
# Save the score to the attention buffer
state.att[att_offset + t] = score
# Softmax the scores to get attention weights, from 0..pos inclusively
softmax(state.att, att_offset, att_offset + pos + 1)
# Weighted sum of the values, store back into xb
let xb_offset = h * head_size
for t in range(pos + 1):
# Starting index of the value vector for this head and at this timestep
let v_offset = loff + t * kv_dim + (h // kv_mul) * head_size
# Get the attention weight for this timestep
let a = state.att[att_offset + t]
# Accumulate the weighted value into xb
@parameter
fn xb_accumulate[_nelts: Int](i: Int):
let xbi = state.xb.simd_load[_nelts](
xb_offset + i
) + a * state.value_cache.simd_load[_nelts](v_offset + i)
state.xb.simd_store[_nelts](xb_offset + i, xbi)
vectorize[nelts, xb_accumulate](head_size)
parallelize[loop_over_heads](config.n_heads, workers)
# Final matrix multiplication to get the output of the attention
matmul(state.xb2, state.xb, TensorSlice(weights.wo, l))
# Residual connection back into x
accum(state.x, state.xb2)
# FFN rmsnorm
rmsnorm(state.xb, state.x, TensorSlice(weights.rms_ffn_weight, l))
# Calculate self.w1(x) and self.w3(x) for FFN
matmul(state.hb, state.xb, TensorSlice(weights.w1, l))
matmul(state.hb2, state.xb, TensorSlice(weights.w3, l))
@parameter
fn silu[_nelts: Int](i: Int):
let initial_hb = state.hb.simd_load[_nelts](i)
# Apply SiLU activation function (silu(x) = x * sigmoid(x))
let hbi = initial_hb * (1.0 / (1.0 + math.exp(-initial_hb)))
# Elementwise multiply with w3(x)
state.hb.simd_store[_nelts](i, hbi * state.hb2.simd_load[_nelts](i))
vectorize[nelts, silu](hidden_dim)
# Final matrix multiplication to get the output of the FFN
matmul(state.xb, state.hb, TensorSlice(weights.w2, l))
# Residual connection
accum(state.x, state.xb)
# Final rmsnorm
rmsnorm(state.x, state.x, weights.rms_final_weight)
# Classifier into logits
matmul(state.logits, state.x, weights.wcls)
fn argmax(v: TensorF32) -> Int:
# return argmax of v
var max_i: Int = 0
var max_p: Float32 = v[0]
for i in range(v.dim(0)):
if v[i] > max_p:
max_i = i
max_p = v[i]
return max_i
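# Inverse-CDF sampling from a categorical distribution: draw u ~ U[0, 1) and
# return the first index whose cumulative probability exceeds u.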
fn sample(probabilities: TensorF32) -> Int:
let n = probabilities.dim(0)
    # Sample an index from the probability distribution (entries must sum to 1)
    # by drawing a uniform random value in [0, 1)
let r = DTypePointer[DType.float32].alloc(1)
rand[DType.float32](r, 1)
var cdf: Float32 = 0.0
for i in range(n):
cdf += probabilities[i]
if r.load(0) < cdf:
return i
return n - 1 # In case of rounding errors
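# Greedy BPE: start from single-character tokens, then repeatedly merge the
# adjacent pair whose concatenation exists in the vocab with the highest
# score, until no mergeable pair remains.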
fn bpe_encode(inout tokens: DynamicVector[Int], text: String, inout tok: Tokenizer):
for pos in range(len(text)):
let char = str_to_ptr(text[pos])
let id = tok.find(char)
if id == -1:
print("Not a good prompt token at pos ", pos)
return
tokens.push_back(id)
while True:
var best_score = Float32(-1e10)
var best_id = -1
var best_idx = -1
for i in range(len(tokens) - 1):
# Check if we can merge the pair (tokens[i], tokens[i+1])
let str = str_concat(tok.vocab[tokens[i]], tok.vocab[tokens[i + 1]])
let id = tok.find(str)
if id != -1 and tok.vocab_scores.load(id) > best_score:
best_score = tok.vocab_scores.load(id)
best_id = id
best_idx = i
if best_idx == -1:
# We couldn't find any more pairs to merge, so we're done
break
# Merge the consecutive pair (best_idx, best_idx+1) into new token best_id
tokens[best_idx] = best_id
# Delete token at position best_idx+1, shift the entire sequence back 1
var _tokens = DynamicVector[Int]()
for i in range(0, best_idx + 1):
_tokens.push_back(tokens[i])
for i in range(best_idx + 2, len(tokens)):
_tokens.push_back(tokens[i])
tokens = _tokens
fn str2num(d: Int) -> Int:
    # Convert an uppercase hex digit character code to its numeric value
if d >= ord("A"):
return d - ord("A") + 10
return d - ord("0")
fn print_str(s: PointerString):
    # Tokens of the form <0xXX> encode a raw byte: decode and print that byte
if (s[1].to_int() == ord("0")) and (s[2].to_int() == ord("x")):
let d1: Int = s[3].to_int()
let d2: Int = s[4].to_int()
print_no_newline(chr(str2num(d1) * 16 + str2num(d2)))
return
# print all chars till null character
var p: Int = 0
while s[p].to_int() != 0:
print_no_newline(chr(s[p].to_int()))
p += 1
fn time_in_ms() -> Int:
# Returns time in milliseconds for benchmarking the model speed
return time.now() // 1_000_000
fn print_usage():
print("Usage: mojo llama2.mojo <checkpoint> [options]")
print(
'Example: mojo llama2.mojo stories15M.bin -s 99 -n 256 -t 0.5 -i "Llama is an'
' animal"'
)
print("Options:")
print(" -s <int> random seed, default time.now()")
print(" -t <float> temperature in [0,1.0], default 1.0")
print(" -n <int> number of steps to run for, default 256. 0 = max_seq_len")
print(" -i <string> input prompt")
print(" -z tokenizer path")
print(" -j number of workers to use, default num_cores()")
fn main() raises:
workers = num_cores()
var tokenizer = StringRef("tokenizer.bin")
var checkpoint = StringRef("stories15M.bin")
var temperature = 0.9
var steps = 256
var prompt = String("")
var rng_seed: Int = time.now()
@parameter
fn argparse() raises -> Int:
let args = argv()
if len(args) < 2:
return 0
checkpoint = args[1]
for i in range(2, len(args), 2):
if args[i] == "-p":
print("Option not supported: ", args[i])
if args[i] == "-n":
steps = atol(args[i + 1])
if args[i] == "-z":
tokenizer = args[i + 1]
if args[i] == "-s":
rng_seed = atol(args[i + 1])
if args[i] == "-i":
prompt = args[i + 1]
if args[i] == "-j":
workers = atol(args[i + 1])
if args[i] == "-t":
let val = args[i + 1]
temperature = 0.0
                # Hacky float parse: keeps only one digit after the decimal point
for c in range(0, len(val)):
if val[c] == ".":
temperature += atol(val[c + 1]) * 0.1
break
else:
temperature = atol(val[c])
                if temperature < -1e-9 or temperature > (1 + 1e-9):  # [0, 1] with fp tolerance
print("Wrong temperature value", temperature)
return 0
return 1
let res = argparse()
if res == 0:
print_usage()
return
print("num parallel workers:", workers, " SIMD width:", nelts)
random.seed(rng_seed)
var fbuf: FileBuf = FileBuf()
var tbuf: FileBuf = FileBuf()
var config: Config = Config()
read_file(checkpoint, fbuf)
config_init(config, fbuf)
    # A negative vocab size is a hacky way of signaling unshared weights (a bit yikes)
let shared_weights = 1 if config.vocab_size > 0 else 0
config.vocab_size = (
-config.vocab_size if config.vocab_size < 0 else config.vocab_size
)
let weights: TransformerWeights = TransformerWeights(config, shared_weights, fbuf)
if steps <= 0 or steps > config.seq_len:
steps = config.seq_len
# Read in the tokenizer.bin file
read_file(tokenizer, tbuf)
var tok = Tokenizer(config.vocab_size, tbuf)
    # Print the checkpoint size, layer count, and vocab size
print("checkpoint size: ", fbuf.size, "[", fbuf.size // 1024 // 1024, "MB ]",
"| n layers:", config.n_layers, "| vocab size:", tok.vocab_size)
# Create and initialize the application RunState
var state = RunState(config)
# Process the prompt, if any
var prompt_tokens = DynamicVector[Int]()
if prompt:
bpe_encode(prompt_tokens, prompt, tok)
# Start the main loop
var start = 0 # Used to time our code, only initialized after the first iteration
var next_token = 0 # Will store the next token in the sequence
# Initialize with token 1 (=BOS), as done in Llama-2 sentencepiece tokenizer
var token = 1
# Position in the sequence
var pos = 0
while pos < steps:
# Forward the transformer to get logits for the next token
transformer(token, pos, config, state, weights)
if pos < len(prompt_tokens):
next_token = prompt_tokens[pos]
else:
# Sample the next token
if temperature == 0.0:
# Greedy argmax sampling: take the token with the highest probability
next_token = argmax(state.logits)
else:
# Apply the temperature to the logits
for q in range(config.vocab_size):
state.logits[q] = state.logits[q] / temperature
# Apply softmax to the logits to get the probabilities for the next token
softmax(state.logits)
# Sample from this distribution to get the next token
next_token = sample(state.logits)
        # Stop generating when the BOS (=1) or EOS (=2) token appears
if next_token == 1 or next_token == 2:
break
var token_str: PointerString = tok.vocab[next_token]
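        # Following BOS (token 1), the sentencepiece decoder strips a leading space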
if token == 1 and token_str[0] == ord(" "):
token_str = token_str.offset(1)
print_str(token_str)
# Advance forward
token = next_token
pos += 1
if start == 0:
start = time_in_ms()
let end = time_in_ms()
print("\nachieved tok/s: ", (pos - 1) / (end - start) * 1000)