radames HF staff committed on
Commit 5c30f1d
1 Parent(s): 264c8c8
Files changed (2)
  1. llama2.mojo +273 -129
  2. t260.bin +3 -0
llama2.mojo CHANGED
@@ -133,26 +133,27 @@ struct Matrix:
         self.data.simd_store[nelts](z * self.layers + y * self.cols + x, val)


-fn read_val_int(inout buf: FileBuf) -> Int:
+fn read_val_int(inout buf: FileBuf) raises -> Int:
     # DTypePointer[DType.ui8](buf.data).bitcast[DType.ui8]()
-    let data = buf.data.offset(buf.offset).bitcast[DType.uint32]()
-    let result = data.simd_load[1](0)
-    buf.offset += 4
+    let data = buf.data.offset(buf.get_offset()).bitcast[DType.uint32]()
+    let result = data.load(0)
+    buf.move_offset(4)
     return result.to_int()


-fn read_val_float32(inout buf: FileBuf) -> Float32:
+fn read_val_float32(inout buf: FileBuf) raises -> Float32:
     # DTypePointer[DType.ui8](buf.data).bitcast[DType.ui8]()
-    let val = buf.data.offset(buf.offset).bitcast[DType.float32]().simd_load[1](0)
-    buf.offset += 4
+    let val = buf.data.offset(buf.get_offset()).bitcast[DType.float32]().load(0)
+    buf.move_offset(4)
     return val


-fn read_val_str(inout buf: FileBuf, slen: Int) -> PointerString:
+fn read_val_str(inout buf: FileBuf, slen: Int) raises -> PointerString:
+
     let str = PointerString.alloc(slen + 1)
     for i in range(slen):
-        str.store(i, buf.data.simd_load[1](buf.offset))
-        buf.offset += 1
+        str.store(i, buf.data.load(buf.get_offset()))
+        buf.move_offset(1)
     str.store(slen, 0)

     return str
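
Note: the three readers now go through FileBuf's bounds-checked helpers (get_offset / move_offset, added further down in this commit), so a truncated or corrupt file raises an Error instead of silently reading past the end of the buffer. A minimal sketch of the new read path, assuming a model file on disk:

    var buf = FileBuf()
    read_file("tokenizer.bin", buf)  # fills buf.data and buf.size
    let n = read_val_int(buf)        # raises if the 4-byte read would overrun
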
@@ -168,7 +169,7 @@ fn str_concat(s1: PointerString, s2: PointerString) -> PointerString:
     while s2[l2] != 0:
         l2 += 1

-    let str = PointerString.alloc(l1 + l2)
+    let str = PointerString.alloc(l1 + l2 + 1)
     memcpy[UInt8](str, s1, l1)
     memcpy[UInt8](str.offset(l1), s2, l2)
     str.store(l1 + l2, 0)
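
The extra byte matters: for strings of length l1 and l2 the terminator str.store(l1 + l2, 0) writes to index l1 + l2, which only exists if l1 + l2 + 1 bytes are allocated (e.g. l1 = 2, l2 = 2 writes index 4 and therefore needs 5 bytes, not 4).
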
@@ -183,6 +184,63 @@ fn str_to_ptr(s: String) -> PointerString:
     return ret


+fn string_compare(a: PointerString, b: PointerString) -> Int:
+    var index = 0
+    while a[index] != 0 and b[index] != 0:
+        if a[index] < b[index]:
+            return -1
+        if a[index] > b[index]:
+            return 1
+
+        index += 1
+
+    if a[index] != 0 and b[index] == 0:
+        return 1
+
+    if a[index] == 0 and b[index] != 0:
+        return -1
+
+    return 0
+
+
+# Quicksort helper function to find the partition position
+fn partition(
+    inout array: PointerStrings, inout indices: DynamicVector[Int], low: Int, high: Int
+) -> Int:
+    let pivot = array[high]
+    var ii = low - 1
+    for jj in range(low, high):
+        if string_compare(pivot, array[jj]) == 1:
+            # If element smaller than pivot, swap
+            ii = ii + 1
+
+            let tmp = array[ii]
+            let tmp_idx = indices[ii]
+            array.store(ii, array[jj])
+            indices[ii] = indices[jj]
+            array.store(jj, tmp)
+            indices[jj] = tmp_idx
+
+    # Swap the pivot element
+    let tmp = array[ii + 1]
+    let tmp_idx = indices[ii + 1]
+    array.store(ii + 1, array[high])
+    indices[ii + 1] = indices[high]
+    array.store(high, tmp)
+    indices[high] = tmp_idx
+
+    return ii + 1
+
+
+fn quicksort(
+    inout array: PointerStrings, inout indices: DynamicVector[Int], low: Int, high: Int
+):
+    if low < high:
+        let pi = partition(array, indices, low, high)
+        quicksort(array, indices, low, pi - 1)
+        quicksort(array, indices, pi + 1, high)
+
+
 struct FileBuf:
     var data: BufferPtrType
     var offset: Int
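
The indices vector is permuted in lockstep with the strings, so after sorting, sorted_vocab[i] still maps back to its original token id via sorted_indices[i]. A small hypothetical example of the parallel sort, using only constructs from this file:

    # three made-up tokens, ids 0..2
    var strs = PointerStrings.alloc(3)
    strs.store(0, str_to_ptr("b"))
    strs.store(1, str_to_ptr("a"))
    strs.store(2, str_to_ptr("c"))
    var idx = DynamicVector[Int](3)
    for i in range(3):
        idx.push_back(i)
    quicksort(strs, idx, 0, 2)
    # strs is now ["a", "b", "c"] and idx is [1, 0, 2] (original ids)
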
@@ -193,36 +251,95 @@ struct FileBuf:
         self.offset = 0
         self.size = 0

-    fn move_offset(inout self, size: Int):
-        self.offset += size
+    fn move_offset(inout self, size: Int) raises:
+        let new_offset = self.offset + size
+        if new_offset > self.size:
+            raise Error("Resulting offset will be past the end of the FileBuf")
+        if new_offset < 0:
+            raise Error("Resulting offset will be before the beginning of the FileBuf")
+        self.offset = new_offset

-    fn bitcast_offset_float32(inout self, size: Int) -> BufferPtrFloat32:
+    fn bitcast_offset_float32(inout self, size: Int) raises -> BufferPtrFloat32:
         let ret = self.data.offset(self.offset).bitcast[DType.float32]()
-        self.offset += size * sizeof[DType.float32]()
+        self.move_offset(size * sizeof[DType.float32]())
         return ret

+    fn get_offset(self) raises -> Int:
+        if self.offset > self.size:
+            raise Error("Offset is past the end of the FileBuf")
+        if self.offset < 0:
+            raise Error("Offset is before the beginning of the FileBuf")
+        return self.offset
+

 struct Tokenizer:
     var vocab: PointerStrings
     var vocab_scores: BufferPtrFloat32
     var max_token_length: Int
     var vocab_size: Int
+    var sorted_vocab: PointerStrings
+    var sorted_indices: DynamicVector[Int]

-    fn __init__(inout self, vocab_size: Int):
+    fn __init__(inout self, vocab_size: Int, inout buf: FileBuf) raises -> None:
         self.vocab_size = vocab_size
-        self.vocab = PointerStrings.alloc(vocab_size)
-        self.vocab_scores = BufferPtrFloat32.alloc(vocab_size)
-        self.max_token_length = 0
+        self.max_token_length = read_val_int(buf)
+        self.vocab_scores = BufferPtrFloat32.alloc(self.vocab_size)
+        self.vocab = PointerStrings.alloc(self.vocab_size)
+        # lazy load sorted vocab
+        self.sorted_vocab = PointerStrings.alloc(0)
+        self.sorted_indices = DynamicVector[Int](0)
+
+        # read vocab_scores & vocab values (tokens)
+        for i in range(0, self.vocab_size):
+            self.vocab_scores.store(i, read_val_float32(buf))
+            let slen = read_val_int(buf)
+            self.vocab.store(i, read_val_str(buf, slen))
+
+        return None
+
+    # sort vocab by string_compare
+    fn sort(inout self) -> None:
+        if len(self.sorted_indices) < self.vocab_size:
+            self.sorted_indices = DynamicVector[Int](self.vocab_size)
+            self.sorted_vocab = PointerStrings.alloc(self.vocab_size)
+            for ii in range(self.vocab_size):
+                self.sorted_vocab.store(ii, self.vocab[ii])
+                self.sorted_indices.push_back(ii)
+
+        let n = self.vocab_size
+        quicksort(self.sorted_vocab, self.sorted_indices, 0, n - 1)
+        return None
+
+    # Binary search that returns -1 if string is not found
+    fn find(inout self, token: PointerString) -> Int:
+        let n = self.vocab_size
+        if len(self.sorted_indices) < n:
+            self.sort()
+        var left = 0
+        var right = n - 1
+        while left <= right:
+            let mid = left + (right - left) // 2
+            let comparison = string_compare(self.sorted_vocab[mid], token)
+            if comparison == 0:
+                return self.sorted_indices[mid]
+            if comparison < 0:
+                left = mid + 1
+            else:
+                right = mid - 1
+        return -1


 struct Config:
     var dim: Int
+    var kv_dim: Int
     var hidden_dim: Int
     var n_layers: Int
     var n_heads: Int
     var n_kv_heads: Int
+    var kv_mul: Int
     var vocab_size: Int
     var seq_len: Int
+    var head_size: Int

     fn __init__(inout self):
         self.dim = 0
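
This replaces the linear scan in the old str_lookup with a sort-once, binary-search-per-lookup scheme: the vocab is sorted lazily on the first find(), and each lookup is then O(log vocab_size) string comparisons instead of O(vocab_size). A usage sketch (the token text here is hypothetical):

    var tok = Tokenizer(config.vocab_size, tbuf)
    let id = tok.find(str_to_ptr("ll"))   # first call sorts, then bisects
    if id != -1:
        print(tok.vocab_scores.load(id))  # score of the matched token
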
@@ -232,6 +349,9 @@ struct Config:
         self.n_kv_heads = 0
         self.vocab_size = 0
         self.seq_len = 0
+        self.kv_dim = 0
+        self.kv_mul = 0
+        self.head_size = 0


 struct RunState:
@@ -241,8 +361,8 @@ struct RunState:
     var hb: Matrix  # buffer for hidden dimension in the ffn (hidden_dim,)
     var hb2: Matrix  # buffer for hidden dimension in the ffn (hidden_dim,)
     var q: Matrix  # query (dim,)
-    var k: Matrix  # key (dim,)
-    var v: Matrix  # value (dim,)
+    var k: Matrix  # key (kv_dim,)
+    var v: Matrix  # value (kv_dim,)
     var att: Matrix  # buffer for scores/attention values (n_heads, seq_len)
     var logits: Matrix  # output logits
     var key_cache: Matrix  # (layer, seq_len, dim)
@@ -262,17 +382,15 @@ struct RunState:
         self.hb2.alloc_zero()
         self.q = Matrix(config.dim)
         self.q.alloc_zero()
-        self.k = Matrix(config.dim)
-        self.k.alloc_zero()
-        self.v = Matrix(config.dim)
-        self.v.alloc_zero()
+        self.k = Matrix(0, 0)
+        self.v = Matrix(0, 0)
         self.att = Matrix(config.n_heads, config.seq_len)
         self.att.alloc_zero()
         self.logits = Matrix(config.vocab_size)
         self.logits.alloc_zero()
-        self.key_cache = Matrix(config.n_layers, config.seq_len, config.dim)
+        self.key_cache = Matrix(config.n_layers, config.seq_len, config.kv_dim)
         self.key_cache.alloc_zero()
-        self.value_cache = Matrix(config.n_layers, config.seq_len, config.dim)
+        self.value_cache = Matrix(config.n_layers, config.seq_len, config.kv_dim)
         self.value_cache.alloc_zero()
         self.rt = Runtime(num_cores() // 2)
@@ -293,7 +411,7 @@ struct TransformerWeights:
     var rms_final_weight: Matrix
     var wcls: Matrix

-    fn __init__(inout self, config: Config, shared_weights: Int, inout buf: FileBuf):
+    fn __init__(inout self, config: Config, shared_weights: Int, inout buf: FileBuf) raises:
         self.token_embedding_table = Matrix(config.vocab_size, config.dim)
         # set buf ptr to buf data from file
         self.token_embedding_table.set_buf_ptr(
@@ -305,9 +423,9 @@
         )
         self.wq = Matrix(config.n_layers, config.dim, config.dim)
         self.wq.set_buf_ptr(buf.bitcast_offset_float32(self.wq.size()))
-        self.wk = Matrix(config.n_layers, config.dim, config.dim)
+        self.wk = Matrix(config.n_layers, config.dim, config.kv_dim)
         self.wk.set_buf_ptr(buf.bitcast_offset_float32(self.wk.size()))
-        self.wv = Matrix(config.n_layers, config.dim, config.dim)
+        self.wv = Matrix(config.n_layers, config.dim, config.kv_dim)
         self.wv.set_buf_ptr(buf.bitcast_offset_float32(self.wv.size()))
         self.wo = Matrix(config.n_layers, config.dim, config.dim)
         self.wo.set_buf_ptr(buf.bitcast_offset_float32(self.wo.size()))
@@ -369,64 +487,77 @@ fn config_init(inout config: Config, inout buf: FileBuf) raises:
     config.n_kv_heads = read_val_int(buf)
     config.vocab_size = read_val_int(buf)
     config.seq_len = read_val_int(buf)
-    return None
-
-
-fn tokenizer_init(inout tok: Tokenizer, inout buf: FileBuf) -> None:
-    tok.max_token_length = read_val_int(buf)
-    tok.vocab_scores = BufferPtrFloat32.alloc(tok.vocab_size)
-    tok.vocab = PointerStrings.alloc(tok.vocab_size)
-
-    # read vocab_scores & vocab values (tokens)
-    for i in range(0, tok.vocab_size):
-        tok.vocab_scores.simd_store[1](i, read_val_float32(buf))
-        let slen = read_val_int(buf)
-        tok.vocab.store(i, read_val_str(buf, slen))
-
-    tok.vocab_scores = buf.data.offset(buf.offset).bitcast[DType.float32]()
-    buf.offset += tok.vocab_size * 4
+    config.head_size = config.dim // config.n_heads
+    config.kv_dim = (config.n_kv_heads * config.dim) // config.n_heads
+    config.kv_mul = config.n_heads // config.n_kv_heads
     return None


 fn accum(inout a: BufferPtrFloat32, b: BufferPtrFloat32, size: Int) -> None:
-    for i in range(size):
-        let val = a.offset(i).simd_load[1](0) + b.offset(i).simd_load[1](0)
-        a.offset(i).simd_store[1](0, val)
+    @parameter
+    fn _acc[_nelts: Int](j: Int):
+        a.offset(j).simd_store[_nelts](
+            0, a.offset(j).simd_load[_nelts](0) + b.offset(j).simd_load[_nelts](0)
+        )
+
+    vectorize[nelts, _acc](size)


 fn rmsnorm(
     inout o: BufferPtrFloat32, x: BufferPtrFloat32, weight: BufferPtrFloat32, size: Int
 ) -> None:
     # Calculate sum of squares
-    var ss: Float32 = 0.0
-    for i in range(size):
-        let xx = x.offset(i).simd_load[1](0) ** 2
-        ss += xx
+    var tmp = SIMD[DType.float32, nelts](0)
+
+    @parameter
+    fn _sum2[_nelts: Int](j: Int):
+        if _nelts < nelts:
+            tmp[0] += (x.offset(j).simd_load[_nelts](0) ** 2).reduce_add()
+        else:
+            tmp += x.offset(j).simd_load[nelts](0) ** 2
+
+    vectorize[nelts, _sum2](size)
+
+    var ss: Float32 = tmp.reduce_add()
     ss = ss / size + 1e-5
     ss = 1.0 / math.sqrt(ss)
+
     # Normalize and scale
-    for j in range(size):
-        let val = weight.offset(j).simd_load[1](0) * (ss * x.offset(j).simd_load[1](0))
-        o.offset(j).simd_store[1](0, val)
+    @parameter
+    fn _norm[_nelts: Int](j: Int):
+        let val = weight.simd_load[_nelts](j) * ss * x.simd_load[_nelts](j)
+        o.offset(j).simd_store[_nelts](0, val)
+
+    vectorize[nelts, _norm](size)


 fn softmax(inout x: BufferPtrFloat32, size: Int) -> None:
     # Find max value (for numerical stability)
-    var max_val: Float32 = x.offset(0).simd_load[1](0)
-    for i in range(size):
-        let xi = x.offset(i).simd_load[1](0)
-        if xi > max_val:
-            max_val = xi
+    var max_val: Float32 = -1e9
+
+    @parameter
+    fn _max[_nelts: Int](j: Int):
+        let val = x.simd_load[_nelts](j).reduce_max()
+        if val > max_val:
+            max_val = val
+
+    vectorize[nelts, _max](size)
+
     # Exp and sum
     var ssum: Float32 = 0.0
-    for i in range(size):
-        let xi = x.offset(i).simd_load[1](0)
-        x.offset(i).simd_store[1](0, math.exp(xi - max_val))
-        ssum += x.offset(i).simd_load[1](0)
-    # Normalize
-    for i in range(size):
-        let xi = x.offset(i).simd_load[1](0)
-        x.offset(i).simd_store[1](0, xi / ssum)
+
+    @parameter
+    fn _sum_exp[_nelts: Int](j: Int):
+        x.simd_store[_nelts](j, math.exp(x.simd_load[_nelts](j) - max_val))
+        ssum += x.simd_load[_nelts](j).reduce_add()
+
+    vectorize[nelts, _sum_exp](size)
+
+    @parameter
+    fn _norm[_nelts: Int](j: Int):
+        x.simd_store[_nelts](j, x.simd_load[_nelts](j) / ssum)
+
+    vectorize[nelts, _norm](size)


 fn matmul_parallelized(C: Matrix, A: Matrix, B: Matrix, rt: Runtime):
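
Worked through for concrete shapes: with dim = 288 and n_heads = 6 (the stories15M layout), head_size = 48, kv_dim = (6 * 288) // 6 = 288 and kv_mul = 1, so classic multi-head checkpoints behave exactly as before. For a hypothetical grouped-query checkpoint with n_heads = 8 and n_kv_heads = 4, kv_dim = dim // 2 and kv_mul = 2, halving the K/V projections and the KV cache.
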
@@ -463,7 +594,9 @@ fn transformer(
     var x = state.x.data
     let dim = config.dim
     let hidden_dim = config.hidden_dim
-    let head_size = dim // config.n_heads
+    let head_size = config.head_size
+    let kv_dim = config.kv_dim
+    let kv_mul = config.kv_mul

     # tmp matrix for matmul operations
     var tmpw = Matrix(0, 0)
@@ -485,37 +618,41 @@
         tmpw.set_buf_ptr(weights.wq.data.offset(l * dim * dim), dim, dim)
         matmul(state.q, state.xb, tmpw, state.rt)

-        tmpw.set_buf_ptr(weights.wk.data.offset(l * dim * dim), dim, dim)
+        let loff = l * config.seq_len * kv_dim
+        state.k.set_buf_ptr(state.key_cache.data.offset(loff + pos * kv_dim), 1, kv_dim)
+        tmpw.set_buf_ptr(weights.wk.data.offset(l * dim * kv_dim), kv_dim, dim)
         matmul(state.k, state.xb, tmpw, state.rt)

-        tmpw.set_buf_ptr(weights.wv.data.offset(l * dim * dim), dim, dim)
+        state.v.set_buf_ptr(
+            state.value_cache.data.offset(loff + pos * kv_dim), 1, kv_dim
+        )
+        tmpw.set_buf_ptr(weights.wv.data.offset(l * dim * kv_dim), kv_dim, dim)
         matmul(state.v, state.xb, tmpw, state.rt)

         # Apply RoPE rotation to the q and k vectors for each head
-        for h in range(config.n_heads):
-            # Get the q and k vectors for this head
-            let q = state.q.data.offset(h * head_size)
-            let k = state.k.data.offset(h * head_size)
-
-            # Rotate q and k by the freq_cis_real and freq_cis_imag
-            for i in range(0, head_size, 2):
-                let q0 = q.offset(i).simd_load[1](0)
-                let q1 = q.offset(i + 1).simd_load[1](0)
-                let k0 = k.offset(i).simd_load[1](0)
-                let k1 = k.offset(i + 1).simd_load[1](0)
-                let fcr = freq_cis_real_row.offset(i // 2).simd_load[1](0)
-                let fci = freq_cis_imag_row.offset(i // 2).simd_load[1](0)
-                q.offset(i).simd_store[1](0, q0 * fcr - q1 * fci)
-                q.offset(i + 1).simd_store[1](0, q0 * fci + q1 * fcr)
-                k.offset(i).simd_store[1](0, k0 * fcr - k1 * fci)
-                k.offset(i + 1).simd_store[1](0, k0 * fci + k1 * fcr)
-
-        # Save key,value at this time step (pos) to our kv cache
-        let loff = l * config.seq_len * dim  # kv cache layer offset for convenience
-        let key_cache_row = state.key_cache.data.offset(loff + pos * dim)
-        let value_cache_row = state.value_cache.data.offset(loff + pos * dim)
-        memcpy[DType.float32](key_cache_row, state.k.data, config.dim)
-        memcpy[DType.float32](value_cache_row, state.v.data, config.dim)
+        let q = state.q.data
+        let k = state.k.data
+        for i in range(0, head_size * config.n_kv_heads, 2):
+            let head_dim_half = i % head_size // 2
+            let fcr = freq_cis_real_row.offset(head_dim_half).load(0)
+            let fci = freq_cis_imag_row.offset(head_dim_half).load(0)
+            let q0 = q.offset(i).load(0)
+            let q1 = q.offset(i + 1).load(0)
+            let k0 = k.offset(i).load(0)
+            let k1 = k.offset(i + 1).load(0)
+            q.offset(i).store(0, q0 * fcr - q1 * fci)
+            q.offset(i + 1).store(0, q0 * fci + q1 * fcr)
+            k.offset(i).store(0, k0 * fcr - k1 * fci)
+            k.offset(i + 1).store(0, k0 * fci + k1 * fcr)
+
+        for i in range(head_size * config.n_kv_heads, dim, 2):
+            let head_dim_half = i % head_size // 2
+            let fcr = freq_cis_real_row.offset(head_dim_half).load(0)
+            let fci = freq_cis_imag_row.offset(head_dim_half).load(0)
+            let q0 = q.offset(i).load(0)
+            let q1 = q.offset(i + 1).load(0)
+            q.offset(i).store(0, q0 * fcr - q1 * fci)
+            q.offset(i + 1).store(0, q0 * fci + q1 * fcr)

         # Multihead attention. Iterate over all heads
         for h in range(config.n_heads):
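
For reference, each even/odd pair is rotated as a complex multiplication by e^(i*theta): with fcr = cos(theta) and fci = sin(theta), (x0, x1) becomes (x0 * fcr - x1 * fci, x0 * fci + x1 * fcr). The first loop rotates the first head_size * n_kv_heads positions of both q and k; the second loop covers the remaining q positions, which have no k counterpart once n_kv_heads < n_heads. Note also that k and v are now projected directly into the cache rows via set_buf_ptr, replacing the old memcpy into the cache.
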
@@ -528,15 +665,17 @@
             # Iterate over all timesteps, including the current one
             for t in range(pos + 1):
                 # Get the key vector for this head and at this timestep
-                let k = state.key_cache.data.offset(loff + t * dim + h * head_size)
+                let k = state.key_cache.data.offset(
+                    loff + t * kv_dim + (h // kv_mul) * head_size
+                )
                 # Calculate the attention score as the dot product of q and k
                 var score: Float32 = 0.0
                 for i in range(head_size):
-                    score += q.offset(i).simd_load[1](0) * k.offset(i).simd_load[1](0)
+                    score += q.offset(i).load(0) * k.offset(i).load(0)
                 score /= math.sqrt[DType.float32, 1](head_size)

                 # Save the score to the attention buffer
-                att.offset(t).simd_store[1](0, score)
+                att.offset(t).store(0, score)

             # Softmax the scores to get attention weights, from 0..pos inclusively
             softmax(att, pos + 1)
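
This is the grouped-query indexing: query head h attends over the cache of key/value head h // kv_mul, at row offset loff + t * kv_dim + (h // kv_mul) * head_size. With the hypothetical n_heads = 8, n_kv_heads = 4 shapes from above, kv_mul = 2, so query heads 0 and 1 share kv head 0, heads 2 and 3 share kv head 1, and so on.
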
@@ -546,15 +685,15 @@
             memset_zero(xb, head_size)
             for t in range(pos + 1):
                 # Get the value vector for this head and at this timestep
-                let v = state.value_cache.data.offset(loff + t * dim + h * head_size)
+                let v = state.value_cache.data.offset(
+                    loff + t * kv_dim + (h // kv_mul) * head_size
+                )
                 # Get the attention weight for this timestep
-                let a = att.offset(t).simd_load[1](0)
+                let a = att.offset(t).load(0)
                 # Accumulate the weighted value into xb
                 for i in range(head_size):
-                    let xbi = xb.offset(i).simd_load[1](0) + a * v.offset(i).simd_load[
-                        1
-                    ](0)
-                    xb.offset(i).simd_store[1](0, xbi)
+                    let xbi = xb.offset(i).load(0) + a * v.offset(i).load(0)
+                    xb.offset(i).store(0, xbi)
         # Final matrix multiplication to get the output of the attention
         tmpw.set_buf_ptr(weights.wo.data.offset(l * dim * dim), dim, dim)
         matmul(state.xb2, state.xb, tmpw, state.rt)
@@ -616,29 +755,15 @@ fn sample(probabilities: Matrix) -> Int:
     var cdf: Float32 = 0.0
     for i in range(n):
         cdf += probabilities[i]
-        if r.simd_load[1](0) < cdf:
+        if r.load(0) < cdf:
             return i
     return n - 1  # In case of rounding errors


-fn str_lookup(str: PointerString, tok: Tokenizer) -> Int:
-    for pos in range(tok.vocab_size):
-        let s1 = tok.vocab[pos]
-        var p1 = 0
-        while s1[p1] != 0 and str[p1] != 0:
-            if s1[p1] != str[p1]:
-                break
-            p1 += 1
-        if s1[p1] != 0 or str[p1] != 0:
-            continue
-        return pos
-    return -1
-
-
-fn bpe_encode(inout tokens: DynamicVector[Int], text: String, tok: Tokenizer):
+fn bpe_encode(inout tokens: DynamicVector[Int], text: String, inout tok: Tokenizer):
     for pos in range(len(text)):
         let char = str_to_ptr(text[pos])
-        let id = str_lookup(char, tok)
+        let id = tok.find(char)

         if id == -1:
             print("Not a good prompt token at pos ", pos)
@@ -653,7 +778,7 @@ fn bpe_encode(inout tokens: DynamicVector[Int], text: String, tok: Tokenizer):
         for i in range(len(tokens) - 1):
             # Check if we can merge the pair (tokens[i], tokens[i+1])
             let str = str_concat(tok.vocab[tokens[i]], tok.vocab[tokens[i + 1]])
-            let id = str_lookup(str, tok)
+            let id = tok.find(str)
             if id != -1 and tok.vocab_scores.load(id) > best_score:
                 best_score = tok.vocab_scores.load(id)
                 best_id = id
@@ -674,7 +799,20 @@
         tokens = _tokens


+fn str2num(d: Int) -> Int:
+    # convert hex digit to decimal
+    if d >= ord("A"):
+        return d - ord("A") + 10
+    return d - ord("0")
+
+
 fn print_str(s: PointerString):
+    # print raw byte like <0x0A>
+    if (s[1].to_int() == ord("0")) and (s[2].to_int() == ord("x")):
+        let d1: Int = s[3].to_int()
+        let d2: Int = s[4].to_int()
+        print_no_newline(chr(str2num(d1) * 16 + str2num(d2)))
+        return
     # print all chars till null character
     var p: Int = 0
     while s[p].to_int() != 0:
@@ -689,7 +827,10 @@ fn time_in_ms() -> Int:

 fn print_usage():
     print("Usage: mojo llama2.mojo <checkpoint> [options]")
-    print("Example: mojo llama2.mojo stories15M.bin -s 99 -n 256 -t 0.5 -i \"Llama is an animal\"")
+    print(
+        'Example: mojo llama2.mojo stories15M.bin -s 99 -n 256 -t 0.5 -i "Llama is an'
+        ' animal"'
+    )
     print("Options:")
     print(" -s <int> random seed, default time.now()")
     print(" -t <float> temperature in [0,1.0], default 1.0")
@@ -718,6 +859,8 @@ fn main() raises:
             print("Option not supported: ", args[i])
         if args[i] == "-n":
             steps = atol(args[i + 1])
+        if args[i] == "-tk":
+            tokenizer = args[i + 1]
         if args[i] == "-s":
             rng_seed = atol(args[i + 1])
         if args[i] == "-i":
@@ -748,7 +891,7 @@
     var config: Config = Config()

     read_file(checkpoint, fbuf)
-    print("checkpoint size: ", fbuf.size)
+    print("checkpoint size: ", fbuf.size, "[", fbuf.size // 1024 // 1024, "MB ]")
     config_init(config, fbuf)

     # negative vocab size is hacky way of signaling unshared weights. bit yikes.
@@ -759,14 +902,12 @@

     let weights: TransformerWeights = TransformerWeights(config, shared_weights, fbuf)

-    var tok: Tokenizer = Tokenizer(config.vocab_size)
-
     if steps <= 0 or steps > config.seq_len:
         steps = config.seq_len

     # Read in the tokenizer.bin file
     read_file(tokenizer, tbuf)
-    tokenizer_init(tok, tbuf)
+    var tok = Tokenizer(config.vocab_size, tbuf)

     # Create and initialize the application RunState
     var state = RunState(config)
@@ -805,6 +946,9 @@
             # Sample from this distribution to get the next token
             next_token = sample(state.logits)

+        # Finish generating when EOS, BOS appear
+        if next_token == 1 or next_token == 2:
+            break
         var token_str: PointerString = tok.vocab[next_token]
         if token == 1 and token_str[0] == ord(" "):
             token_str = token_str.offset(1)
@@ -819,4 +963,4 @@
         start = time_in_ms()

     let end = time_in_ms()
-    print("\nachieved tok/s: ", (steps - 1) / (end - start) * 1000)
+    print("\nachieved tok/s: ", (pos - 1) / (end - start) * 1000)
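
Since generation can now stop early on BOS/EOS, throughput is computed from the final position actually reached: tok/s = (pos - 1) / (end - start) * 1000, rather than assuming all steps tokens were produced.
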
t260.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:037cb335abb25d1fa9e8ecae30ed2a3a8ace9302862ebcdc05d51a6bbb10c312
+size 6227