// btlm-3b-ggml / btlm_model_wip.cpp
#include "ggml/ggml.h"
#include "common-ggml.h"
#include "common.h"
#include <algorithm> // std::min
#include <cassert>
#include <cinttypes>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <iostream>
#include <map>
#include <string>
#include <thread> // std::thread::hardware_concurrency
#include <vector>
struct btlm_vocab {
using id = int32_t;
using token = std::string;
std::map<token, id> token_to_id;
std::map<id, token> id_to_token;
std::vector<std::string> special_tokens;
};
struct btlm_params {
int32_t seed = -1; // RNG seed
int32_t n_threads = std::min(4, (int32_t)std::thread::hardware_concurrency());
int32_t n_predict = 200; // new tokens to predict
int32_t n_batch = 8; // batch size for prompt processing
// sampling parameters
int32_t top_k = 40;
float top_p = 0.9f;
float temp = 0.9f;
int32_t repeat_last_n = 64;
float repeat_penalty = 1.00f;
std::string model =
"/home/madman/Desktop/ml_play/ml_models/btlm-3b.ggml.bin"; // model path
std::string prompt = "Capital of Nepal is";
std::string token_test = "";
};
struct btlm_hparams {
int32_t n_vocab;
int32_t n_ctx;
int32_t n_embd;
int32_t n_head;
int32_t n_layer;
int32_t n_inner;
int32_t ftype;
};
struct btlm_layer {
// normalization
struct ggml_tensor *ln_1_g;
struct ggml_tensor *ln_1_b;
struct ggml_tensor *ln_2_g;
struct ggml_tensor *ln_2_b;
// attention
struct ggml_tensor *c_attn_attn_w;
struct ggml_tensor *c_attn_attn_b;
struct ggml_tensor *c_attn_attn_scb;
struct ggml_tensor *c_attn_proj_w;
struct ggml_tensor *c_attn_proj_b;
struct ggml_tensor *c_attn_proj_scb;
// mlp
struct ggml_tensor *c_mlp_fc_w;
struct ggml_tensor *c_mlp_fc_b;
struct ggml_tensor *c_mlp_fc_scb;
struct ggml_tensor *c_mlp_fc2_w;
struct ggml_tensor *c_mlp_fc2_b;
struct ggml_tensor *c_mlp_fc2_scb;
struct ggml_tensor *c_mlp_proj_w;
struct ggml_tensor *c_mlp_proj_b;
struct ggml_tensor *c_mlp_proj_scb;
};
struct btlm_model {
btlm_hparams hparams;
// normalization
struct ggml_tensor *ln_f_g;
struct ggml_tensor *ln_f_b;
  struct ggml_tensor *wte; // token embedding
struct ggml_tensor *alibi_slopes;
struct ggml_tensor *lm_head; // language model head
std::vector<btlm_layer> layers;
  // key + value memory (KV cache; declared but not yet allocated in this WIP)
struct ggml_tensor *memory_k;
struct ggml_tensor *memory_v;
//
struct ggml_context *ctx;
std::map<std::string, struct ggml_tensor *> tensors;
};
// load the model's weights from a file
bool btlm_model_load(const std::string &fname, btlm_model &model,
btlm_vocab &vocab) {
printf("%s: loading model from '%s'\n", __func__, fname.c_str());
auto fin = std::ifstream(fname, std::ios::binary);
if (!fin) {
fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
return false;
}
// verify magic
{
uint32_t magic;
fin.read((char *)&magic, sizeof(magic));
if (magic != GGML_FILE_MAGIC) {
fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__,
fname.c_str());
return false;
}
}
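  // file layout (as parsed below): magic, 7 x int32 hparams, n_vocab vocab
  // entries of {uint32 len, len bytes}, then tensor records of
  // {int32 n_dims, int32 name_len, int32 ttype, n_dims x int32 ne,
  //  name bytes, tensor data}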
// load hparams
{
auto &hparams = model.hparams;
fin.read((char *)&hparams.n_vocab, sizeof(hparams.n_vocab));
fin.read((char *)&hparams.n_ctx, sizeof(hparams.n_ctx));
fin.read((char *)&hparams.n_embd, sizeof(hparams.n_embd));
fin.read((char *)&hparams.n_head, sizeof(hparams.n_head));
fin.read((char *)&hparams.n_layer, sizeof(hparams.n_layer));
fin.read((char *)&hparams.n_inner, sizeof(hparams.n_inner));
fin.read((char *)&hparams.ftype, sizeof(hparams.ftype));
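    // ftype packs the quantization version into the high part:
    // ftype = base_ftype + GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR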
const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
printf("%s: n_head = %d\n", __func__, hparams.n_head);
printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
printf("%s: n_inner = %d\n", __func__, hparams.n_inner);
printf("%s: ftype = %d\n", __func__, hparams.ftype);
printf("%s: qntvr = %d\n", __func__, qntvr);
hparams.ftype %= GGML_QNT_VERSION_FACTOR;
}
// for the big tensors, we have the option to store the data in 16-bit floats
// or quantized in order to save memory and also to speed up the computation
ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype)(model.hparams.ftype));
if (wtype == GGML_TYPE_COUNT) {
fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
__func__, fname.c_str(), model.hparams.ftype);
return false;
}
auto &ctx = model.ctx;
size_t ctx_size = 0;
{
    ctx_size = 9000000000ull; // fixme => actually calculate this from hparams
    printf("%s: ggml tensor size = %zu bytes\n", __func__,
           sizeof(ggml_tensor));
    printf("%s: ggml ctx size = %6.2f MB\n", __func__,
           ctx_size / (1024.0 * 1024.0));
    printf("%s: ggml ctx size = %zu bytes\n", __func__, ctx_size);
}
// create the ggml context
{
struct ggml_init_params params = {
/*.mem_size =*/ctx_size,
/*.mem_buffer =*/NULL,
/*.no_alloc =*/false,
};
model.ctx = ggml_init(params);
if (!model.ctx) {
fprintf(stderr, "%s: ggml_init() failed\n", __func__);
return false;
}
}
// load vocab
{
int32_t n_vocab = model.hparams.n_vocab;
std::string word;
std::vector<char> buf(128);
for (int i = 0; i < n_vocab; i++) {
uint32_t len;
fin.read((char *)&len, sizeof(len));
buf.resize(len);
fin.read((char *)buf.data(), len);
word.assign(buf.data(), len);
// printf("%s \n", word.c_str());
vocab.token_to_id[word] = i;
vocab.id_to_token[i] = word;
}
}
{
// alloc memory
const auto &hparams = model.hparams;
const int n_embd = hparams.n_embd;
const int n_layer = hparams.n_layer;
// const int n_ctx = hparams.n_ctx;
const int n_vocab = hparams.n_vocab;
model.layers.resize(n_layer);
model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_embd);
model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_embd);
model.wte = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_embd, n_vocab);
model.lm_head = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_embd, n_vocab);
    // ALiBi slopes; count hardcoded to 32 rather than derived from hparams.n_head
    model.alibi_slopes = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 32);
// map by name
model.tensors["model/ln_f/g"] = model.ln_f_g;
model.tensors["model/ln_f/b"] = model.ln_f_b;
model.tensors["model/wte"] = model.wte;
model.tensors["model/lm_head"] = model.lm_head;
model.tensors["model/relative_pe/slopes"] = model.alibi_slopes;
for (int i = 0; i < n_layer; ++i) {
auto &layer = model.layers[i];
layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_embd);
layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_embd);
layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_embd);
layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_embd);
      layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, 3 * n_embd, n_embd);
layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 3 * n_embd);
layer.c_attn_attn_scb =
ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 3 * n_embd);
layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_embd, n_embd);
layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_embd);
layer.c_attn_proj_scb = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_embd);
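      // note: the MLP inner dims below are hardcoded; 6826 matches the
      // n_inner read from the header for this model, while 6832 and 6848
      // appear to be padded variants of it (WIP)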
layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, 6832, n_embd);
layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 6826);
layer.c_mlp_fc_scb = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 6826);
      layer.c_mlp_fc2_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_embd, 6832);
layer.c_mlp_fc2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 6826);
layer.c_mlp_fc2_scb = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 6826);
layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_embd, 6848);
layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_embd);
layer.c_mlp_proj_scb = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_embd);
// map by name
model.tensors["model/h" + std::to_string(i) + "/ln_1/g"] = layer.ln_1_g;
model.tensors["model/h" + std::to_string(i) + "/ln_1/b"] = layer.ln_1_b;
model.tensors["model/h" + std::to_string(i) + "/ln_2/g"] = layer.ln_2_g;
model.tensors["model/h" + std::to_string(i) + "/ln_2/b"] = layer.ln_2_b;
model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/w"] = layer.c_attn_attn_w;
model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/b"] = layer.c_attn_attn_b;
model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/scb"] = layer.c_attn_attn_scb;
model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/w"] =
layer.c_attn_proj_w;
model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/b"] =
layer.c_attn_proj_b;
model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/scb"] =
layer.c_attn_proj_scb;
model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/w"] =
layer.c_mlp_fc_w;
model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/b"] =
layer.c_mlp_fc_b;
model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/scb"] =
layer.c_mlp_fc_scb;
model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc2/w"] =
layer.c_mlp_fc2_w;
model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc2/b"] =
layer.c_mlp_fc2_b;
model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc2/scb"] =
layer.c_mlp_fc2_scb;
model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"] =
layer.c_mlp_proj_w;
model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/b"] =
layer.c_mlp_proj_b;
model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/scb"] =
layer.c_mlp_proj_scb;
}
}
// load weights
{
size_t total_size = 0;
while (true) {
int32_t n_dims;
int32_t length;
int32_t ttype;
fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
fin.read(reinterpret_cast<char *>(&length), sizeof(length));
fin.read(reinterpret_cast<char *>(&ttype), sizeof(ttype));
if (fin.eof()) {
break;
}
      if (n_dims < 1 || n_dims > 2) {
        fprintf(stderr, "%s: unsupported n_dims = %d in model file\n",
                __func__, n_dims);
        return false;
      }
      int32_t nelements = 1;
      int32_t ne[2] = {1, 1};
for (int i = 0; i < n_dims; ++i) {
fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
nelements *= ne[i];
}
std::string name(length, 0);
fin.read(&name[0], length);
printf("processing tensor '%s' in model file\n", name.data());
if (model.tensors.find(name.data()) == model.tensors.end()) {
fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__,
name.data());
return false;
}
auto tensor = model.tensors[name.data()];
if (ggml_nelements(tensor) != nelements) {
fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n",
__func__, name.data());
return false;
}
if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
fprintf(stderr,
"%s: tensor '%s' has wrong shape in model file: got [%d, %d], "
"expected [%d, %d]\n",
__func__, name.data(), (int)tensor->ne[0], (int)tensor->ne[1],
ne[0], ne[1]);
return false;
}
// for debugging
if (1) {
printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n",
name.data(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)),
ggml_nbytes(tensor) / 1024.0 / 1024.0, ggml_nbytes(tensor));
}
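      // size check: for quantized types ggml_type_size() returns bytes per
      // block, so expected bytes = nelements * type_size / block_size, which
      // should match ggml_nbytes(tensor)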
const size_t bpe = ggml_type_size(ggml_type(ttype));
if ((nelements * bpe) / ggml_blck_size(tensor->type) !=
ggml_nbytes(tensor)) {
fprintf(stderr,
"%s: tensor '%s' has wrong size in model file: got %zu, "
"expected %zu\n",
__func__, name.data(), ggml_nbytes(tensor), nelements * bpe);
return false;
}
fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
total_size += ggml_nbytes(tensor);
}
printf("%s: model size = %8.2f MB\n", __func__,
total_size / 1024.0 / 1024.0);
}
fin.close();
return true;
}
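// WIP entry point: only weight/vocab loading is exercised; the sampling
// parameters in btlm_params (top_k, top_p, temp, ...) are not wired up yet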
int main(int argc, char **argv) {
  (void)argc;
  (void)argv;
  btlm_params params;
  btlm_model model;
  btlm_vocab vocab;
  if (!btlm_model_load(params.model, model, vocab)) {
    fprintf(stderr, "%s: failed to load model from '%s'\n", __func__,
            params.model.c_str());
    return 1;
  }
  return 0;
}