#include "ggml/ggml.h" #include "common-ggml.h" #include "common.h" #include #include #include #include #include #include #include #include #include #include #include struct btlm_vocab { using id = int32_t; using token = std::string; std::map token_to_id; std::map id_to_token; std::vector special_tokens; }; struct btlm_params { int32_t seed = -1; // RNG seed int32_t n_threads = std::min(4, (int32_t)std::thread::hardware_concurrency()); int32_t n_predict = 200; // new tokens to predict int32_t n_batch = 8; // batch size for prompt processing // sampling parameters int32_t top_k = 40; float top_p = 0.9f; float temp = 0.9f; int32_t repeat_last_n = 64; float repeat_penalty = 1.00f; std::string model = "/home/madman/Desktop/ml_play/ml_models/btlm-3b.ggml.bin"; // model path std::string prompt = "Capital of Nepal is"; std::string token_test = ""; }; struct btlm_hparams { int32_t n_vocab; int32_t n_ctx; int32_t n_embd; int32_t n_head; int32_t n_layer; int32_t n_inner; int32_t ftype; }; struct btlm_layer { // normalization struct ggml_tensor *ln_1_g; struct ggml_tensor *ln_1_b; struct ggml_tensor *ln_2_g; struct ggml_tensor *ln_2_b; // attention struct ggml_tensor *c_attn_attn_w; struct ggml_tensor *c_attn_attn_b; struct ggml_tensor *c_attn_attn_scb; struct ggml_tensor *c_attn_proj_w; struct ggml_tensor *c_attn_proj_b; struct ggml_tensor *c_attn_proj_scb; // mlp struct ggml_tensor *c_mlp_fc_w; struct ggml_tensor *c_mlp_fc_b; struct ggml_tensor *c_mlp_fc_scb; struct ggml_tensor *c_mlp_fc2_w; struct ggml_tensor *c_mlp_fc2_b; struct ggml_tensor *c_mlp_fc2_scb; struct ggml_tensor *c_mlp_proj_w; struct ggml_tensor *c_mlp_proj_b; struct ggml_tensor *c_mlp_proj_scb; }; struct btlm_model { btlm_hparams hparams; // normalization struct ggml_tensor *ln_f_g; struct ggml_tensor *ln_f_b; struct ggml_tensor *wte; // position embedding struct ggml_tensor *alibi_slopes; struct ggml_tensor *lm_head; // language model head std::vector layers; // key + value memory struct ggml_tensor *memory_k; struct ggml_tensor *memory_v; // struct ggml_context *ctx; std::map tensors; }; // load the model's weights from a file bool btlm_model_load(const std::string &fname, btlm_model &model, btlm_vocab &vocab) { printf("%s: loading model from '%s'\n", __func__, fname.c_str()); auto fin = std::ifstream(fname, std::ios::binary); if (!fin) { fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str()); return false; } // verify magic { uint32_t magic; fin.read((char *)&magic, sizeof(magic)); if (magic != GGML_FILE_MAGIC) { fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str()); return false; } } // load hparams { auto &hparams = model.hparams; fin.read((char *)&hparams.n_vocab, sizeof(hparams.n_vocab)); fin.read((char *)&hparams.n_ctx, sizeof(hparams.n_ctx)); fin.read((char *)&hparams.n_embd, sizeof(hparams.n_embd)); fin.read((char *)&hparams.n_head, sizeof(hparams.n_head)); fin.read((char *)&hparams.n_layer, sizeof(hparams.n_layer)); fin.read((char *)&hparams.n_inner, sizeof(hparams.n_inner)); fin.read((char *)&hparams.ftype, sizeof(hparams.ftype)); const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR; printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); printf("%s: n_embd = %d\n", __func__, hparams.n_embd); printf("%s: n_head = %d\n", __func__, hparams.n_head); printf("%s: n_layer = %d\n", __func__, hparams.n_layer); printf("%s: n_inner = %d\n", __func__, hparams.n_inner); printf("%s: ftype = %d\n", __func__, hparams.ftype); 
printf("%s: qntvr = %d\n", __func__, qntvr); hparams.ftype %= GGML_QNT_VERSION_FACTOR; } // for the big tensors, we have the option to store the data in 16-bit floats // or quantized in order to save memory and also to speed up the computation ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype)(model.hparams.ftype)); if (wtype == GGML_TYPE_COUNT) { fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n", __func__, fname.c_str(), model.hparams.ftype); return false; } auto &ctx = model.ctx; size_t ctx_size = 0; { ctx_size = 9000000000; // fixme => actually calculate this printf("%s: ggml tensor size = %d bytes\n", __func__, (int)sizeof(ggml_tensor)); printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size / (1024.0 * 1024.0)); printf("%s: ggml ctx size = %d \n", __func__, ctx_size); } // create the ggml context { struct ggml_init_params params = { /*.mem_size =*/ctx_size, /*.mem_buffer =*/NULL, /*.no_alloc =*/false, }; model.ctx = ggml_init(params); if (!model.ctx) { fprintf(stderr, "%s: ggml_init() failed\n", __func__); return false; } } // load vocab { int32_t n_vocab = model.hparams.n_vocab; std::string word; std::vector buf(128); for (int i = 0; i < n_vocab; i++) { uint32_t len; fin.read((char *)&len, sizeof(len)); buf.resize(len); fin.read((char *)buf.data(), len); word.assign(buf.data(), len); // printf("%s \n", word.c_str()); vocab.token_to_id[word] = i; vocab.id_to_token[i] = word; } } { // alloc memory const auto &hparams = model.hparams; const int n_embd = hparams.n_embd; const int n_layer = hparams.n_layer; // const int n_ctx = hparams.n_ctx; const int n_vocab = hparams.n_vocab; model.layers.resize(n_layer); model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_embd); model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_embd); model.wte = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_embd, n_vocab); model.lm_head = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_embd, n_vocab); model.alibi_slopes = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 32); // map by name model.tensors["model/ln_f/g"] = model.ln_f_g; model.tensors["model/ln_f/b"] = model.ln_f_b; model.tensors["model/wte"] = model.wte; model.tensors["model/lm_head"] = model.lm_head; model.tensors["model/relative_pe/slopes"] = model.alibi_slopes; for (int i = 0; i < n_layer; ++i) { auto &layer = model.layers[i]; layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_embd); layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_embd); layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_embd); layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_embd); layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, 3 * n_embd, n_embd ); layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 3 * n_embd); layer.c_attn_attn_scb = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 3 * n_embd); layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_embd, n_embd); layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_embd); layer.c_attn_proj_scb = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_embd); layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, 6832, n_embd); layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 6826); layer.c_mlp_fc_scb = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 6826); layer.c_mlp_fc2_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_embd, 6832 ); layer.c_mlp_fc2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 6826); layer.c_mlp_fc2_scb = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 6826); layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_embd, 6848); layer.c_mlp_proj_b = 
            layer.c_mlp_proj_scb = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_embd);

            // map by name
            model.tensors["model/h" + std::to_string(i) + "/ln_1/g"] = layer.ln_1_g;
            model.tensors["model/h" + std::to_string(i) + "/ln_1/b"] = layer.ln_1_b;
            model.tensors["model/h" + std::to_string(i) + "/ln_2/g"] = layer.ln_2_g;
            model.tensors["model/h" + std::to_string(i) + "/ln_2/b"] = layer.ln_2_b;

            model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/w"]   = layer.c_attn_attn_w;
            model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/b"]   = layer.c_attn_attn_b;
            model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/scb"] = layer.c_attn_attn_scb;

            model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/w"]   = layer.c_attn_proj_w;
            model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/b"]   = layer.c_attn_proj_b;
            model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/scb"] = layer.c_attn_proj_scb;

            model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/w"]   = layer.c_mlp_fc_w;
            model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/b"]   = layer.c_mlp_fc_b;
            model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/scb"] = layer.c_mlp_fc_scb;

            model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc2/w"]   = layer.c_mlp_fc2_w;
            model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc2/b"]   = layer.c_mlp_fc2_b;
            model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc2/scb"] = layer.c_mlp_fc2_scb;

            model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"]   = layer.c_mlp_proj_w;
            model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/b"]   = layer.c_mlp_proj_b;
            model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/scb"] = layer.c_mlp_proj_scb;
        }
    }

    // load weights
    {
        size_t total_size = 0;

        bool has_lm_head = false;

        while (true) {
            int32_t n_dims;
            int32_t length;
            int32_t ttype;

            fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
            fin.read(reinterpret_cast<char *>(&length), sizeof(length));
            fin.read(reinterpret_cast<char *>(&ttype),  sizeof(ttype));

            if (fin.eof()) {
                break;
            }

            int32_t nelements = 1;
            int32_t ne[2] = {1, 1};
            for (int i = 0; i < n_dims; ++i) {
                fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
                nelements *= ne[i];
            }

            std::string name(length, 0);
            fin.read(&name[0], length);
            printf("processing tensor '%s' in model file\n", name.data());

            if (model.tensors.find(name.data()) == model.tensors.end()) {
                fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
                return false;
            }

            auto tensor = model.tensors[name.data()];
            if (ggml_nelements(tensor) != nelements) {
                fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
                return false;
            }

            if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
                fprintf(stderr,
                        "%s: tensor '%s' has wrong shape in model file: got [%d, %d], "
                        "expected [%d, %d]\n",
                        __func__, name.data(), (int)tensor->ne[0], (int)tensor->ne[1], ne[0], ne[1]);
                return false;
            }

            // for debugging
            if (1) {
                printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0],
                       ne[1], ggml_type_name(ggml_type(ttype)),
                       ggml_nbytes(tensor) / 1024.0 / 1024.0, ggml_nbytes(tensor));
            }

            const size_t bpe = ggml_type_size(ggml_type(ttype));

            if ((nelements * bpe) / ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
                fprintf(stderr,
                        "%s: tensor '%s' has wrong size in model file: got %zu, "
                        "expected %zu\n",
                        __func__, name.data(), ggml_nbytes(tensor), nelements * bpe);
                return false;
            }

            fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));

            total_size += ggml_nbytes(tensor);
        }
%8.2f MB\n", __func__, total_size / 1024.0 / 1024.0); } fin.close(); return true; } int main(int argc, char **argv) { btlm_params params; btlm_model models; btlm_vocab vocab; btlm_model_load(params.model, models, vocab); return 0; }