// File size: 12,683 Bytes

#include "ggml/ggml.h"

#include "common-ggml.h"
#include "common.h"

#include <algorithm>
#include <cassert>
#include <cinttypes>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <iostream>
#include <map>
#include <stdint.h>
#include <string>
#include <thread>
#include <vector>

// Vocabulary lookup tables: token text <-> integer id, in both directions.
struct btlm_vocab {
  using id = int32_t;
  using token = std::string;

  // forward lookup: token text -> id
  std::map<token, id> token_to_id;
  // reverse lookup: id -> token text
  std::map<id, token> id_to_token;
  // list of special token strings
  std::vector<token> special_tokens;
};

// Run-time options with their defaults: threading, generation length,
// sampling settings, and the default model path / prompt.
struct btlm_params {
  int32_t seed = -1; // RNG seed

  // Worker thread count, clamped to the range [1, 4].
  // std::thread::hardware_concurrency() is allowed to return 0 when the value
  // cannot be determined; without the lower clamp that would yield 0 threads.
  int32_t n_threads = std::max(
      1, std::min(4, static_cast<int32_t>(std::thread::hardware_concurrency())));

  int32_t n_predict = 200; // new tokens to predict
  int32_t n_batch = 8;     // batch size for prompt processing

  // sampling parameters
  int32_t top_k = 40;
  float top_p = 0.9f;
  float temp = 0.9f;
  int32_t repeat_last_n = 64;
  float repeat_penalty = 1.00f;

  // NOTE(review): the default model path is an absolute path on a developer
  // machine; it is kept as-is because main() relies on it.
  std::string model =
      "/home/madman/Desktop/ml_play/ml_models/btlm-3b.ggml.bin"; // model path
  std::string prompt = "Capital of Nepal is";
  std::string token_test = "";
};

// Model hyper-parameters. btlm_model_load reads these from the model file
// header as seven consecutive int32 values, in exactly this member order.
// All members default to 0 so a default-constructed object is well defined.
struct btlm_hparams {
  int32_t n_vocab = 0; // number of vocabulary entries stored in the file
  int32_t n_ctx = 0;
  int32_t n_embd = 0;  // embedding width; used for most tensor shapes
  int32_t n_head = 0;
  int32_t n_layer = 0; // number of btlm_layer entries to allocate
  int32_t n_inner = 0;
  int32_t ftype = 0;   // ggml file type (reduced modulo GGML_QNT_VERSION_FACTOR
                       // by the loader after reading)
};

// Weight tensors belonging to a single model layer. The pointers are
// non-owning: the tensors are allocated inside the model's ggml context.
// Every pointer defaults to nullptr so an unpopulated layer is detectable.
//
// Suffixes: _w = weight, _b = bias, _g = norm gain.
// NOTE(review): _scb presumably holds quantization scale data exported by the
// converter — confirm against the conversion script.
struct btlm_layer {
  // normalization
  struct ggml_tensor *ln_1_g = nullptr;
  struct ggml_tensor *ln_1_b = nullptr;

  struct ggml_tensor *ln_2_g = nullptr;
  struct ggml_tensor *ln_2_b = nullptr;

  // attention
  struct ggml_tensor *c_attn_attn_w = nullptr;
  struct ggml_tensor *c_attn_attn_b = nullptr;
  struct ggml_tensor *c_attn_attn_scb = nullptr;

  struct ggml_tensor *c_attn_proj_w = nullptr;
  struct ggml_tensor *c_attn_proj_b = nullptr;
  struct ggml_tensor *c_attn_proj_scb = nullptr;

  // mlp
  struct ggml_tensor *c_mlp_fc_w = nullptr;
  struct ggml_tensor *c_mlp_fc_b = nullptr;
  struct ggml_tensor *c_mlp_fc_scb = nullptr;

  struct ggml_tensor *c_mlp_fc2_w = nullptr;
  struct ggml_tensor *c_mlp_fc2_b = nullptr;
  struct ggml_tensor *c_mlp_fc2_scb = nullptr;

  struct ggml_tensor *c_mlp_proj_w = nullptr;
  struct ggml_tensor *c_mlp_proj_b = nullptr;
  struct ggml_tensor *c_mlp_proj_scb = nullptr;
};

// A loaded model: hyper-parameters, top-level tensors, per-layer tensors and
// the ggml context that owns all tensor memory. Raw pointers default to
// nullptr so that a model whose load failed early can be inspected safely.
struct btlm_model {
  btlm_hparams hparams;

  // final normalization
  struct ggml_tensor *ln_f_g = nullptr;
  struct ggml_tensor *ln_f_b = nullptr;

  // token embedding table, allocated by the loader as [n_embd, n_vocab]
  struct ggml_tensor *wte = nullptr;
  // slopes registered under the name "model/relative_pe/slopes"
  struct ggml_tensor *alibi_slopes = nullptr;
  struct ggml_tensor *lm_head = nullptr; // language model head

  std::vector<btlm_layer> layers;

  // key + value memory (not allocated by btlm_model_load)
  struct ggml_tensor *memory_k = nullptr;
  struct ggml_tensor *memory_v = nullptr;

  // ggml context holding every tensor above; created by btlm_model_load via
  // ggml_init() and never released by it
  struct ggml_context *ctx = nullptr;
  // name -> tensor lookup used to match tensor records in the model file
  std::map<std::string, struct ggml_tensor *> tensors;
};

// load the model's weights from a file
bool btlm_model_load(const std::string &fname, btlm_model &model,
                     btlm_vocab &vocab) {
  printf("%s: loading model from '%s'\n", __func__, fname.c_str());

  auto fin = std::ifstream(fname, std::ios::binary);
  if (!fin) {
    fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
    return false;
  }

  // verify magic
  {
    uint32_t magic;
    fin.read((char *)&magic, sizeof(magic));
    if (magic != GGML_FILE_MAGIC) {
      fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__,
              fname.c_str());
      return false;
    }
  }

  // load hparams
  {
    auto &hparams = model.hparams;

    fin.read((char *)&hparams.n_vocab, sizeof(hparams.n_vocab));
    fin.read((char *)&hparams.n_ctx, sizeof(hparams.n_ctx));
    fin.read((char *)&hparams.n_embd, sizeof(hparams.n_embd));
    fin.read((char *)&hparams.n_head, sizeof(hparams.n_head));
    fin.read((char *)&hparams.n_layer, sizeof(hparams.n_layer));
    fin.read((char *)&hparams.n_inner, sizeof(hparams.n_inner));
    fin.read((char *)&hparams.ftype, sizeof(hparams.ftype));

    const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;

    printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
    printf("%s: n_ctx   = %d\n", __func__, hparams.n_ctx);
    printf("%s: n_embd  = %d\n", __func__, hparams.n_embd);
    printf("%s: n_head  = %d\n", __func__, hparams.n_head);
    printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
    printf("%s: n_inner = %d\n", __func__, hparams.n_inner);
    printf("%s: ftype   = %d\n", __func__, hparams.ftype);
    printf("%s: qntvr   = %d\n", __func__, qntvr);

    hparams.ftype %= GGML_QNT_VERSION_FACTOR;
  }

  // for the big tensors, we have the option to store the data in 16-bit floats
  // or quantized in order to save memory and also to speed up the computation
  ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype)(model.hparams.ftype));
  if (wtype == GGML_TYPE_COUNT) {
    fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
            __func__, fname.c_str(), model.hparams.ftype);
    return false;
  }

  auto &ctx = model.ctx;
  size_t ctx_size = 0;

  {

    ctx_size = 9000000000; // fixme => actually calculate this

    printf("%s: ggml tensor size = %d bytes\n", __func__,
           (int)sizeof(ggml_tensor));
    printf("%s: ggml ctx size = %6.2f MB\n", __func__,
           ctx_size / (1024.0 * 1024.0));
    printf("%s: ggml ctx size = %d \n", __func__, ctx_size);
  }

  // create the ggml context
  {
    struct ggml_init_params params = {
        /*.mem_size   =*/ctx_size,
        /*.mem_buffer =*/NULL,
        /*.no_alloc   =*/false,
    };

    model.ctx = ggml_init(params);
    if (!model.ctx) {
      fprintf(stderr, "%s: ggml_init() failed\n", __func__);
      return false;
    }
  }

  // load vocab
  {
    int32_t n_vocab = model.hparams.n_vocab;

    std::string word;
    std::vector<char> buf(128);

    for (int i = 0; i < n_vocab; i++) {
      uint32_t len;
      fin.read((char *)&len, sizeof(len));

      buf.resize(len);
      fin.read((char *)buf.data(), len);
      word.assign(buf.data(), len);

      // printf("%s \n", word.c_str());

      vocab.token_to_id[word] = i;
      vocab.id_to_token[i] = word;
    }
  }

  {

    // alloc memory

    const auto &hparams = model.hparams;

    const int n_embd = hparams.n_embd;
    const int n_layer = hparams.n_layer;
    //   const int n_ctx = hparams.n_ctx;
    const int n_vocab = hparams.n_vocab;

    model.layers.resize(n_layer);

    model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_embd);
    model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_embd);
    model.wte = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_embd, n_vocab);
    model.lm_head = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_embd, n_vocab);
    model.alibi_slopes = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 32);

    // map by name
    model.tensors["model/ln_f/g"] = model.ln_f_g;
    model.tensors["model/ln_f/b"] = model.ln_f_b;
    model.tensors["model/wte"] = model.wte;
    model.tensors["model/lm_head"] = model.lm_head;
    model.tensors["model/relative_pe/slopes"] = model.alibi_slopes;

    for (int i = 0; i < n_layer; ++i) {
      auto &layer = model.layers[i];

      layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_embd);
      layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_embd);

      layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_embd);
      layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_embd);

      layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, 3 * n_embd, n_embd );
      layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 3 * n_embd);
      layer.c_attn_attn_scb =
          ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 3 * n_embd);

      layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_embd, n_embd);
      layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_embd);
      layer.c_attn_proj_scb = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_embd);

      layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, 6832, n_embd);
      layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 6826);
      layer.c_mlp_fc_scb = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 6826);

      layer.c_mlp_fc2_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_embd, 6832 );
      layer.c_mlp_fc2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 6826);
      layer.c_mlp_fc2_scb = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 6826);

      layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_embd, 6848);
      layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_embd);
      layer.c_mlp_proj_scb = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_embd);

      // map by name
      model.tensors["model/h" + std::to_string(i) + "/ln_1/g"] = layer.ln_1_g;
      model.tensors["model/h" + std::to_string(i) + "/ln_1/b"] = layer.ln_1_b;

      model.tensors["model/h" + std::to_string(i) + "/ln_2/g"] = layer.ln_2_g;
      model.tensors["model/h" + std::to_string(i) + "/ln_2/b"] = layer.ln_2_b;

      model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/w"] = layer.c_attn_attn_w;
      model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/b"] = layer.c_attn_attn_b;
     model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/scb"] = layer.c_attn_attn_scb;


      model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/w"] =
          layer.c_attn_proj_w;
      model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/b"] =
          layer.c_attn_proj_b;
      model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/scb"] =
          layer.c_attn_proj_scb;

      model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/w"] =
          layer.c_mlp_fc_w;
      model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/b"] =
          layer.c_mlp_fc_b;
      model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/scb"] =
          layer.c_mlp_fc_scb;

      model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc2/w"] =
          layer.c_mlp_fc2_w;
      model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc2/b"] =
          layer.c_mlp_fc2_b;
      model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc2/scb"] =
          layer.c_mlp_fc2_scb;


      model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"] =
          layer.c_mlp_proj_w;
      model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/b"] =
          layer.c_mlp_proj_b;
      model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/scb"] =
          layer.c_mlp_proj_scb;
    }
  }

  // load weights
  {
    size_t total_size = 0;

    bool has_lm_head = false;

    while (true) {
      int32_t n_dims;
      int32_t length;
      int32_t ttype;

      fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
      fin.read(reinterpret_cast<char *>(&length), sizeof(length));
      fin.read(reinterpret_cast<char *>(&ttype), sizeof(ttype));

      if (fin.eof()) {
        break;
      }

      int32_t nelements = 1;
      int32_t ne[2] = {1, 1};
      for (int i = 0; i < n_dims; ++i) {
        fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
        nelements *= ne[i];
      }

      std::string name(length, 0);
      fin.read(&name[0], length);

      printf("processing tensor '%s' in model file\n", name.data());

      if (model.tensors.find(name.data()) == model.tensors.end()) {
        fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__,
                name.data());
        return false;
      }

      auto tensor = model.tensors[name.data()];
      if (ggml_nelements(tensor) != nelements) {
        fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n",
                __func__, name.data());
        return false;
      }

      if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
        fprintf(stderr,
                "%s: tensor '%s' has wrong shape in model file: got [%d, %d], "
                "expected [%d, %d]\n",
                __func__, name.data(), (int)tensor->ne[0], (int)tensor->ne[1],
                ne[0], ne[1]);
        return false;
      }

      // for debugging
      if (1) {
        printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n",
               name.data(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)),
               ggml_nbytes(tensor) / 1024.0 / 1024.0, ggml_nbytes(tensor));
      }

      const size_t bpe = ggml_type_size(ggml_type(ttype));

      if ((nelements * bpe) / ggml_blck_size(tensor->type) !=
          ggml_nbytes(tensor)) {
        fprintf(stderr,
                "%s: tensor '%s' has wrong size in model file: got %zu, "
                "expected %zu\n",
                __func__, name.data(), ggml_nbytes(tensor), nelements * bpe);
        return false;
      }

      fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));


      total_size += ggml_nbytes(tensor);
    }

    printf("%s: model size  = %8.2f MB\n", __func__,
           total_size / 1024.0 / 1024.0);
  }

  fin.close();

  return true;
}

int main(int argc, char **argv) {
  btlm_params params;
  btlm_model models;
  btlm_vocab vocab;

  btlm_model_load(params.model, models, vocab);

  return 0;
}