// btlm-3b-ggml / btlm_model_wip.cpp
#include "ggml/ggml.h"
#include "common-ggml.h"
#include "common.h"
#include <algorithm> // std::min
#include <cassert>
#include <cinttypes>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <iostream>
#include <map>
#include <string>
#include <thread> // std::thread::hardware_concurrency
#include <vector>
struct btlm_vocab {
using id = int32_t;
using token = std::string;
std::map<token, id> token_to_id;
std::map<id, token> id_to_token;
std::vector<std::string> special_tokens;
};
struct btlm_params {
int32_t seed = -1; // RNG seed
int32_t n_threads = std::min(4, (int32_t)std::thread::hardware_concurrency());
int32_t n_predict = 200; // new tokens to predict
int32_t n_batch = 8; // batch size for prompt processing
// sampling parameters
int32_t top_k = 40;
float top_p = 0.9f;
float temp = 0.9f;
int32_t repeat_last_n = 64;
float repeat_penalty = 1.00f;
std::string model =
"/home/madman/Desktop/ml_play/ml_models/btlm-3b.ggml.bin"; // model path
std::string prompt = "Capital of Nepal is";
std::string token_test = "";
};
struct btlm_hparams {
int32_t n_vocab;
int32_t n_ctx;
int32_t n_embd;
int32_t n_head;
int32_t n_layer;
int32_t n_inner;
int32_t ftype;
};
struct btlm_layer {
// normalization
struct ggml_tensor *ln_1_g;
struct ggml_tensor *ln_1_b;
struct ggml_tensor *ln_2_g;
struct ggml_tensor *ln_2_b;
// attention
struct ggml_tensor *c_attn_attn_w;
struct ggml_tensor *c_attn_attn_b;
struct ggml_tensor *c_attn_attn_scb;
struct ggml_tensor *c_attn_proj_w;
struct ggml_tensor *c_attn_proj_b;
struct ggml_tensor *c_attn_proj_scb;
// mlp
struct ggml_tensor *c_mlp_fc_w;
struct ggml_tensor *c_mlp_fc_b;
struct ggml_tensor *c_mlp_fc_scb;
struct ggml_tensor *c_mlp_fc2_w;
struct ggml_tensor *c_mlp_fc2_b;
struct ggml_tensor *c_mlp_fc2_scb;
struct ggml_tensor *c_mlp_proj_w;
struct ggml_tensor *c_mlp_proj_b;
struct ggml_tensor *c_mlp_proj_scb;
};
struct btlm_model {
btlm_hparams hparams;
// normalization
struct ggml_tensor *ln_f_g;
struct ggml_tensor *ln_f_b;
  struct ggml_tensor *wte; // token embedding
struct ggml_tensor *alibi_slopes;
struct ggml_tensor *lm_head; // language model head
std::vector<btlm_layer> layers;
  // key + value memory (KV cache; declared but not yet allocated in this WIP)
struct ggml_tensor *memory_k;
struct ggml_tensor *memory_v;
//
struct ggml_context *ctx;
std::map<std::string, struct ggml_tensor *> tensors;
};
// load the model's weights from a file
bool btlm_model_load(const std::string &fname, btlm_model &model,
btlm_vocab &vocab) {
printf("%s: loading model from '%s'\n", __func__, fname.c_str());
auto fin = std::ifstream(fname, std::ios::binary);
if (!fin) {
fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
return false;
}
// verify magic
{
uint32_t magic;
fin.read((char *)&magic, sizeof(magic));
if (magic != GGML_FILE_MAGIC) {
fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__,
fname.c_str());
return false;
}
}
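  // file layout (as parsed below): magic, 7 x int32 hparams, n_vocab vocab
  // entries of {uint32 len, len bytes}, then tensor records of
  // {int32 n_dims, int32 name_len, int32 ttype, n_dims x int32 ne,
  //  name bytes, tensor data}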
// load hparams
{
auto &hparams = model.hparams;
fin.read((char *)&hparams.n_vocab, sizeof(hparams.n_vocab));
fin.read((char *)&hparams.n_ctx, sizeof(hparams.n_ctx));
fin.read((char *)&hparams.n_embd, sizeof(hparams.n_embd));
fin.read((char *)&hparams.n_head, sizeof(hparams.n_head));
fin.read((char *)&hparams.n_layer, sizeof(hparams.n_layer));
fin.read((char *)&hparams.n_inner, sizeof(hparams.n_inner));
fin.read((char *)&hparams.ftype, sizeof(hparams.ftype));
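    // ftype packs the quantization version into the high part:
    // ftype = base_ftype + GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR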
const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
printf("%s: n_head = %d\n", __func__, hparams.n_head);
printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
printf("%s: n_inner = %d\n", __func__, hparams.n_inner);
printf("%s: ftype = %d\n", __func__, hparams.ftype);
printf("%s: qntvr = %d\n", __func__, qntvr);
hparams.ftype %= GGML_QNT_VERSION_FACTOR;
}
// for the big tensors, we have the option to store the data in 16-bit floats
// or quantized in order to save memory and also to speed up the computation
ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype)(model.hparams.ftype));
if (wtype == GGML_TYPE_COUNT) {
fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
__func__, fname.c_str(), model.hparams.ftype);
return false;
}
auto &ctx = model.ctx;
size_t ctx_size = 0;
{
    ctx_size = 9000000000ull; // fixme => actually calculate this from hparams
    printf("%s: ggml tensor size = %zu bytes\n", __func__,
           sizeof(ggml_tensor));
    printf("%s: ggml ctx size = %6.2f MB\n", __func__,
           ctx_size / (1024.0 * 1024.0));
    printf("%s: ggml ctx size = %zu bytes\n", __func__, ctx_size);
}
// create the ggml context
{
struct ggml_init_params params = {
/*.mem_size =*/ctx_size,
/*.mem_buffer =*/NULL,
/*.no_alloc =*/false,
};
model.ctx = ggml_init(params);
if (!model.ctx) {
fprintf(stderr, "%s: ggml_init() failed\n", __func__);
return false;
}
}
// load vocab
{
int32_t n_vocab = model.hparams.n_vocab;
std::string word;
std::vector<char> buf(128);
for (int i = 0; i < n_vocab; i++) {
uint32_t len;
fin.read((char *)&len, sizeof(len));
buf.resize(len);
fin.read((char *)buf.data(), len);
word.assign(buf.data(), len);
// printf("%s \n", word.c_str());
vocab.token_to_id[word] = i;
vocab.id_to_token[i] = word;
}
}
{
// alloc memory
const auto &hparams = model.hparams;
const int n_embd = hparams.n_embd;
const int n_layer = hparams.n_layer;
// const int n_ctx = hparams.n_ctx;
const int n_vocab = hparams.n_vocab;
model.layers.resize(n_layer);
model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_embd);
model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_embd);
model.wte = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_embd, n_vocab);
model.lm_head = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_embd, n_vocab);
    // ALiBi slopes; count hardcoded to 32 rather than derived from hparams.n_head
    model.alibi_slopes = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 32);
// map by name
model.tensors["model/ln_f/g"] = model.ln_f_g;
model.tensors["model/ln_f/b"] = model.ln_f_b;
model.tensors["model/wte"] = model.wte;
model.tensors["model/lm_head"] = model.lm_head;
model.tensors["model/relative_pe/slopes"] = model.alibi_slopes;
for (int i = 0; i < n_layer; ++i) {
auto &layer = model.layers[i];
layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_embd);
layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_embd);
layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_embd);
layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_embd);
      layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, 3 * n_embd, n_embd);
layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 3 * n_embd);
layer.c_attn_attn_scb =
ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 3 * n_embd);
layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_embd, n_embd);
layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_embd);
layer.c_attn_proj_scb = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_embd);
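      // note: the MLP inner dims below are hardcoded; 6826 matches the
      // n_inner read from the header for this model, while 6832 and 6848
      // appear to be padded variants of it (WIP)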
layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, 6832, n_embd);
layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 6826);
layer.c_mlp_fc_scb = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 6826);
      layer.c_mlp_fc2_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_embd, 6832);
layer.c_mlp_fc2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 6826);
layer.c_mlp_fc2_scb = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 6826);
layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_embd, 6848);
layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_embd);
layer.c_mlp_proj_scb = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_embd);
// map by name
model.tensors["model/h" + std::to_string(i) + "/ln_1/g"] = layer.ln_1_g;
model.tensors["model/h" + std::to_string(i) + "/ln_1/b"] = layer.ln_1_b;
model.tensors["model/h" + std::to_string(i) + "/ln_2/g"] = layer.ln_2_g;
model.tensors["model/h" + std::to_string(i) + "/ln_2/b"] = layer.ln_2_b;
model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/w"] = layer.c_attn_attn_w;
model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/b"] = layer.c_attn_attn_b;
model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/scb"] = layer.c_attn_attn_scb;
model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/w"] =
layer.c_attn_proj_w;
model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/b"] =
layer.c_attn_proj_b;
model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/scb"] =
layer.c_attn_proj_scb;
model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/w"] =
layer.c_mlp_fc_w;
model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/b"] =
layer.c_mlp_fc_b;
model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/scb"] =
layer.c_mlp_fc_scb;
model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc2/w"] =
layer.c_mlp_fc2_w;
model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc2/b"] =
layer.c_mlp_fc2_b;
model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc2/scb"] =
layer.c_mlp_fc2_scb;
model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"] =
layer.c_mlp_proj_w;
model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/b"] =
layer.c_mlp_proj_b;
model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/scb"] =
layer.c_mlp_proj_scb;
}
}
// load weights
{
size_t total_size = 0;
while (true) {
int32_t n_dims;
int32_t length;
int32_t ttype;
fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
fin.read(reinterpret_cast<char *>(&length), sizeof(length));
fin.read(reinterpret_cast<char *>(&ttype), sizeof(ttype));
if (fin.eof()) {
break;
}
      if (n_dims < 1 || n_dims > 2) {
        fprintf(stderr, "%s: unsupported n_dims = %d in model file\n",
                __func__, n_dims);
        return false;
      }
      int32_t nelements = 1;
      int32_t ne[2] = {1, 1};
for (int i = 0; i < n_dims; ++i) {
fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
nelements *= ne[i];
}
std::string name(length, 0);
fin.read(&name[0], length);
printf("processing tensor '%s' in model file\n", name.data());
if (model.tensors.find(name.data()) == model.tensors.end()) {
fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__,
name.data());
return false;
}
auto tensor = model.tensors[name.data()];
if (ggml_nelements(tensor) != nelements) {
fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n",
__func__, name.data());
return false;
}
if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
fprintf(stderr,
"%s: tensor '%s' has wrong shape in model file: got [%d, %d], "
"expected [%d, %d]\n",
__func__, name.data(), (int)tensor->ne[0], (int)tensor->ne[1],
ne[0], ne[1]);
return false;
}
// for debugging
if (1) {
printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n",
name.data(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)),
ggml_nbytes(tensor) / 1024.0 / 1024.0, ggml_nbytes(tensor));
}
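      // size check: for quantized types ggml_type_size() returns bytes per
      // block, so expected bytes = nelements * type_size / block_size, which
      // should match ggml_nbytes(tensor)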
const size_t bpe = ggml_type_size(ggml_type(ttype));
if ((nelements * bpe) / ggml_blck_size(tensor->type) !=
ggml_nbytes(tensor)) {
fprintf(stderr,
"%s: tensor '%s' has wrong size in model file: got %zu, "
"expected %zu\n",
__func__, name.data(), ggml_nbytes(tensor), nelements * bpe);
return false;
}
fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
total_size += ggml_nbytes(tensor);
}
printf("%s: model size = %8.2f MB\n", __func__,
total_size / 1024.0 / 1024.0);
}
fin.close();
return true;
}
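// WIP entry point: only weight/vocab loading is exercised; the sampling
// parameters in btlm_params (top_k, top_p, temp, ...) are not wired up yet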
int main(int argc, char **argv) {
  (void)argc;
  (void)argv;
  btlm_params params;
  btlm_model model;
  btlm_vocab vocab;
  if (!btlm_model_load(params.model, model, vocab)) {
    fprintf(stderr, "%s: failed to load model from '%s'\n", __func__,
            params.model.c_str());
    return 1;
  }
  return 0;
}