#include "ggml.h"
#include "ggml-cpp.h"
#include "ggml-cpu.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include "gguf.h"
#include "common.h"
#include "mimi-model.h"
#include <limits.h>
#include <vector>
#include <cinttypes>
#include <fstream>
#include <algorithm>
#include <unordered_map>
#include <float.h>
#include <cmath>
#include <cstdarg>
#include <functional>
#include <array>
/**
* Implementation of Kyutai's Mimi model using GGML.
* Based on this research: https://github.com/ngxson/ggml-easy/blob/master/demo/kyutai-mimi.cpp
*
 * NOTE: only the decoder works for now.
 *
 * Background:
 * - The audio codes can be generated by any Mimi-based model, e.g. Moshi, Hibiki, Sesame, etc.
 * - Audio codes must be ordered frame by frame: each frame is 1 semantic code followed by
 *   31 acoustic codes (in other words, the input matrix has shape 32 cols x N rows)
 *
 * How it works:
 * 1. The audio codes are passed through the RVQ (mimi_residual_vector_quantizer) to get latent embeddings
 * 2. The latent embeddings are passed through a depthwise mimi_conv_transpose_1d to upsample them 2x
 * 3. The upsampled embeddings are passed through the transformer, which maps N frames to N frames
 * 4. The output embeddings are then passed through SEANet (mimi_encoder_decoder) to produce the waveform
 * 5. The waveform is written to a file
*/
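// For example, two frames of input codes are laid out as:
//   frame 0: s_0, a1_0, a2_0, ..., a31_0,
//   frame 1: s_1, a1_1, a2_1, ..., a31_1
// where s_i is the semantic code and ak_i the k-th acoustic code of frame i.
// mimi_model::transpose_input() converts this frame-major layout into the
// component-major layout consumed by the RVQ decoder.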
// copied from https://huggingface.co/kyutai/mimi/blob/main/config.json
struct mimi_config_t {
bool causal = true;
int sample_rate = 24000;
int max_position_embeddings = 8000;
int num_hidden_layers = 8;
int n_embd = 512;
int n_ffn = 2048;
int n_head = 8;
int n_head_kv = 8;
int n_rot = 64;
    float norm_eps = 1e-5f;
float rope_theta = 10000.0f;
int sliding_window = 250;
std::array<int, 4> upsampling_ratio = {8, 6, 5, 4};
std::array<int, 4> downsampling_ratio = {4, 5, 6, 8}; // reverse of upsampling_ratio
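    // the SEANet decoder upsamples by 8 * 6 * 5 * 4 = 960x; combined with the stride-2
    // upsample conv before the transformer, each frame yields 1920 samples,
    // i.e. sample_rate / frame_rate = 24000 / 12.5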
// vector quantizer
    float frame_rate = 12.5f;
int audio_channels = 1;
int codebook_size = 2048;
int codebook_dim = 256;
int n_semantic_components = 1;
int n_acoustic_components = 31;
// decode
float trim_right_ratio = 1.0f;
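    // with the defaults above: (250 / 2) * 32 = 4000 codes per chunk, i.e. 125 frames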
int n_codes_per_frame = (sliding_window / 2) * (n_semantic_components + n_acoustic_components);
} mimi_config;
// Adapted from https://github.com/ngxson/ggml-easy/blob/master/ggml-easy.h
struct mimi_ggml_ctx {
gguf_context * ctx_gguf = nullptr;
ggml_context * ctx_data = nullptr;
ggml_context * ctx_gf = nullptr;
    // CPU-only for now, as many kernels are missing and we actually get worse performance with GPU
ggml_backend_t backend = nullptr;
ggml_backend_buffer_t buf = nullptr;
ggml_backend_sched_ptr sched;
ggml_cgraph * gf = nullptr;
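    // scratch memory backing ctx_gf during graph construction; only tensor metadata
    // lives here (no_alloc = true), the actual buffers come from the backend scheduler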
std::vector<uint8_t> buf_compute_meta;
int max_nodes = 16 * 1024;
std::unordered_map<std::string, ggml_tensor *> tensors;
mimi_ggml_ctx() {
backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
auto buft = ggml_backend_get_default_buffer_type(backend);
sched.reset(
ggml_backend_sched_new(&backend, &buft, 1, max_nodes, false)
);
buf_compute_meta.resize(max_nodes * ggml_tensor_overhead() + ggml_graph_overhead());
}
void load_gguf(const char * fname) {
ggml_context * meta = nullptr;
gguf_init_params params = {
/*.no_alloc = */ true,
/*.ctx = */ &meta,
};
        ctx_gguf = gguf_init_from_file(fname, params);
        if (!ctx_gguf) {
            ggml_free(meta);
            throw std::runtime_error("cannot open GGUF model file");
        }
// load tensors
const int n_tensors = gguf_get_n_tensors(ctx_gguf);
std::vector<uint8_t> read_buf;
ggml_init_params ggml_params = {
/*.mem_size =*/ (n_tensors + 1) * ggml_tensor_overhead(),
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true,
};
ctx_data = ggml_init(ggml_params);
auto fin = std::ifstream(fname, std::ios::binary);
if (!fin) {
ggml_free(meta);
throw std::runtime_error("cannot open model file for loading tensors");
}
// add tensors to context
for (int i = 0; i < n_tensors; ++i) {
const char * name = gguf_get_tensor_name(ctx_gguf, i);
ggml_tensor * t = ggml_get_tensor(meta, name);
ggml_tensor * cur = ggml_dup_tensor(ctx_data, t);
ggml_set_name(cur, name);
tensors.insert({name, cur});
}
// alloc memory and offload data
ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend);
buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx_data, buft);
ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
for (int i = 0; i < n_tensors; ++i) {
const char * name = gguf_get_tensor_name(ctx_gguf, i);
ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
const size_t offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i);
// printf("%s: Loading tensor \"%s\"\n", __func__, name);
fin.seekg(offset, std::ios::beg);
if (!fin) {
ggml_free(meta);
throw std::runtime_error(string_format("failed to seek for tensor: %s", name));
}
            const size_t num_bytes = ggml_nbytes(cur);
if (ggml_backend_buft_is_host(buft)) {
// for the CPU and Metal backend, we can read directly into the tensor
fin.read(reinterpret_cast<char *>(cur->data), num_bytes);
} else {
// read into a temporary buffer first, then copy to device memory
read_buf.resize(num_bytes);
fin.read(reinterpret_cast<char *>(read_buf.data()), num_bytes);
ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
}
}
printf("%s: Loaded %d tensors from %s\n", __func__, n_tensors, fname);
fin.close();
ggml_free(meta);
}
/**
* Build a cgraph using the given builder function.
*
* The built cgraph will be stored in `ctx.gf`
*/
void build_graph(std::function<void(ggml_context *, ggml_cgraph *)> builder_fn) {
ggml_free(ctx_gf);
struct ggml_init_params params = {
/*.mem_size =*/ buf_compute_meta.size(),
/*.mem_buffer =*/ buf_compute_meta.data(),
/*.no_alloc =*/ true,
};
ctx_gf = ggml_init(params);
ggml_backend_sched_reset(sched.get());
gf = ggml_new_graph_custom(ctx_gf, max_nodes, false);
builder_fn(ctx_gf, gf);
ggml_backend_sched_alloc_graph(sched.get(), gf);
}
ggml_status compute() {
ggml_status status = ggml_backend_sched_graph_compute(sched.get(), gf);
return status;
}
void set_tensor_data(const std::string & name, const void * data) {
ggml_tensor * t = ggml_get_tensor(ctx_gf, name.c_str());
if (!t) {
throw std::runtime_error(string_format("tensor not found: %s", name.c_str()));
}
ggml_backend_tensor_set(t, data, 0, ggml_nbytes(t));
}
std::pair<ggml_tensor *, std::vector<uint8_t>> get_tensor_data(const std::string & name) {
ggml_tensor * t = ggml_get_tensor(ctx_gf, name.c_str());
if (!t) {
throw std::runtime_error(string_format("tensor not found: %s", name.c_str()));
}
std::vector<uint8_t> data(ggml_nbytes(t));
ggml_backend_tensor_get(t, data.data(), 0, ggml_nbytes(t));
return std::make_pair(t, data);
}
ggml_tensor * get_weight(const char *fmt, ...) {
std::vector<char> str(128);
va_list va;
va_start(va, fmt);
vsnprintf(str.data(), 128, fmt, va);
va_end(va);
auto it = tensors.find(str.data());
if (it == tensors.end()) {
throw std::runtime_error(string_format("weight tensor not found: %s", str.data()));
}
return it->second;
}
~mimi_ggml_ctx() {
ggml_free(ctx_data);
gguf_free(ctx_gguf);
ggml_backend_buffer_free(buf);
}
};
///////////////////////////////////////////////////////////////////////////
// extension to ggml.h
// TODO: add these ops to the library (of course, with a more optimized kernel)
// mode: (0) constant, (1) reflect, (2) replicate, (3) circular
// value is only used in "constant"
// only "constant" with 0.0f and "replicate" are implemented here
static ggml_tensor * ggml_pad_ext(ggml_context * ctx0, ggml_tensor * x, int mode,
int64_t pad_left, int64_t pad_right, float value = 0.0f) {
    GGML_ASSERT(value == 0.0f); // we could technically use ggml_arange, but for simplicity we only support 0.0f
GGML_ASSERT(mode == 0 || mode == 2);
if (pad_left > 0) {
ggml_tensor * tmp = ggml_new_tensor_2d(ctx0, x->type, pad_left, x->ne[1]);
if (mode == 0) {
tmp = ggml_scale(ctx0, tmp, value);
} else if (mode == 2) {
ggml_tensor * elem = ggml_view_2d(ctx0, x, 1, x->ne[1], x->nb[1], 0); // get first column
tmp = ggml_repeat(ctx0, elem, tmp);
}
x = ggml_concat(ctx0, tmp, x, 0);
}
if (pad_right > 0) {
ggml_tensor * tmp = ggml_new_tensor_2d(ctx0, x->type, pad_right, x->ne[1]);
if (mode == 0) {
tmp = ggml_scale(ctx0, tmp, value);
} else if (mode == 2) {
int64_t last = x->ne[0] - 1;
ggml_tensor * elem = ggml_view_2d(ctx0, x, 1, x->ne[1], x->nb[1], last * ggml_element_size(x)); // get last column
tmp = ggml_repeat(ctx0, elem, tmp);
}
x = ggml_concat(ctx0, x, tmp, 0);
}
return x;
}
///////////////////////////////////////////////////////////////////////////
// MimiConv and MimiConvTranspose
static int64_t div_ceil(int64_t a, int64_t b) {
return a / b + (a % b ? 1 : 0);
}
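// e.g. div_ceil(7, 2) == 4

// "Same"-style padding following the reference MimiConv1d scheme: p_total = kernel_size - stride
// is placed entirely on the left in causal mode, plus any extra right-padding needed so the
// input length maps to a whole number of frames. For example, kernel_size = 7, stride = 1,
// dilation = 1 gives p_total = 6, all on the left.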
static ggml_tensor * mimi_conv_1d(ggml_context * ctx0, ggml_tensor * x,
ggml_tensor * kernel, ggml_tensor * bias, int stride, int dilation, bool pad_zero = true) {
int64_t kernel_size = (kernel->ne[0] - 1) * dilation + 1;
int64_t p_total = kernel_size - stride; // padding total
int64_t p_half = p_total / 2;
int64_t n_frames = div_ceil(x->ne[0] - kernel_size + p_total, stride);
int64_t ideal_len = n_frames * stride + kernel_size - p_total;
int64_t p_extra = ideal_len - x->ne[0];
int64_t p_right = (mimi_config.causal ? 0 : p_half) + p_extra;
int64_t p_left = p_total - (mimi_config.causal ? 0 : p_half);
x = ggml_pad_ext(ctx0, x, pad_zero ? 0 : 2, p_left, p_right);
x = ggml_conv_1d(ctx0, kernel, x, stride, 0, dilation);
if (bias) {
x = ggml_add(ctx0, x, bias);
}
ggml_set_name(x, "mimi_conv_1d");
return x;
}
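// A transposed conv (dilation 1) produces (in_len - 1) * stride + kernel_size samples;
// p_total = kernel_size - stride of them are trimmed afterwards. When causal with the
// default trim_right_ratio = 1.0, everything is trimmed from the right. Note: the HF
// reference implementation multiplies p_total by trim_right_ratio; the division below
// agrees with it only for a ratio of 1.0 (the value used here).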
static ggml_tensor * mimi_conv_transpose_1d(ggml_context * ctx0, ggml_tensor * x,
ggml_tensor * kernel, ggml_tensor * bias, int stride, int dilation, bool depthwise) {
GGML_ASSERT(x->ne[1] == kernel->ne[2]);
int64_t n_rows = x->ne[1];
int64_t kernel_size = kernel->ne[0];
int64_t p_total = kernel_size - stride; // padding total
int64_t p_right = mimi_config.causal
? (float)p_total / mimi_config.trim_right_ratio
: p_total / 2;
int64_t p_left = p_total - p_right;
ggml_tensor * out = nullptr;
if (depthwise) {
for (int64_t ir = 0; ir < n_rows; ir++) {
ggml_tensor * row = ggml_view_1d(ctx0, x,
x->ne[0], ir*x->ne[0]*ggml_element_size(x));
ggml_tensor * krn = ggml_view_1d(ctx0, kernel,
kernel->ne[0], ir*kernel->ne[0]*ggml_element_size(kernel));
row = ggml_conv_transpose_1d(ctx0, krn, row, stride, 0, dilation);
// unpad (remove p_right and p_left columns)
row = ggml_view_1d(ctx0, row, row->ne[0] - p_total, p_left*ggml_element_size(row));
// TODO: concat can be slow, we should use ggml_view_1d/ggml_cpy to avoid realloc
out = out ? ggml_concat(ctx0, out, row, 1) : row;
}
} else {
out = ggml_conv_transpose_1d(ctx0, kernel, x, stride, 0, dilation);
// unpad
out = ggml_view_2d(ctx0, out,
out->ne[0] - p_total, out->ne[1],
out->nb[1], p_left*ggml_element_size(out));
}
if (bias) {
out = ggml_add(ctx0, out, bias);
}
return out;
}
///////////////////////////////////////////////////////////////////////////
// based on MimiEncoder
// SEANet encoder as used by Mimi.
struct mimi_encoder_decoder {
mimi_ggml_ctx & ctx;
struct layer {
bool is_elu = false;
bool is_resnet = false;
bool is_transposed_conv = false;
ggml_tensor * conv_0_w = nullptr;
ggml_tensor * conv_0_b = nullptr;
ggml_tensor * conv_1_w = nullptr;
ggml_tensor * conv_1_b = nullptr;
int stride = 1;
};
std::vector<layer> layers;
std::array<int, 4> repeated_pattern = {1, 4, 7, 10};
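    // decoder layer layout (matching the tensor names in the GGUF): layer 0 is the initial
    // conv; for each i_start in repeated_pattern, layer i_start is an ELU, i_start+1 a
    // transposed conv (upsampling) and i_start+2 a residual block; layers 13 and 14 are
    // the final ELU + conv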
mimi_encoder_decoder(mimi_ggml_ctx & ctx): ctx(ctx) {
layers.push_back({
.conv_0_w = ctx.get_weight("decoder.layers.0.conv.weight"),
.conv_0_b = ctx.get_weight("decoder.layers.0.conv.bias"),
});
for (int i = 0; i < (int)repeated_pattern.size(); ++i) {
int i_start = repeated_pattern[i];
// upsampling layers
layers.push_back({
.is_elu = true, // layer (i_start)
});
layers.push_back({
.is_transposed_conv = true,
.conv_0_w = ctx.get_weight("decoder.layers.%d.conv.weight", i_start + 1),
.conv_0_b = ctx.get_weight("decoder.layers.%d.conv.bias", i_start + 1),
.stride = mimi_config.upsampling_ratio[i],
});
// residual layers
layers.push_back({
.is_resnet = true,
.conv_0_w = ctx.get_weight("decoder.layers.%d.block.1.conv.weight", i_start + 2),
.conv_0_b = ctx.get_weight("decoder.layers.%d.block.1.conv.bias", i_start + 2),
.conv_1_w = ctx.get_weight("decoder.layers.%d.block.3.conv.weight", i_start + 2),
.conv_1_b = ctx.get_weight("decoder.layers.%d.block.3.conv.bias", i_start + 2),
});
}
layers.push_back({
.is_elu = true, // layer 13
});
layers.push_back({
.conv_0_w = ctx.get_weight("decoder.layers.14.conv.weight"),
.conv_0_b = ctx.get_weight("decoder.layers.14.conv.bias"),
});
}
ggml_tensor * forward(ggml_context * ctx0, ggml_tensor * input) {
ggml_tensor * x = input;
for (auto & layer : layers) {
if (layer.is_elu) {
x = ggml_elu(ctx0, x);
} else if (layer.is_resnet) {
ggml_tensor * residual = x;
x = ggml_elu(ctx0, x);
x = mimi_conv_1d(ctx0, x, layer.conv_0_w, layer.conv_0_b, 1, 1);
x = ggml_elu(ctx0, x);
x = mimi_conv_1d(ctx0, x, layer.conv_1_w, layer.conv_1_b, 1, 1);
x = ggml_add(ctx0, x, residual);
} else {
x = layer.is_transposed_conv
? mimi_conv_transpose_1d(ctx0, x, layer.conv_0_w, layer.conv_0_b, layer.stride, 1, false)
: mimi_conv_1d(ctx0, x, layer.conv_0_w, layer.conv_0_b, layer.stride, 1);
}
}
return x;
}
};
struct mimi_transformer {
struct layer {
ggml_tensor * inp_norm_w = nullptr;
ggml_tensor * inp_norm_b = nullptr;
ggml_tensor * attn_q = nullptr;
ggml_tensor * attn_k = nullptr;
ggml_tensor * attn_v = nullptr;
ggml_tensor * attn_o = nullptr;
ggml_tensor * attn_post_norm_w = nullptr;
ggml_tensor * attn_post_norm_b = nullptr;
ggml_tensor * attn_layer_scale = nullptr;
ggml_tensor * ffn_up = nullptr;
ggml_tensor * ffn_down = nullptr;
ggml_tensor * mlp_layer_scale = nullptr;
};
std::vector<layer> layers;
mimi_transformer(mimi_ggml_ctx & ctx, const char * prefix, int n_layers) {
for (int il = 0; il < n_layers; il++) {
layers.push_back({
.inp_norm_w = ctx.get_weight("%s_transformer.layers.%d.input_layernorm.weight", prefix, il),
.inp_norm_b = ctx.get_weight("%s_transformer.layers.%d.input_layernorm.bias", prefix, il),
.attn_q = ctx.get_weight("%s_transformer.layers.%d.self_attn.q_proj.weight", prefix, il),
.attn_k = ctx.get_weight("%s_transformer.layers.%d.self_attn.k_proj.weight", prefix, il),
.attn_v = ctx.get_weight("%s_transformer.layers.%d.self_attn.v_proj.weight", prefix, il),
.attn_o = ctx.get_weight("%s_transformer.layers.%d.self_attn.o_proj.weight", prefix, il),
.attn_post_norm_w = ctx.get_weight("%s_transformer.layers.%d.post_attention_layernorm.weight", prefix, il),
.attn_post_norm_b = ctx.get_weight("%s_transformer.layers.%d.post_attention_layernorm.bias", prefix, il),
.attn_layer_scale = ctx.get_weight("%s_transformer.layers.%d.self_attn_layer_scale.scale", prefix, il),
.ffn_up = ctx.get_weight("%s_transformer.layers.%d.mlp.fc1.weight", prefix, il),
.ffn_down = ctx.get_weight("%s_transformer.layers.%d.mlp.fc2.weight", prefix, il),
.mlp_layer_scale = ctx.get_weight("%s_transformer.layers.%d.mlp_layer_scale.scale", prefix, il),
});
}
}
ggml_tensor * forward(ggml_context * ctx0, ggml_tensor * input, ggml_tensor * inp_pos) {
int n_tokens = input->ne[1];
ggml_tensor * x = input;
auto layer_norm = [&](ggml_tensor * x, ggml_tensor * w, ggml_tensor * b) {
x = ggml_norm(ctx0, x, mimi_config.norm_eps);
x = ggml_mul(ctx0, x, w);
x = ggml_add(ctx0, x, b);
return x;
};
ggml_tensor * residual = input;
for (auto & layer : layers) {
residual = x;
// input layer norm
x = layer_norm(x, layer.inp_norm_w, layer.inp_norm_b);
// self attention
{
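                // x is [n_embd, n_tokens]; the q/k/v projections keep that shape and are
                // then reshaped to [n_embd_head, n_head(_kv), n_tokens] before RoPE
                // (ggml convention: ne[0] is the innermost/fastest-varying dimension)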
ggml_tensor * q = ggml_mul_mat(ctx0, layer.attn_q, x);
ggml_tensor * k = ggml_mul_mat(ctx0, layer.attn_k, x);
ggml_tensor * v = ggml_mul_mat(ctx0, layer.attn_v, x);
int n_embd_head = mimi_config.n_embd / mimi_config.n_head;
q = ggml_reshape_3d(ctx0, q, n_embd_head, mimi_config.n_head, n_tokens);
k = ggml_reshape_3d(ctx0, k, n_embd_head, mimi_config.n_head_kv, n_tokens);
v = ggml_reshape_3d(ctx0, v, n_embd_head, mimi_config.n_head_kv, n_tokens);
int n_rot = n_embd_head;
q = ggml_rope_inplace(ctx0, q, inp_pos, n_rot, 0);
q = ggml_cont(ctx0, ggml_permute(ctx0, q, 0, 2, 1, 3));
k = ggml_rope_inplace(ctx0, k, inp_pos, n_rot, 0);
k = ggml_cont(ctx0, ggml_permute(ctx0, k, 0, 2, 1, 3));
ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
ggml_mul_mat_set_prec(kq, GGML_PREC_F32); // mimic behavior of llama.cpp
kq = ggml_scale_inplace(ctx0, kq, 1.0f / std::sqrt(n_embd_head));
ggml_tensor * kq_masked = ggml_diag_mask_inf_inplace(ctx0, kq, n_tokens);
kq = ggml_soft_max_inplace(ctx0, kq_masked);
v = ggml_cont(ctx0, ggml_permute(ctx0, v, 1, 2, 0, 3));
ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq);
kqv = ggml_reshape_3d(ctx0, kqv, n_embd_head, n_tokens, mimi_config.n_head);
kqv = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
kqv = ggml_cont_2d(ctx0, kqv, mimi_config.n_embd, n_tokens);
x = ggml_mul_mat(ctx0, layer.attn_o, kqv);
}
// residual
x = ggml_mul(ctx0, x, layer.attn_layer_scale);
x = ggml_add(ctx0, x, residual);
residual = x;
x = layer_norm(x, layer.attn_post_norm_w, layer.attn_post_norm_b);
// mlp
{
x = ggml_mul_mat(ctx0, layer.ffn_up, x);
x = ggml_gelu(ctx0, x);
x = ggml_mul_mat(ctx0, layer.ffn_down, x);
}
// residual
x = ggml_mul(ctx0, x, layer.mlp_layer_scale);
x = ggml_add(ctx0, x, residual);
}
return x;
}
};
struct mimi_residual_vector_quantizer {
struct component {
ggml_tensor * codebook;
};
ggml_tensor * semantic_inp_proj;
std::vector<component> semantic_components;
ggml_tensor * semantic_out_proj;
ggml_tensor * acoustic_inp_proj;
std::vector<component> acoustic_components;
ggml_tensor * acoustic_out_proj;
mimi_residual_vector_quantizer(mimi_ggml_ctx & ctx) {
semantic_inp_proj = ctx.get_weight("quantizer.semantic_rvq.input_proj.weight");
semantic_out_proj = ctx.get_weight("quantizer.semantic_rvq.output_proj.weight");
for (int i = 0; i < mimi_config.n_semantic_components; i++) {
semantic_components.push_back({
.codebook = ctx.get_weight("quantizer.semantic_rvq.layers.%d.codebook", i),
});
}
acoustic_inp_proj = ctx.get_weight("quantizer.acoustic_rvq.input_proj.weight");
acoustic_out_proj = ctx.get_weight("quantizer.acoustic_rvq.output_proj.weight");
for (int i = 0; i < mimi_config.n_acoustic_components; i++) {
acoustic_components.push_back({
.codebook = ctx.get_weight("quantizer.acoustic_rvq.layers.%d.codebook", i),
});
}
}
    // the input is a flat I32 tensor of n_codes * n_codes_per_embd values, laid out
    // component-major (row by row): the first row holds the semantic codes, the
    // remaining rows hold the acoustic codes
    // example: [ [semantic], [acoustic1], [acoustic2], ... ]
ggml_tensor * decode(ggml_context * ctx0, ggml_tensor * input) {
GGML_ASSERT(input->type == GGML_TYPE_I32);
size_t n_semantic = semantic_components.size();
int64_t n_codes_per_embd = (n_semantic + acoustic_components.size());
int64_t n_codes = input->ne[0] / n_codes_per_embd;
GGML_ASSERT(input->ne[0] % n_codes_per_embd == 0);
ggml_tensor * out_s = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, mimi_config.codebook_dim, n_codes);
ggml_tensor * out_a = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, mimi_config.codebook_dim, n_codes);
out_s = ggml_scale(ctx0, out_s, 0.0f); // clear
out_a = ggml_scale(ctx0, out_a, 0.0f); // clear
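        // ggml_scale by 0.0f is used as a cheap way to zero-initialize the freshly
        // created accumulators before summing codebook embeddings into them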
for (size_t ir = 0; ir < (size_t)n_codes_per_embd; ir++) {
ggml_tensor * row = ggml_view_1d(ctx0, input, n_codes, ir*n_codes*ggml_element_size(input));
if (ir < n_semantic) {
// semantic
ggml_tensor * codebook = semantic_components[ir].codebook;
ggml_tensor * embd = ggml_get_rows(ctx0, codebook, row);
out_s = ggml_add(ctx0, out_s, embd);
} else {
// acoustic
ggml_tensor * codebook = acoustic_components[ir-n_semantic].codebook;
ggml_tensor * embd = ggml_get_rows(ctx0, codebook, row);
out_a = ggml_add(ctx0, out_a, embd);
}
}
out_s = ggml_mul_mat(ctx0, semantic_out_proj, out_s);
out_a = ggml_mul_mat(ctx0, acoustic_out_proj, out_a);
return ggml_add(ctx0, out_s, out_a);
}
};
mimi_model::mimi_model(const char * fname, bool verbose) : verbose(verbose) {
ctx.reset(new mimi_ggml_ctx());
ctx->load_gguf(fname);
// initialize components
seanet_dec .reset(new mimi_encoder_decoder(*ctx));
transformer_dec.reset(new mimi_transformer(*ctx, "decoder", mimi_config.num_hidden_layers));
quantizer .reset(new mimi_residual_vector_quantizer(*ctx));
}
mimi_model::~mimi_model() {
}
std::vector<float> mimi_model::decode_frame(const std::vector<int> & codes, int & n_past) {
// build cgraph
int n_pos = -1;
int n_codes = codes.size();
int n_codes_per_embd = mimi_config.n_semantic_components + mimi_config.n_acoustic_components;
    GGML_ASSERT(n_codes % n_codes_per_embd == 0 && "number of codes must be a multiple of n_codes_per_embd");
ctx->build_graph([&](ggml_context * ctx_gf, ggml_cgraph * gf) {
ggml_tensor * inp_dec = ggml_new_tensor_1d(ctx_gf, GGML_TYPE_I32, n_codes);
ggml_set_name(inp_dec, "inp_dec");
ggml_set_input(inp_dec);
// RVQ
ggml_tensor * embeddings = quantizer->decode(ctx_gf, inp_dec);
// upsample
embeddings = ggml_cont(ctx_gf, ggml_transpose(ctx_gf, embeddings));
embeddings = mimi_conv_transpose_1d(ctx_gf, embeddings, ctx->get_weight("upsample.conv.weight"), nullptr, 2, 1, true);
// transformer
n_pos = embeddings->ne[0];
ggml_tensor * pos_dec = ggml_new_tensor_1d(ctx_gf, GGML_TYPE_I32, n_pos);
ggml_set_name(pos_dec, "pos_dec");
ggml_set_input(pos_dec);
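        // positions are an explicit input so that consecutive frames continue the
        // RoPE position sequence; the caller advances n_past after each frame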
embeddings = ggml_cont(ctx_gf, ggml_transpose(ctx_gf, embeddings));
embeddings = transformer_dec->forward(ctx_gf, embeddings, pos_dec);
        // SEANet decoder
embeddings = ggml_cont(ctx_gf, ggml_transpose(ctx_gf, embeddings));
ggml_tensor * output = seanet_dec->forward(ctx_gf, embeddings);
ggml_set_name(output, "output");
ggml_set_output(output);
ggml_build_forward_expand(gf, output);
});
// position data
GGML_ASSERT(n_pos <= mimi_config.sliding_window);
std::vector<int> pos_data(n_pos);
for (int i = 0; i < (int)pos_data.size(); i++) {
pos_data[i] = i + n_past;
}
if (verbose) {
printf("%s: n_pos: %d, n_past: %d\n", __func__, n_pos, n_past);
}
n_past += n_pos;
ctx->set_tensor_data("pos_dec", pos_data.data());
// code data
auto codes_T = mimi_model::transpose_input(codes);
ctx->set_tensor_data("inp_dec", codes_T.data());
    GGML_ASSERT(ctx->compute() == GGML_STATUS_SUCCESS);
auto output = ctx->get_tensor_data("output");
// auto output_tensor = output.first;
auto output_data = output.second;
// printf("Output shape: [%lld, %lld]\n", output_tensor->ne[0], output_tensor->ne[1]);
std::vector<float> wav_data(output_data.size() / sizeof(float));
for (size_t i = 0; i < wav_data.size(); i++) {
wav_data[i] = ((float *)output_data.data())[i];
}
return wav_data;
}
std::vector<float> mimi_model::decode(const std::vector<int> & codes) {
std::vector<float> output;
if (verbose) {
printf("%s: n_codes: %zu\n", __func__, codes.size());
}
int64_t t_start = ggml_time_ms();
int n_frames = 0;
int n_past = 0;
for (size_t i = 0; i < codes.size(); i += mimi_config.n_codes_per_frame) {
size_t remaining = std::min((size_t)mimi_config.n_codes_per_frame, codes.size() - i);
std::vector<int> frame(codes.begin() + i, codes.begin() + i + remaining);
auto wav_data = decode_frame(frame, n_past);
output.insert(output.end(), wav_data.begin(), wav_data.end());
n_frames++;
}
int64_t t_end = ggml_time_ms();
if (verbose) {
printf("%s: n_frames: %d, time: %" PRId64 "ms, per_frame: %" PRId64 "ms\n", __func__, n_frames, t_end - t_start, (t_end - t_start) / n_frames);
}
return output;
}
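// converts frame-major input [s_0, a1_0, ..., a31_0, s_1, a1_1, ...] into the
// component-major layout [s_0, s_1, ..., a1_0, a1_1, ...] expected by the RVQ decoder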
std::vector<int> mimi_model::transpose_input(const std::vector<int> & codes) {
int n_codes = codes.size();
int n_codes_per_embd = mimi_config.n_semantic_components + mimi_config.n_acoustic_components;
    GGML_ASSERT(n_codes % n_codes_per_embd == 0 && "number of codes must be a multiple of n_codes_per_embd");
std::vector<int> codes_T(n_codes);
for (int i = 0; i < n_codes / n_codes_per_embd; i++) {
for (int j = 0; j < n_codes_per_embd; j++) {
int src_idx = i * n_codes_per_embd + j;
int dst_idx = j * (n_codes / n_codes_per_embd) + i;
codes_T[dst_idx] = codes[src_idx];
}
}
return codes_T;
}
int mimi_model::get_sample_rate() const {
return mimi_config.sample_rate;
}