#include "ggml.h"
#include "ggml-cpp.h"
#include "ggml-cpu.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include "gguf.h"

#include "common.h"
#include "mimi-model.h"

#include <algorithm>
#include <array>
#include <cfloat>
#include <cinttypes>
#include <climits>
#include <cmath>
#include <cstdarg>
#include <cstdint>
#include <cstdio>
#include <fstream>
#include <functional>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
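
// Hyperparameters for the Mimi neural audio codec. The default values follow
// the reference Mimi configuration: 24 kHz mono audio at 12.5 frames per
// second, quantized with 1 semantic + 31 acoustic RVQ codebooks.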
struct mimi_config_t {
    bool causal = true;
    int sample_rate = 24000;
    int max_position_embeddings = 8000;
    int num_hidden_layers = 8;
    int n_embd = 512;
    int n_ffn = 2048;
    int n_head = 8;
    int n_head_kv = 8;
    int n_rot = 64;
    float norm_eps = 1e-5f;
    float rope_theta = 10000.0f;
    int sliding_window = 250;
    std::array<int, 4> upsampling_ratio   = {8, 6, 5, 4}; // decoder (transposed conv) strides
    std::array<int, 4> downsampling_ratio = {4, 5, 6, 8}; // encoder strides, reverse order

    float frame_rate = 12.5f; // audio frames per second
    int audio_channels = 1;   // mono
    int codebook_size = 2048; // entries per RVQ codebook
    int codebook_dim = 256;   // dimension of each codebook entry
    int n_semantic_components = 1;
    int n_acoustic_components = 31;

    // fraction of the transposed-conv padding trimmed from the right (causal mode)
    float trim_right_ratio = 1.0f;
    // codes per decode chunk: 125 frames x 32 codebooks; after the 2x upsample
    // the resulting positions fill the 250-position sliding window
    int n_codes_per_frame = (sliding_window / 2) * (n_semantic_components + n_acoustic_components);
} mimi_config;
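
// Owns all ggml/gguf state shared by the model: the weight tensors loaded
// from the GGUF file, the CPU backend and scheduler, and the metadata buffer
// that compute graphs are built in.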
struct mimi_ggml_ctx {
    gguf_context * ctx_gguf = nullptr;
    ggml_context * ctx_data = nullptr; // holds the weight tensors
    ggml_context * ctx_gf   = nullptr; // holds the current compute graph

    ggml_backend_t        backend = nullptr;
    ggml_backend_buffer_t buf     = nullptr;
    ggml_backend_sched_ptr sched;

    ggml_cgraph * gf = nullptr;
    std::vector<uint8_t> buf_compute_meta;
    int max_nodes = 16 * 1024;

    std::unordered_map<std::string, ggml_tensor *> tensors;

    mimi_ggml_ctx() {
        backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
        GGML_ASSERT(backend != nullptr && "failed to initialize CPU backend");
        auto buft = ggml_backend_get_default_buffer_type(backend);
        sched.reset(
            ggml_backend_sched_new(&backend, &buft, 1, max_nodes, false)
        );
        buf_compute_meta.resize(max_nodes * ggml_tensor_overhead() + ggml_graph_overhead());
    }
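
    // Load the GGUF file: duplicate the tensor metadata, allocate backend
    // memory, then stream every tensor's data from the file.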
    void load_gguf(const char * fname) {
        ggml_context * meta = nullptr;

        gguf_init_params params = {
            /*.no_alloc =*/ true,
            /*.ctx      =*/ &meta,
        };

        ctx_gguf = gguf_init_from_file(fname, params);
        if (!ctx_gguf) {
            throw std::runtime_error("cannot open GGUF model file");
        }

        const int n_tensors = gguf_get_n_tensors(ctx_gguf);

        // create a data context big enough for the metadata of every tensor
        std::vector<uint8_t> read_buf;
        ggml_init_params ggml_params = {
            /*.mem_size   =*/ (n_tensors + 1) * ggml_tensor_overhead(),
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ true,
        };

        ctx_data = ggml_init(ggml_params);
        auto fin = std::ifstream(fname, std::ios::binary);
        if (!fin) {
            ggml_free(meta);
            throw std::runtime_error("cannot open model file for loading tensors");
        }

        // duplicate the tensor metadata into ctx_data and index it by name
        for (int i = 0; i < n_tensors; ++i) {
            const char * name = gguf_get_tensor_name(ctx_gguf, i);
            ggml_tensor * t   = ggml_get_tensor(meta, name);
            ggml_tensor * cur = ggml_dup_tensor(ctx_data, t);
            ggml_set_name(cur, name);
            tensors.insert({name, cur});
        }

        // allocate backend memory and copy the tensor data from the file
        ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend);
        buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx_data, buft);
        ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
        for (int i = 0; i < n_tensors; ++i) {
            const char * name = gguf_get_tensor_name(ctx_gguf, i);
            ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
            const size_t offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i);

            fin.seekg(offset, std::ios::beg);
            if (!fin) {
                ggml_free(meta);
                throw std::runtime_error(string_format("failed to seek for tensor: %s", name));
            }
            const size_t num_bytes = ggml_nbytes(cur);
            if (ggml_backend_buft_is_host(buft)) {
                // host buffer: read directly into the tensor
                fin.read(reinterpret_cast<char *>(cur->data), num_bytes);
            } else {
                // non-host buffer: stage through a temporary, then upload
                read_buf.resize(num_bytes);
                fin.read(reinterpret_cast<char *>(read_buf.data()), num_bytes);
                ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
            }
        }
        printf("%s: loaded %d tensors from %s\n", __func__, n_tensors, fname);
        fin.close();

        ggml_free(meta);
    }
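
    // Rebuild the compute graph: reset the graph context and scheduler,
    // construct a fresh graph via builder_fn, then allocate it.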
    void build_graph(std::function<void(ggml_context *, ggml_cgraph *)> builder_fn) {
        ggml_free(ctx_gf); // discard the previous graph context, if any
        struct ggml_init_params params = {
            /*.mem_size   =*/ buf_compute_meta.size(),
            /*.mem_buffer =*/ buf_compute_meta.data(),
            /*.no_alloc   =*/ true,
        };

        ctx_gf = ggml_init(params);
        ggml_backend_sched_reset(sched.get());
        gf = ggml_new_graph_custom(ctx_gf, max_nodes, false);

        builder_fn(ctx_gf, gf);
        ggml_backend_sched_alloc_graph(sched.get(), gf);
    }

    // run the current graph on the scheduler
    ggml_status compute() {
        return ggml_backend_sched_graph_compute(sched.get(), gf);
    }
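
    // Copy host data into / out of a named tensor of the current graph.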
    void set_tensor_data(const std::string & name, const void * data) {
        ggml_tensor * t = ggml_get_tensor(ctx_gf, name.c_str());
        if (!t) {
            throw std::runtime_error(string_format("tensor not found: %s", name.c_str()));
        }
        ggml_backend_tensor_set(t, data, 0, ggml_nbytes(t));
    }

    std::pair<ggml_tensor *, std::vector<uint8_t>> get_tensor_data(const std::string & name) {
        ggml_tensor * t = ggml_get_tensor(ctx_gf, name.c_str());
        if (!t) {
            throw std::runtime_error(string_format("tensor not found: %s", name.c_str()));
        }
        std::vector<uint8_t> data(ggml_nbytes(t));
        ggml_backend_tensor_get(t, data.data(), 0, ggml_nbytes(t));
        return std::make_pair(t, data);
    }
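
    // Look up a weight tensor by printf-style formatted name; throws if it is missing.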
    ggml_tensor * get_weight(const char * fmt, ...) {
        std::vector<char> str(128);
        va_list va;
        va_start(va, fmt);
        vsnprintf(str.data(), str.size(), fmt, va);
        va_end(va);
        auto it = tensors.find(str.data());
        if (it == tensors.end()) {
            throw std::runtime_error(string_format("weight tensor not found: %s", str.data()));
        }
        return it->second;
    }

    ~mimi_ggml_ctx() {
        ggml_free(ctx_gf);
        ggml_free(ctx_data);
        gguf_free(ctx_gguf);
        ggml_backend_buffer_free(buf);
    }
};
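
// Pad a 2D tensor along dim 0. mode 0 = constant (zero) padding, mode 2 =
// replicate (edge) padding. Only value == 0.0f is supported.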
static ggml_tensor * ggml_pad_ext(ggml_context * ctx0, ggml_tensor * x, int mode,
        int64_t pad_left, int64_t pad_right, float value = 0.0f) {
    GGML_ASSERT(value == 0.0f);
    GGML_ASSERT(mode == 0 || mode == 2);
    if (pad_left > 0) {
        ggml_tensor * tmp = ggml_new_tensor_2d(ctx0, x->type, pad_left, x->ne[1]);
        if (mode == 0) {
            // zero-fill: scaling the uninitialized tensor by value (== 0) clears it
            tmp = ggml_scale(ctx0, tmp, value);
        } else if (mode == 2) {
            // replicate the first column
            ggml_tensor * elem = ggml_view_2d(ctx0, x, 1, x->ne[1], x->nb[1], 0);
            tmp = ggml_repeat(ctx0, elem, tmp);
        }
        x = ggml_concat(ctx0, tmp, x, 0);
    }
    if (pad_right > 0) {
        ggml_tensor * tmp = ggml_new_tensor_2d(ctx0, x->type, pad_right, x->ne[1]);
        if (mode == 0) {
            tmp = ggml_scale(ctx0, tmp, value);
        } else if (mode == 2) {
            // replicate the last column
            int64_t last = x->ne[0] - 1;
            ggml_tensor * elem = ggml_view_2d(ctx0, x, 1, x->ne[1], x->nb[1], last * ggml_element_size(x));
            tmp = ggml_repeat(ctx0, elem, tmp);
        }
        x = ggml_concat(ctx0, x, tmp, 0);
    }
    return x;
}

// ceiling division for non-negative integers
static int64_t div_ceil(int64_t a, int64_t b) {
    return a / b + (a % b ? 1 : 0);
}
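
// 1D convolution with Mimi-style "same" padding: in causal mode all base
// padding goes on the left; otherwise it is split evenly. Extra right padding
// is added so the input length maps to a whole number of output frames.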
static ggml_tensor * mimi_conv_1d(ggml_context * ctx0, ggml_tensor * x,
        ggml_tensor * kernel, ggml_tensor * bias, int stride, int dilation, bool pad_zero = true) {
    int64_t kernel_size = (kernel->ne[0] - 1) * dilation + 1; // effective (dilated) kernel size
    int64_t p_total = kernel_size - stride;
    int64_t p_half  = p_total / 2;

    int64_t n_frames  = div_ceil(x->ne[0] - kernel_size + p_total, stride);
    int64_t ideal_len = n_frames * stride + kernel_size - p_total;
    int64_t p_extra   = ideal_len - x->ne[0];

    int64_t p_right = (mimi_config.causal ? 0 : p_half) + p_extra;
    int64_t p_left  = p_total - (mimi_config.causal ? 0 : p_half);

    // mode 0 = zero padding, mode 2 = replicate padding
    x = ggml_pad_ext(ctx0, x, pad_zero ? 0 : 2, p_left, p_right);

    x = ggml_conv_1d(ctx0, kernel, x, stride, 0, dilation);
    if (bias) {
        x = ggml_add(ctx0, x, bias);
    }
    ggml_set_name(x, "mimi_conv_1d");
    return x;
}
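
// Transposed 1D convolution with output trimming. In causal mode the excess
// output is trimmed from the right (controlled by trim_right_ratio), otherwise
// it is trimmed symmetrically. With depthwise set, one kernel row is applied
// per input channel.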
static ggml_tensor * mimi_conv_transpose_1d(ggml_context * ctx0, ggml_tensor * x,
        ggml_tensor * kernel, ggml_tensor * bias, int stride, int dilation, bool depthwise) {
    GGML_ASSERT(x->ne[1] == kernel->ne[2]);
    int64_t n_rows = x->ne[1];
    int64_t kernel_size = kernel->ne[0];
    int64_t p_total = kernel_size - stride;

    int64_t p_right = mimi_config.causal
        ? (float)p_total / mimi_config.trim_right_ratio
        : p_total / 2;
    int64_t p_left = p_total - p_right;

    ggml_tensor * out = nullptr;

    if (depthwise) {
        // ggml has no grouped transposed conv, so process one channel at a time
        for (int64_t ir = 0; ir < n_rows; ir++) {
            ggml_tensor * row = ggml_view_1d(ctx0, x,
                x->ne[0], ir*x->ne[0]*ggml_element_size(x));
            ggml_tensor * krn = ggml_view_1d(ctx0, kernel,
                kernel->ne[0], ir*kernel->ne[0]*ggml_element_size(kernel));
            row = ggml_conv_transpose_1d(ctx0, krn, row, stride, 0, dilation);

            // trim the padded region
            row = ggml_view_1d(ctx0, row, row->ne[0] - p_total, p_left*ggml_element_size(row));

            out = out ? ggml_concat(ctx0, out, row, 1) : row;
        }
    } else {
        out = ggml_conv_transpose_1d(ctx0, kernel, x, stride, 0, dilation);

        // trim the padded region
        out = ggml_view_2d(ctx0, out,
            out->ne[0] - p_total, out->ne[1],
            out->nb[1], p_left*ggml_element_size(out));
    }

    if (bias) {
        out = ggml_add(ctx0, out, bias);
    }

    return out;
}
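
// SEANet decoder (only the decoder half is wired up here): an initial
// convolution, then one (ELU, transposed-conv upsample, residual unit) group
// per upsampling ratio, and a final ELU + projection to the waveform.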
struct mimi_encoder_decoder {
    mimi_ggml_ctx & ctx;
    struct layer {
        bool is_elu             = false;
        bool is_resnet          = false;
        bool is_transposed_conv = false;
        ggml_tensor * conv_0_w = nullptr;
        ggml_tensor * conv_0_b = nullptr;
        ggml_tensor * conv_1_w = nullptr;
        ggml_tensor * conv_1_b = nullptr;
        int stride = 1;
    };
    std::vector<layer> layers;

    // index of the first layer of each (ELU, upsample, resnet) group
    std::array<int, 4> repeated_pattern = {1, 4, 7, 10};

    mimi_encoder_decoder(mimi_ggml_ctx & ctx): ctx(ctx) {
        layers.push_back({
            .conv_0_w = ctx.get_weight("decoder.layers.0.conv.weight"),
            .conv_0_b = ctx.get_weight("decoder.layers.0.conv.bias"),
        });
        for (int i = 0; i < (int)repeated_pattern.size(); ++i) {
            int i_start = repeated_pattern[i];

            layers.push_back({
                .is_elu = true,
            });
            layers.push_back({
                .is_transposed_conv = true,
                .conv_0_w = ctx.get_weight("decoder.layers.%d.conv.weight", i_start + 1),
                .conv_0_b = ctx.get_weight("decoder.layers.%d.conv.bias",   i_start + 1),
                .stride   = mimi_config.upsampling_ratio[i],
            });

            // residual unit: two convolutions with a skip connection
            layers.push_back({
                .is_resnet = true,
                .conv_0_w = ctx.get_weight("decoder.layers.%d.block.1.conv.weight", i_start + 2),
                .conv_0_b = ctx.get_weight("decoder.layers.%d.block.1.conv.bias",   i_start + 2),
                .conv_1_w = ctx.get_weight("decoder.layers.%d.block.3.conv.weight", i_start + 2),
                .conv_1_b = ctx.get_weight("decoder.layers.%d.block.3.conv.bias",   i_start + 2),
            });
        }
        layers.push_back({
            .is_elu = true,
        });
        layers.push_back({
            .conv_0_w = ctx.get_weight("decoder.layers.14.conv.weight"),
            .conv_0_b = ctx.get_weight("decoder.layers.14.conv.bias"),
        });
    }
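
    // Apply the layer stack; input is [length, channels] and the result is
    // the decoded waveform tensor.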
    ggml_tensor * forward(ggml_context * ctx0, ggml_tensor * input) {
        ggml_tensor * x = input;

        for (auto & layer : layers) {
            if (layer.is_elu) {
                x = ggml_elu(ctx0, x);
            } else if (layer.is_resnet) {
                ggml_tensor * residual = x;
                x = ggml_elu(ctx0, x);
                x = mimi_conv_1d(ctx0, x, layer.conv_0_w, layer.conv_0_b, 1, 1);
                x = ggml_elu(ctx0, x);
                x = mimi_conv_1d(ctx0, x, layer.conv_1_w, layer.conv_1_b, 1, 1);
                x = ggml_add(ctx0, x, residual);
            } else {
                x = layer.is_transposed_conv
                    ? mimi_conv_transpose_1d(ctx0, x, layer.conv_0_w, layer.conv_0_b, layer.stride, 1, false)
                    : mimi_conv_1d(ctx0, x, layer.conv_0_w, layer.conv_0_b, layer.stride, 1);
            }
        }

        return x;
    }
};
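
// Pre-norm transformer with RoPE self-attention, per-layer LayerScale, and a
// GELU MLP, applied to the latent sequence before the SEANet decoder.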
struct mimi_transformer {
    struct layer {
        ggml_tensor * inp_norm_w = nullptr;
        ggml_tensor * inp_norm_b = nullptr;

        ggml_tensor * attn_q = nullptr;
        ggml_tensor * attn_k = nullptr;
        ggml_tensor * attn_v = nullptr;
        ggml_tensor * attn_o = nullptr;
        ggml_tensor * attn_post_norm_w = nullptr;
        ggml_tensor * attn_post_norm_b = nullptr;
        ggml_tensor * attn_layer_scale = nullptr;

        ggml_tensor * ffn_up   = nullptr;
        ggml_tensor * ffn_down = nullptr;
        ggml_tensor * mlp_layer_scale = nullptr;
    };
    std::vector<layer> layers;

    mimi_transformer(mimi_ggml_ctx & ctx, const char * prefix, int n_layers) {
        for (int il = 0; il < n_layers; il++) {
            layers.push_back({
                .inp_norm_w = ctx.get_weight("%s_transformer.layers.%d.input_layernorm.weight", prefix, il),
                .inp_norm_b = ctx.get_weight("%s_transformer.layers.%d.input_layernorm.bias",   prefix, il),

                .attn_q = ctx.get_weight("%s_transformer.layers.%d.self_attn.q_proj.weight", prefix, il),
                .attn_k = ctx.get_weight("%s_transformer.layers.%d.self_attn.k_proj.weight", prefix, il),
                .attn_v = ctx.get_weight("%s_transformer.layers.%d.self_attn.v_proj.weight", prefix, il),
                .attn_o = ctx.get_weight("%s_transformer.layers.%d.self_attn.o_proj.weight", prefix, il),
                .attn_post_norm_w = ctx.get_weight("%s_transformer.layers.%d.post_attention_layernorm.weight", prefix, il),
                .attn_post_norm_b = ctx.get_weight("%s_transformer.layers.%d.post_attention_layernorm.bias",   prefix, il),
                .attn_layer_scale = ctx.get_weight("%s_transformer.layers.%d.self_attn_layer_scale.scale",     prefix, il),

                .ffn_up   = ctx.get_weight("%s_transformer.layers.%d.mlp.fc1.weight", prefix, il),
                .ffn_down = ctx.get_weight("%s_transformer.layers.%d.mlp.fc2.weight", prefix, il),
                .mlp_layer_scale = ctx.get_weight("%s_transformer.layers.%d.mlp_layer_scale.scale", prefix, il),
            });
        }
    }
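
    // input is [n_embd, n_tokens]; inp_pos holds the absolute position of
    // each token, used by RoPE.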
    ggml_tensor * forward(ggml_context * ctx0, ggml_tensor * input, ggml_tensor * inp_pos) {
        int n_tokens = input->ne[1];
        ggml_tensor * x = input;

        auto layer_norm = [&](ggml_tensor * x, ggml_tensor * w, ggml_tensor * b) {
            x = ggml_norm(ctx0, x, mimi_config.norm_eps);
            x = ggml_mul(ctx0, x, w);
            x = ggml_add(ctx0, x, b);
            return x;
        };

        ggml_tensor * residual = input;

        for (auto & layer : layers) {
            residual = x;

            x = layer_norm(x, layer.inp_norm_w, layer.inp_norm_b);

            // self-attention
            {
                ggml_tensor * q = ggml_mul_mat(ctx0, layer.attn_q, x);
                ggml_tensor * k = ggml_mul_mat(ctx0, layer.attn_k, x);
                ggml_tensor * v = ggml_mul_mat(ctx0, layer.attn_v, x);

                int n_embd_head = mimi_config.n_embd / mimi_config.n_head;
                q = ggml_reshape_3d(ctx0, q, n_embd_head, mimi_config.n_head,    n_tokens);
                k = ggml_reshape_3d(ctx0, k, n_embd_head, mimi_config.n_head_kv, n_tokens);
                v = ggml_reshape_3d(ctx0, v, n_embd_head, mimi_config.n_head_kv, n_tokens);

                // RoPE (ggml's default freq_base of 10000 matches mimi_config.rope_theta)
                int n_rot = n_embd_head;
                q = ggml_rope_inplace(ctx0, q, inp_pos, n_rot, 0);
                q = ggml_cont(ctx0, ggml_permute(ctx0, q, 0, 2, 1, 3));

                k = ggml_rope_inplace(ctx0, k, inp_pos, n_rot, 0);
                k = ggml_cont(ctx0, ggml_permute(ctx0, k, 0, 2, 1, 3));

                ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
                ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
                kq = ggml_scale_inplace(ctx0, kq, 1.0f / std::sqrt(n_embd_head));
                ggml_tensor * kq_masked = ggml_diag_mask_inf_inplace(ctx0, kq, n_tokens);
                kq = ggml_soft_max_inplace(ctx0, kq_masked);

                v = ggml_cont(ctx0, ggml_permute(ctx0, v, 1, 2, 0, 3));

                ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq);
                kqv = ggml_reshape_3d(ctx0, kqv, n_embd_head, n_tokens, mimi_config.n_head);
                kqv = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
                kqv = ggml_cont_2d(ctx0, kqv, mimi_config.n_embd, n_tokens);

                x = ggml_mul_mat(ctx0, layer.attn_o, kqv);
            }

            // LayerScale + residual
            x = ggml_mul(ctx0, x, layer.attn_layer_scale);
            x = ggml_add(ctx0, x, residual);

            residual = x;
            x = layer_norm(x, layer.attn_post_norm_w, layer.attn_post_norm_b);

            // MLP
            {
                x = ggml_mul_mat(ctx0, layer.ffn_up, x);
                x = ggml_gelu(ctx0, x);
                x = ggml_mul_mat(ctx0, layer.ffn_down, x);
            }

            // LayerScale + residual
            x = ggml_mul(ctx0, x, layer.mlp_layer_scale);
            x = ggml_add(ctx0, x, residual);
        }

        return x;
    }
};
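
// Residual vector quantizer (decode side). Each group (semantic, acoustic)
// sums the embeddings looked up from its codebooks and applies an output
// projection; the final embedding is the sum of the two groups.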
struct mimi_residual_vector_quantizer {
    struct component {
        ggml_tensor * codebook;
    };

    ggml_tensor * semantic_inp_proj;
    std::vector<component> semantic_components;
    ggml_tensor * semantic_out_proj;

    ggml_tensor * acoustic_inp_proj;
    std::vector<component> acoustic_components;
    ggml_tensor * acoustic_out_proj;

    mimi_residual_vector_quantizer(mimi_ggml_ctx & ctx) {
        semantic_inp_proj = ctx.get_weight("quantizer.semantic_rvq.input_proj.weight");
        semantic_out_proj = ctx.get_weight("quantizer.semantic_rvq.output_proj.weight");
        for (int i = 0; i < mimi_config.n_semantic_components; i++) {
            semantic_components.push_back({
                .codebook = ctx.get_weight("quantizer.semantic_rvq.layers.%d.codebook", i),
            });
        }
        acoustic_inp_proj = ctx.get_weight("quantizer.acoustic_rvq.input_proj.weight");
        acoustic_out_proj = ctx.get_weight("quantizer.acoustic_rvq.output_proj.weight");
        for (int i = 0; i < mimi_config.n_acoustic_components; i++) {
            acoustic_components.push_back({
                .codebook = ctx.get_weight("quantizer.acoustic_rvq.layers.%d.codebook", i),
            });
        }
    }
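
    // input holds I32 code indices in codebook-major layout (see
    // mimi_model::transpose_input); returns the dequantized embeddings.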
    ggml_tensor * decode(ggml_context * ctx0, ggml_tensor * input) {
        GGML_ASSERT(input->type == GGML_TYPE_I32);

        size_t n_semantic = semantic_components.size();
        int64_t n_codes_per_embd = n_semantic + acoustic_components.size();
        int64_t n_codes = input->ne[0] / n_codes_per_embd;

        GGML_ASSERT(input->ne[0] % n_codes_per_embd == 0);

        ggml_tensor * out_s = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, mimi_config.codebook_dim, n_codes);
        ggml_tensor * out_a = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, mimi_config.codebook_dim, n_codes);
        // zero-initialize the accumulators (scale-by-0 trick)
        out_s = ggml_scale(ctx0, out_s, 0.0f);
        out_a = ggml_scale(ctx0, out_a, 0.0f);

        for (size_t ir = 0; ir < (size_t)n_codes_per_embd; ir++) {
            ggml_tensor * row = ggml_view_1d(ctx0, input, n_codes, ir*n_codes*ggml_element_size(input));
            if (ir < n_semantic) {
                // semantic codebook lookup
                ggml_tensor * codebook = semantic_components[ir].codebook;
                ggml_tensor * embd = ggml_get_rows(ctx0, codebook, row);
                out_s = ggml_add(ctx0, out_s, embd);
            } else {
                // acoustic codebook lookup
                ggml_tensor * codebook = acoustic_components[ir - n_semantic].codebook;
                ggml_tensor * embd = ggml_get_rows(ctx0, codebook, row);
                out_a = ggml_add(ctx0, out_a, embd);
            }
        }

        out_s = ggml_mul_mat(ctx0, semantic_out_proj, out_s);
        out_a = ggml_mul_mat(ctx0, acoustic_out_proj, out_a);

        return ggml_add(ctx0, out_s, out_a);
    }
};

mimi_model::mimi_model(const char * fname, bool verbose) : verbose(verbose) {
    ctx.reset(new mimi_ggml_ctx());
    ctx->load_gguf(fname);

    seanet_dec     .reset(new mimi_encoder_decoder(*ctx));
    transformer_dec.reset(new mimi_transformer(*ctx, "decoder", mimi_config.num_hidden_layers));
    quantizer      .reset(new mimi_residual_vector_quantizer(*ctx));
}

// out-of-line so the unique_ptr members can see the complete definitions of
// the implementation types when they are destroyed
mimi_model::~mimi_model() {
}
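
// Decode one chunk of RVQ codes (frame-major) into PCM samples. n_past tracks
// the absolute latent position across successive chunks.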
std::vector<float> mimi_model::decode_frame(const std::vector<int> & codes, int & n_past) {
    int n_pos = -1;
    int n_codes = codes.size();
    int n_codes_per_embd = mimi_config.n_semantic_components + mimi_config.n_acoustic_components;
    GGML_ASSERT(n_codes % n_codes_per_embd == 0 && "number of codes must be a multiple of n_codes_per_embd");

    ctx->build_graph([&](ggml_context * ctx_gf, ggml_cgraph * gf) {
        ggml_tensor * inp_dec = ggml_new_tensor_1d(ctx_gf, GGML_TYPE_I32, n_codes);
        ggml_set_name(inp_dec, "inp_dec");
        ggml_set_input(inp_dec);

        // codes -> dequantized embeddings
        ggml_tensor * embeddings = quantizer->decode(ctx_gf, inp_dec);

        // 2x upsample with a depthwise transposed convolution
        embeddings = ggml_cont(ctx_gf, ggml_transpose(ctx_gf, embeddings));
        embeddings = mimi_conv_transpose_1d(ctx_gf, embeddings, ctx->get_weight("upsample.conv.weight"), nullptr, 2, 1, true);

        // transformer over the latent sequence
        n_pos = embeddings->ne[0];
        ggml_tensor * pos_dec = ggml_new_tensor_1d(ctx_gf, GGML_TYPE_I32, n_pos);
        ggml_set_name(pos_dec, "pos_dec");
        ggml_set_input(pos_dec);
        embeddings = ggml_cont(ctx_gf, ggml_transpose(ctx_gf, embeddings));
        embeddings = transformer_dec->forward(ctx_gf, embeddings, pos_dec);

        // SEANet decoder -> waveform
        embeddings = ggml_cont(ctx_gf, ggml_transpose(ctx_gf, embeddings));
        ggml_tensor * output = seanet_dec->forward(ctx_gf, embeddings);

        ggml_set_name(output, "output");
        ggml_set_output(output);
        ggml_build_forward_expand(gf, output);
    });

    // positions continue from the previous chunk
    GGML_ASSERT(n_pos <= mimi_config.sliding_window);
    std::vector<int> pos_data(n_pos);
    for (int i = 0; i < (int)pos_data.size(); i++) {
        pos_data[i] = i + n_past;
    }
    if (verbose) {
        printf("%s: n_pos: %d, n_past: %d\n", __func__, n_pos, n_past);
    }
    n_past += n_pos;
    ctx->set_tensor_data("pos_dec", pos_data.data());

    // the quantizer expects codebook-major layout
    auto codes_T = mimi_model::transpose_input(codes);
    ctx->set_tensor_data("inp_dec", codes_T.data());

    ggml_status status = ctx->compute();
    GGML_ASSERT(status == GGML_STATUS_SUCCESS && "graph computation failed");

    auto output = ctx->get_tensor_data("output");
    auto output_data = output.second;

    // reinterpret the raw bytes as float samples
    std::vector<float> wav_data(output_data.size() / sizeof(float));
    for (size_t i = 0; i < wav_data.size(); i++) {
        wav_data[i] = ((float *)output_data.data())[i];
    }

    return wav_data;
}
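
// Decode an arbitrary-length code sequence by splitting it into chunks of at
// most n_codes_per_frame codes and concatenating the decoded audio.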
std::vector<float> mimi_model::decode(const std::vector<int> & codes) {
    std::vector<float> output;

    if (verbose) {
        printf("%s: n_codes: %zu\n", __func__, codes.size());
    }

    int64_t t_start = ggml_time_ms();
    int n_frames = 0;

    int n_past = 0;
    for (size_t i = 0; i < codes.size(); i += mimi_config.n_codes_per_frame) {
        size_t remaining = std::min((size_t)mimi_config.n_codes_per_frame, codes.size() - i);
        std::vector<int> frame(codes.begin() + i, codes.begin() + i + remaining);

        auto wav_data = decode_frame(frame, n_past);
        output.insert(output.end(), wav_data.begin(), wav_data.end());

        n_frames++;
    }

    int64_t t_end = ggml_time_ms();
    if (verbose && n_frames > 0) {
        printf("%s: n_frames: %d, time: %" PRId64 "ms, per_frame: %" PRId64 "ms\n", __func__, n_frames, t_end - t_start, (t_end - t_start) / n_frames);
    }

    return output;
}
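
// Reorder codes from frame-major (all codebook entries of frame 0, then
// frame 1, ...) to codebook-major (all frames for codebook 0, then codebook
// 1, ...), the layout the quantizer consumes. For example, with 2 frames and
// 3 codebooks, {a0, b0, c0, a1, b1, c1} becomes {a0, a1, b0, b1, c0, c1}
// (letters index codebooks, digits index frames).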
std::vector<int> mimi_model::transpose_input(const std::vector<int> & codes) {
    int n_codes = codes.size();
    int n_codes_per_embd = mimi_config.n_semantic_components + mimi_config.n_acoustic_components;
    GGML_ASSERT(n_codes % n_codes_per_embd == 0 && "number of codes must be a multiple of n_codes_per_embd");

    std::vector<int> codes_T(n_codes);
    for (int i = 0; i < n_codes / n_codes_per_embd; i++) {
        for (int j = 0; j < n_codes_per_embd; j++) {
            int src_idx = i * n_codes_per_embd + j;
            int dst_idx = j * (n_codes / n_codes_per_embd) + i;
            codes_T[dst_idx] = codes[src_idx];
        }
    }

    return codes_T;
}

int mimi_model::get_sample_rate() const {
    return mimi_config.sample_rate;
}
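
// Example usage (illustrative sketch only; "mimi.gguf" is a hypothetical GGUF
// conversion of the Mimi weights, and the codes would come from an upstream
// model, frame-major with 32 codes per frame):
//
//   mimi_model model("mimi.gguf", /*verbose=*/true);
//   std::vector<int> codes = { /* ... RVQ codes ... */ };
//   std::vector<float> wav = model.decode(codes);
//   // wav now holds mono PCM at model.get_sample_rate() (24000 Hz)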