| #include "llama-expert-cache-ctx.h" |
| #include "llama-model.h" |
| #include "llama-hparams.h" |
|
|
| #include "ggml.h" |
| #include "ggml-backend.h" |
|
|
| #include <cstdlib> |
| #include <cstring> |
| #include <set> |
| #include <algorithm> |
|
|
| #if !defined(_WIN32) |
| #include <sys/mman.h> |
| #endif |
|
|
| #ifndef MADV_WILLNEED |
| #define MADV_WILLNEED 3 |
| #endif |
|
|
| |
| void llama_expert_cache_ctx::init(const llama_model & model, size_t cache_bytes) { |
| const auto & hparams = model.hparams; |
|
|
| n_expert = (int)hparams.n_expert; |
| n_expert_used = (int)hparams.n_expert_used; |
| n_layers = (int)hparams.n_layer; |
|
|
| if (n_expert == 0 || n_expert_used == 0) { |
| |
| return; |
| } |
|
|
| |
| cache = std::make_unique<llama_expert_cache>(cache_bytes); |
|
|
| |
| expert_tensors.resize(n_layers); |
| expert_strides.resize(n_layers); |
|
|
| for (int il = 0; il < n_layers; il++) { |
| const auto & layer = model.layers[il]; |
|
|
| expert_tensors[il] = { |
| layer.ffn_up_exps, |
| layer.ffn_gate_exps, |
| layer.ffn_down_exps, |
| }; |
|
|
| for (int wt = 0; wt < 3; wt++) { |
| ggml_tensor * t = expert_tensors[il][wt]; |
| if (t && t->ne[2] > 1) { |
| |
| expert_strides[il][wt] = t->nb[2]; |
| } else { |
| expert_strides[il][wt] = 0; |
| } |
| } |
| } |
|
|
| |
| |
| size_t max_stride = 0; |
| for (int il = 0; il < n_layers; il++) { |
| for (int wt = 0; wt < 3; wt++) { |
| max_stride = std::max(max_stride, expert_strides[il][wt]); |
| } |
| } |
| active_buffer_size = (size_t)n_expert_used * max_stride; |
| active_buffer = malloc(active_buffer_size); |
|
|
| GGML_ASSERT(active_buffer != nullptr); |
|
|
| fprintf(stderr, "llama_expert_cache_ctx: initialized for %d layers, %d experts (%d used), " |
| "cache = %.1f MB, stride = %.2f MB\n", |
| n_layers, n_expert, n_expert_used, |
| (double)cache_bytes / (1024*1024), |
| (double)max_stride / (1024*1024)); |
| } |
|
|
| std::pair<int, int> llama_expert_cache_ctx::identify_tensor(const ggml_tensor * t) const { |
| for (int il = 0; il < n_layers; il++) { |
| for (int wt = 0; wt < 3; wt++) { |
| if (expert_tensors[il][wt] == t) { |
| return {il, wt}; |
| } |
| } |
| } |
| return {-1, -1}; |
| } |
|
|
| void * llama_expert_cache_ctx::build_active_buffer( |
| int layer, int weight_type, |
| const int32_t * expert_ids, int n_ids) { |
|
|
| const size_t stride = expert_strides[layer][weight_type]; |
| const ggml_tensor * stacked = expert_tensors[layer][weight_type]; |
|
|
| if (!stacked || stride == 0) return nullptr; |
|
|
| |
| char * dst = (char *)active_buffer; |
| for (int i = 0; i < n_ids; i++) { |
| int eid = expert_ids[i]; |
| if (eid < 0 || eid >= n_expert) continue; |
|
|
| llama_expert_key key = {(int32_t)layer, (int32_t)eid, (int32_t)weight_type}; |
|
|
| const char * expert_src = nullptr; |
|
|
| if (cache) { |
| auto [buf, hit] = cache->get_or_alloc(key, stride); |
| if (buf) { |
| if (!hit) { |
| |
| |
| |
| const char * src = (const char *)stacked->data + (size_t)eid * stride; |
| memcpy(buf, src, stride); |
| } |
| expert_src = (const char *)buf; |
| } |
| } |
|
|
| if (!expert_src) { |
| |
| expert_src = (const char *)stacked->data + (size_t)eid * stride; |
| } |
|
|
| memcpy(dst, expert_src, stride); |
| dst += stride; |
| } |
|
|
| return active_buffer; |
| } |
|
|
| |
| |
| |
| |
// ggml scheduler eval callback: when a MoE expert matmul (GGML_OP_MUL_MAT_ID)
// with host-resident weights is about to run, madvise(MADV_WILLNEED) the
// pages of the selected experts' weight slices so the kernel prefetches them.
// Always returns true (never blocks graph evaluation).
//
// NOTE(review): all work happens in the ask == true phase, i.e. before the
// node is computed - this relies on the expert indices tensor (src[2])
// already holding valid data at that point. TODO confirm against the
// scheduler's callback ordering.
bool llama_expert_cache_ctx::eval_callback(
    struct ggml_tensor * t,
    bool ask,
    void * user_data) {

    // data phase: nothing to do, just let evaluation continue
    if (!ask) {
        return true;
    }

    // only MoE expert matmuls are of interest
    if (t->op != GGML_OP_MUL_MAT_ID) {
        return true;
    }

    auto * ctx = (llama_expert_cache_ctx *)user_data;

    // for GGML_OP_MUL_MAT_ID: src[0] = stacked expert weights,
    // src[2] = selected expert ids
    ggml_tensor * expert_weights = t->src[0];
    ggml_tensor * expert_indices = t->src[2];

    if (!expert_weights || !expert_indices || !ctx->cache) {
        return true;
    }

    // map the weight tensor back to (layer, weight_type); ignore tensors
    // that were not registered in init()
    auto [layer, weight_type] = ctx->identify_tensor(expert_weights);
    if (layer < 0) {
        return true;
    }

    const size_t stride = ctx->expert_strides[layer][weight_type];
    if (stride == 0) {
        return true;
    }

    // madvise only makes sense for host (CPU / mmap) memory; a missing
    // buffer is treated as host-resident
    bool indices_on_host = !expert_indices->buffer ||
        ggml_backend_buffer_is_host(expert_indices->buffer);
    bool weights_on_host = !expert_weights->buffer ||
        ggml_backend_buffer_is_host(expert_weights->buffer);

    if (!indices_on_host || !weights_on_host) {
        // GPU-resident tensors: log the first few skips, then go quiet
        // (static counter is not thread-safe - assumes the scheduler invokes
        // this callback from a single thread; TODO confirm)
        static int skip_count = 0;
        if (++skip_count <= 10) {
            fprintf(stderr, "expert_cache: skip layer %d wt %d (GPU-resident)\n", layer, weight_type);
        } else if (skip_count == 11) {
            fprintf(stderr, "expert_cache: (suppressing further skip messages)\n");
        }
        return true;
    }

    // kill switch for A/B testing: EXPERT_CACHE_NOOP=1 disables the madvise
    // path; the env var is read once and latched
    {
        static int mode = -1;
        if (mode < 0) {
            const char * noop = getenv("EXPERT_CACHE_NOOP");
            mode = (noop && noop[0] == '1') ? 1 : 0;
            fprintf(stderr, "expert_cache: mode=%s\n", mode ? "NOOP" : "MADVISE");
        }
        if (mode == 1) {
            return true;
        }
    }

    // log the first few advise events, then go quiet
    {
        static int advise_count = 0;
        if (++advise_count <= 10) {
            fprintf(stderr, "expert_cache: ADVISE layer %d wt %d (CPU, madvise)\n", layer, weight_type);
        } else if (advise_count == 11) {
            fprintf(stderr, "expert_cache: (suppressing further advise messages)\n");
        }
    }
#if !defined(_WIN32)
    if (expert_indices->data) {
        const int32_t * ids = (const int32_t *)expert_indices->data;
        // NOTE(review): this covers every element of the indices tensor,
        // which may span multiple tokens (n_tokens * n_expert_used entries)
        int n_ids = (int)(ggml_nelements(expert_indices));

        for (int i = 0; i < n_ids; i++) {
            int eid = ids[i];
            if (eid < 0 || eid >= ctx->n_expert) continue;

            // advise the whole slice of expert eid; round the start down to
            // a page boundary (4 KiB assumed) and extend the length by the
            // same amount so the full slice stays covered
            const char * src = (const char *)expert_weights->data + (size_t)eid * stride;
            uintptr_t page_start = (uintptr_t)src & ~(uintptr_t)(4096 - 1);
            size_t advise_len = stride + ((uintptr_t)src - page_start);
            madvise((void *)page_start, advise_len, MADV_WILLNEED);
        }
    }
#endif

    return true;
}
|
|