#pragma once

#include "llama.h"
#include "llama-cparams.h"
#include "llama-graph.h"
#include "llama-adapter.h"

#include "ggml-cpp.h"
#include "ggml-opt.h"

#include <map>
#include <vector>

struct llama_model;
class llama_batch_allocr;

class llama_io_read_i;
class llama_io_write_i;

struct llama_memory_i;
struct llama_memory_context_i;

| | struct llama_context { |
| | |
| | llama_context( |
| | const llama_model & model, |
| | llama_context_params params); |
| |
|
| | ~llama_context(); |
| |
|
| | void synchronize(); |
| |
|
| | const llama_model & get_model() const; |
| | const llama_cparams & get_cparams() const; |
| |
|
| | ggml_backend_sched_t get_sched() const; |
| |
|
| | uint32_t n_ctx() const; |
| | uint32_t n_ctx_per_seq() const; |
| | uint32_t n_batch() const; |
| | uint32_t n_ubatch() const; |
| | uint32_t n_seq_max() const; |
| |
|
| | uint32_t n_threads() const; |
| | uint32_t n_threads_batch() const; |
| |
|
| | llama_memory_t get_memory() const; |
| |
|
| | |
| | |
| | bool kv_self_update(bool optimize); |
| | void kv_self_defrag_sched(); |
| |
|
| | enum llama_pooling_type pooling_type() const; |
| |
|
| | float * get_logits(); |
| | float * get_logits_ith(int32_t i); |
| |
|
| | float * get_embeddings(); |
| | float * get_embeddings_ith(int32_t i); |
| | float * get_embeddings_seq(llama_seq_id seq_id); |
| |
|
| | void attach_threadpool( |
| | ggml_threadpool_t threadpool, |
| | ggml_threadpool_t threadpool_batch); |
| |
|
| | void detach_threadpool(); |
| |
|
| | void set_n_threads(int32_t n_threads, int32_t n_threads_batch); |
| |
|
| | void set_abort_callback(bool (*abort_callback)(void * data), void * abort_callback_data); |
| |
|
| | void set_embeddings (bool value); |
| | void set_causal_attn(bool value); |
| | void set_warmup(bool value); |
| |
|
| | void set_adapter_lora( |
| | llama_adapter_lora * adapter, |
| | float scale); |
| |
|
| | bool rm_adapter_lora( |
| | llama_adapter_lora * adapter); |
| |
|
| | void clear_adapter_lora(); |
| |
|
| | bool apply_adapter_cvec( |
| | const float * data, |
| | size_t len, |
| | int32_t n_embd, |
| | int32_t il_start, |
| | int32_t il_end); |
| |
|
| | |
| | |
| | |
| | |
| | llm_graph_result * process_ubatch( |
| | const llama_ubatch & ubatch, |
| | llm_graph_type gtype, |
| | llama_memory_context_i * mctx, |
| | ggml_status & ret); |
| |
|
| | int encode(const llama_batch & batch_inp); |
| | int decode(const llama_batch & batch_inp); |
| |
|
| | |
| | |
| | |
| |
|
| | size_t state_get_size(); |
| | size_t state_get_data( uint8_t * dst, size_t size); |
| | size_t state_set_data(const uint8_t * src, size_t size); |
| |
|
| | size_t state_seq_get_size(llama_seq_id seq_id); |
| | size_t state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size); |
| | size_t state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size); |
| |
|
| | bool state_load_file( |
| | const char * filepath, |
| | llama_token * tokens_out, |
| | size_t n_token_capacity, |
| | size_t * n_token_count_out); |
| |
|
| | bool state_save_file( |
| | const char * filepath, |
| | const llama_token * tokens, |
| | size_t n_token_count); |
| |
|
| | size_t state_seq_load_file( |
| | llama_seq_id seq_id, |
| | const char * filepath, |
| | llama_token * tokens_out, |
| | size_t n_token_capacity, |
| | size_t * n_token_count_out); |
| |
|
| | size_t state_seq_save_file( |
| | llama_seq_id seq_id, |
| | const char * filepath, |
| | const llama_token * tokens, |
| | size_t n_token_count); |
| |
|
| | |
| | |
| | |
| |
|
| | llama_perf_context_data perf_get_data() const; |
| | void perf_reset(); |
| |
|
| | |
| | |
| | |
| |
|
| | void opt_init(struct llama_model * model, struct llama_opt_params lopt_params); |
| |
|
| | void opt_epoch( |
| | ggml_opt_dataset_t dataset, |
| | ggml_opt_result_t result_train, |
| | ggml_opt_result_t result_eval, |
| | int64_t idata_split, |
| | ggml_opt_epoch_callback callback_train, |
| | ggml_opt_epoch_callback callback_eval); |
| |
|
| | void opt_epoch_iter( |
| | ggml_opt_dataset_t dataset, |
| | ggml_opt_result_t result, |
| | const std::vector<llama_token> & tokens, |
| | const std::vector<llama_token> & labels_sparse, |
| | llama_batch & batch, |
| | ggml_opt_epoch_callback callback, |
| | bool train, |
| | int64_t idata_in_loop, |
| | int64_t ndata_in_loop, |
| | int64_t t_loop_start); |
| |
|
| | private: |
| | |
| | |
| | |
| |
|
| | |
| | |
| | uint32_t output_reserve(int32_t n_outputs); |
| |
|
| | |
| | |
| | |
| |
|
| | public: |
| | uint32_t graph_max_nodes() const; |
| |
|
| | |
| | llm_graph_result * get_gf_res_reserve() const; |
| |
|
| | |
| | ggml_status graph_compute(ggml_cgraph * gf, bool batched); |
| |
|
| | |
| | ggml_cgraph * graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx); |
| |
|
| | private: |
| | llm_graph_params graph_params( |
| | llm_graph_result * res, |
| | const llama_ubatch & ubatch, |
| | const llama_memory_context_i * mctx, |
| | llm_graph_type gtype) const; |
| |
|
| | llm_graph_cb graph_get_cb() const; |
| |
|
| | |
| | size_t state_write_data(llama_io_write_i & io); |
| | size_t state_read_data (llama_io_read_i & io); |
| |
|
| | size_t state_seq_write_data(llama_io_write_i & io, llama_seq_id seq_id); |
| | size_t state_seq_read_data (llama_io_read_i & io, llama_seq_id seq_id); |
| |
|
| | |
| | |
| | |
| |
|
| | const llama_model & model; |
| |
|
| | llama_cparams cparams; |
| | llama_adapter_cvec cvec; |
| | llama_adapter_loras loras; |
| |
|
| | llama_cross cross; |
| |
|
| | std::unique_ptr<llama_memory_i> memory; |
| |
|
| | |
| | bool memory_force_optimize = false; |
| |
|
| | |
| | size_t logits_size = 0; |
| | float * logits = nullptr; |
| |
|
| | |
| | |
| | size_t embd_size = 0; |
| | float * embd = nullptr; |
| |
|
| | |
| | |
| | std::map<llama_seq_id, std::vector<float>> embd_seq; |
| |
|
| | |
| | std::unique_ptr<llama_batch_allocr> balloc; |
| |
|
| | uint32_t n_outputs = 0; |
| |
|
| | std::vector<int32_t> output_ids; |
| |
|
| | ggml_backend_sched_ptr sched; |
| |
|
| | ggml_backend_t backend_cpu = nullptr; |
| | std::vector<ggml_backend_ptr> backends; |
| |
|
| | |
| | ggml_opt_context_t opt_ctx = nullptr; |
| |
|
| | ggml_threadpool_t threadpool = nullptr; |
| | ggml_threadpool_t threadpool_batch = nullptr; |
| |
|
| | ggml_abort_callback abort_callback = nullptr; |
| | void * abort_callback_data = nullptr; |
| |
|
| | std::vector<std::pair<ggml_backend_t, ggml_backend_set_n_threads_t>> set_n_threads_fns; |
| |
|
| | |
| | std::vector<ggml_backend_t> backend_ptrs; |
| | std::vector<ggml_backend_buffer_type_t> backend_buft; |
| |
|
| | llm_graph_result_ptr gf_res_prev; |
| | llm_graph_result_ptr gf_res_reserve; |
| |
|
| | |
| | ggml_backend_buffer_ptr buf_output; |
| |
|
| | bool has_evaluated_once = false; |
| |
|
| | |
| | mutable int64_t t_start_us = 0; |
| | mutable int64_t t_load_us = 0; |
| | mutable int64_t t_p_eval_us = 0; |
| | mutable int64_t t_eval_us = 0; |
| |
|
| | mutable int64_t t_compute_start_us = 0; |
| | mutable int64_t n_queued_tokens = 0; |
| |
|
| | mutable int32_t n_p_eval = 0; |
| | mutable int32_t n_eval = 0; |
| |
|
| | mutable int32_t n_reused = 0; |
| | }; |
| |
|