|
|
|
|
| #pragma once
|
|
|
| #include "ggml-opt.h"
|
| #include "llama-cpp.h"
|
|
|
| #include <set>
|
| #include <sstream>
|
| #include <string>
|
| #include <string_view>
|
| #include <vector>
|
| #include <map>
|
|
|
// Target Windows 10+ APIs if the build did not already pin a version.
#if defined(_WIN32) && !defined(_WIN32_WINNT)
#define _WIN32_WINNT 0x0A00
#endif

// Platform-native path separator.
#ifdef _WIN32
#define DIRECTORY_SEPARATOR '\\'
#else
#define DIRECTORY_SEPARATOR '/'
#endif

// Print "error: <msg>" to stderr and terminate the process.
// `msg` must be a string literal (it is concatenated at compile time).
#define die(msg)          do { fputs("error: " msg "\n", stderr); exit(1); } while (0)
// Same as die(), but with printf-style formatting; requires at least one vararg.
#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)

// Log the build number/commit and compiler/target (externs declared below) to stderr.
#define print_build_info() do {                                                                     \
    fprintf(stderr, "%s: build = %d (%s)\n",      __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT);      \
    fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET);    \
} while(0)
|
|
|
// RAII wall-clock timer: records a start timestamp on construction and, on
// destruction, adds the elapsed time to the referenced accumulator
// (implementation in the .cpp; `disable` presumably turns the measurement
// into a no-op — TODO confirm against the definition).
struct common_time_meas {
    common_time_meas(int64_t & t_acc, bool disable = false);
    ~common_time_meas();

    const int64_t t_start_us; // start timestamp in microseconds

    int64_t & t_acc; // accumulator that receives the elapsed time
};
|
|
|
// A LoRA adapter requested by the user: where to load it from, how strongly
// to apply it, and the handle once loaded.
struct common_adapter_lora_info {
    std::string path;  // filesystem path of the adapter
    float scale;       // blend strength; NOTE: not default-initialized

    std::string task_name;     // metadata read from the adapter — TODO confirm source
    std::string prompt_prefix; // metadata read from the adapter — TODO confirm source

    struct llama_adapter_lora * ptr; // loaded adapter handle; NOTE: not default-initialized
};
|
|
|
| using llama_tokens = std::vector<llama_token>;
|
|
|
|
|
// Build metadata; defined elsewhere (generated by the build system).
extern int LLAMA_BUILD_NUMBER;
extern const char * LLAMA_COMMIT;
extern const char * LLAMA_COMPILER;
extern const char * LLAMA_BUILD_TARGET;

// Human-readable build id, e.g. "b1234-<commit>".
// `inline` (C++17) gives one shared instance across all translation units;
// the previous `const static` created a separate dynamically-initialized
// copy in every TU that included this header. Initialization still runs at
// program startup and reads the externs above, so avoid using it from other
// globals' constructors (static-initialization-order).
inline const std::string build_info("b" + std::to_string(LLAMA_BUILD_NUMBER) + "-" + LLAMA_COMMIT);
|
|
|
| struct common_control_vector_load_info;
|
|
|
|
|
|
|
|
|
|
|
// Per-threadpool CPU configuration (threads, affinity, scheduling).
struct cpu_params {
    int n_threads = -1;                          // -1: presumably autodetect — see cpu_get_num_math()
    bool cpumask[GGML_MAX_N_THREADS] = {false};  // CPU affinity mask, one flag per hardware thread
    bool mask_valid = false;                     // true once cpumask has been populated
    enum ggml_sched_priority priority = GGML_SCHED_PRIO_NORMAL; // scheduling priority for the threads
    bool strict_cpu = false;                     // enforce the affinity mask strictly — TODO confirm semantics
    uint32_t poll = 50;                          // polling level/interval for worker threads — TODO confirm units
};
|
|
|
| int32_t cpu_get_num_physical_cores();
|
| int32_t cpu_get_num_math();
|
|
|
|
|
|
|
|
|
|
|
// Identifies which example/tool is parsing arguments, so that shared
// argument definitions can be enabled per-example.
enum llama_example {
    LLAMA_EXAMPLE_BATCHED,
    LLAMA_EXAMPLE_DEBUG,
    LLAMA_EXAMPLE_COMMON,
    LLAMA_EXAMPLE_SPECULATIVE,
    LLAMA_EXAMPLE_COMPLETION,
    LLAMA_EXAMPLE_CLI,
    LLAMA_EXAMPLE_EMBEDDING,
    LLAMA_EXAMPLE_PERPLEXITY,
    LLAMA_EXAMPLE_RETRIEVAL,
    LLAMA_EXAMPLE_PASSKEY,
    LLAMA_EXAMPLE_IMATRIX,
    LLAMA_EXAMPLE_BENCH,
    LLAMA_EXAMPLE_SERVER,
    LLAMA_EXAMPLE_CVECTOR_GENERATOR,
    LLAMA_EXAMPLE_EXPORT_LORA,
    LLAMA_EXAMPLE_MTMD,
    LLAMA_EXAMPLE_LOOKUP,
    LLAMA_EXAMPLE_PARALLEL,
    LLAMA_EXAMPLE_TTS,
    LLAMA_EXAMPLE_DIFFUSION,
    LLAMA_EXAMPLE_FINETUNE,
    LLAMA_EXAMPLE_FIT_PARAMS,

    LLAMA_EXAMPLE_COUNT, // number of examples — keep last
};
|
|
|
// Individual samplers that can appear in the sampling chain.
// Values are explicit and must stay stable (they are presumably used for
// serialization / CLI mapping — TODO confirm). Note: value 5 is intentionally
// skipped, presumably a removed sampler.
enum common_sampler_type {
    COMMON_SAMPLER_TYPE_NONE        = 0,
    COMMON_SAMPLER_TYPE_DRY         = 1,
    COMMON_SAMPLER_TYPE_TOP_K       = 2,
    COMMON_SAMPLER_TYPE_TOP_P       = 3,
    COMMON_SAMPLER_TYPE_MIN_P       = 4,

    COMMON_SAMPLER_TYPE_TYPICAL_P   = 6,
    COMMON_SAMPLER_TYPE_TEMPERATURE = 7,
    COMMON_SAMPLER_TYPE_XTC         = 8,
    COMMON_SAMPLER_TYPE_INFILL      = 9,
    COMMON_SAMPLER_TYPE_PENALTIES   = 10,
    COMMON_SAMPLER_TYPE_TOP_N_SIGMA = 11,
    COMMON_SAMPLER_TYPE_ADAPTIVE_P  = 12,
};
|
|
|
|
|
// Dimensionality-reduction method used by the control-vector generator.
enum dimre_method {
    DIMRE_METHOD_PCA,  // principal component analysis
    DIMRE_METHOD_MEAN, // mean of the differences
};
|
|
|
// Whether the CLI runs in chat (conversation) mode.
enum common_conversation_mode {
    COMMON_CONVERSATION_MODE_DISABLED = 0,
    COMMON_CONVERSATION_MODE_ENABLED  = 1,
    COMMON_CONVERSATION_MODE_AUTO     = 2, // decide automatically — TODO confirm on what basis
};
|
|
|
// What kind of match activates a lazy grammar (see common_grammar_trigger).
enum common_grammar_trigger_type {
    COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN,        // a specific token id
    COMMON_GRAMMAR_TRIGGER_TYPE_WORD,         // a literal word/string
    COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,      // a regex pattern
    COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL, // a regex matched against the full output — TODO confirm
};
|
|
|
// One trigger that switches a lazily-applied grammar on.
struct common_grammar_trigger {
    common_grammar_trigger_type type;        // which kind of match `value`/`token` describes
    std::string value;                       // word or pattern text (unused for token triggers — TODO confirm)
    llama_token token = LLAMA_TOKEN_NULL;    // token id, only meaningful for TYPE_TOKEN — TODO confirm
};
|
|
|
// Bitmask flags recording which sampling parameters the user set explicitly
// (stored in common_params_sampling::user_sampling_config).
// The underlying type is uint64_t, so the shifts are done as unsigned 64-bit
// (`1ull`): a plain-int `1 << n` would be undefined behavior once flags grow
// past bit 30, which the 64-bit underlying type clearly anticipates.
enum common_params_sampling_config : uint64_t {
    COMMON_PARAMS_SAMPLING_CONFIG_SAMPLERS        = 1ull << 0,
    COMMON_PARAMS_SAMPLING_CONFIG_TOP_K           = 1ull << 1,
    COMMON_PARAMS_SAMPLING_CONFIG_TOP_P           = 1ull << 2,
    COMMON_PARAMS_SAMPLING_CONFIG_MIN_P           = 1ull << 3,
    COMMON_PARAMS_SAMPLING_CONFIG_XTC_PROBABILITY = 1ull << 4,
    COMMON_PARAMS_SAMPLING_CONFIG_XTC_THRESHOLD   = 1ull << 5,
    COMMON_PARAMS_SAMPLING_CONFIG_TEMP            = 1ull << 6,
    COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_LAST_N  = 1ull << 7,
    COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_REPEAT  = 1ull << 8,
    COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT        = 1ull << 9,
    COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_TAU    = 1ull << 10,
    COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_ETA    = 1ull << 11,
};
|
|
|
// Speculative-decoding strategy (draft model or various n-gram lookups).
enum common_speculative_type {
    COMMON_SPECULATIVE_TYPE_NONE,          // speculative decoding disabled
    COMMON_SPECULATIVE_TYPE_DRAFT,         // separate draft model
    COMMON_SPECULATIVE_TYPE_EAGLE3,
    COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE,
    COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K,
    COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V,
    COMMON_SPECULATIVE_TYPE_NGRAM_MOD,
    COMMON_SPECULATIVE_TYPE_NGRAM_CACHE,
    COMMON_SPECULATIVE_TYPE_COUNT          // number of types — keep last
};
|
|
|
|
|
// Sampling-chain configuration; defaults mirror the CLI defaults.
struct common_params_sampling {
    uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed for sampling

    int32_t n_prev = 64;              // number of previous tokens kept for sampling context — TODO confirm
    int32_t n_probs = 0;              // if > 0, report probabilities for this many top tokens — TODO confirm
    int32_t min_keep = 0;             // minimum candidates kept by truncating samplers — TODO confirm
    int32_t top_k = 40;               // top-k cutoff
    float top_p = 0.95f;              // nucleus (top-p) cutoff
    float min_p = 0.05f;              // min-p cutoff
    float xtc_probability = 0.00f;    // XTC sampler activation probability (0 = off)
    float xtc_threshold = 0.10f;      // XTC sampler threshold
    float typ_p = 1.00f;              // typical-p (1.0 = disabled)
    float temp = 0.80f;               // sampling temperature
    float dynatemp_range = 0.00f;     // dynamic temperature range (0 = off)
    float dynatemp_exponent = 1.00f;  // dynamic temperature exponent
    int32_t penalty_last_n = 64;      // window of recent tokens the penalties consider
    float penalty_repeat = 1.00f;     // repetition penalty (1.0 = disabled)
    float penalty_freq = 0.00f;       // frequency penalty (0 = disabled)
    float penalty_present = 0.00f;    // presence penalty (0 = disabled)
    float dry_multiplier = 0.0f;      // DRY sampler multiplier (0 = off)
    float dry_base = 1.75f;           // DRY sampler base
    int32_t dry_allowed_length = 2;   // DRY: allowed repeated-sequence length
    int32_t dry_penalty_last_n = -1;  // DRY penalty window; -1 presumably means full context — TODO confirm
    float adaptive_target = -1.0f;    // adaptive-p target (-1 presumably disables) — TODO confirm
    float adaptive_decay = 0.90f;     // adaptive-p decay
    int32_t mirostat = 0;             // 0 = off; nonzero selects a mirostat version — TODO confirm
    float top_n_sigma = -1.00f;       // top-n-sigma cutoff (-1 presumably disables) — TODO confirm
    float mirostat_tau = 5.00f;       // mirostat target entropy
    float mirostat_eta = 0.10f;       // mirostat learning rate
    bool ignore_eos = false;          // keep sampling past end-of-generation tokens
    bool no_perf = false;             // disable performance measurement — TODO confirm scope
    bool timing_per_token = false;    // collect per-token timing

    // Bitmask of common_params_sampling_config flags: which of the above were
    // explicitly set by the user (vs. left at their defaults).
    uint64_t user_sampling_config = 0;

    // Sequence breakers for the DRY sampler.
    std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"};

    // Default sampler chain, applied in this order.
    std::vector<enum common_sampler_type> samplers = {
        COMMON_SAMPLER_TYPE_PENALTIES,
        COMMON_SAMPLER_TYPE_DRY,
        COMMON_SAMPLER_TYPE_TOP_N_SIGMA,
        COMMON_SAMPLER_TYPE_TOP_K,
        COMMON_SAMPLER_TYPE_TYPICAL_P,
        COMMON_SAMPLER_TYPE_TOP_P,
        COMMON_SAMPLER_TYPE_MIN_P,
        COMMON_SAMPLER_TYPE_XTC,
        COMMON_SAMPLER_TYPE_TEMPERATURE,
    };

    std::string grammar;                                 // optional grammar text; empty = no grammar
    bool grammar_lazy = false;                           // apply grammar only after a trigger fires
    std::vector<common_grammar_trigger> grammar_triggers;// triggers that activate a lazy grammar
    std::set<llama_token> preserved_tokens;              // tokens exempt from grammar processing — TODO confirm

    std::vector<llama_logit_bias> logit_bias;     // user-supplied logit biases
    std::vector<llama_logit_bias> logit_bias_eog; // biases for end-of-generation tokens

    bool backend_sampling = false; // sample on the backend instead of the CPU — TODO confirm

    // NOTE: only considers logit_bias, not logit_bias_eog.
    bool has_logit_bias() const {
        return !logit_bias.empty();
    }

    // Human-readable summary of the parameters (implemented in the .cpp).
    std::string print() const;
};
|
|
|
// Where a model comes from: a local file, a direct URL, a Hugging Face
// repo/file pair, or a Docker repo — plus a user-facing name.
// Every member default-constructs to an empty string.
struct common_params_model {
    std::string path;        // local filesystem path
    std::string url;         // direct download URL
    std::string hf_repo;     // Hugging Face repository id
    std::string hf_file;     // file within the Hugging Face repository
    std::string docker_repo; // Docker repository reference
    std::string name;        // model name/label
};
|
|
|
| struct common_ngram_mod;
|
|
|
// Parameters for speculative decoding (draft model or n-gram based).
struct common_params_speculative {
    common_speculative_type type = COMMON_SPECULATIVE_TYPE_NONE; // strategy; NONE disables speculation

    int32_t n_max = 16;    // max tokens to draft per step — TODO confirm
    int32_t n_min = 0;     // min tokens to draft per step — TODO confirm
    float p_split = 0.1f;  // speculative split probability — TODO confirm semantics
    float p_min = 0.75f;   // min acceptance probability for drafted tokens — TODO confirm

    // n-gram speculation tuning
    uint16_t ngram_size_n = 12;   // n-gram size — TODO confirm
    uint16_t ngram_size_m = 48;   // companion window size — TODO confirm
    uint16_t ngram_min_hits = 1;  // minimum hits before an n-gram is drafted — TODO confirm

    std::shared_ptr<common_ngram_mod> ngram_mod; // state for the NGRAM_MOD strategy (opaque here)

    std::string lookup_cache_static;  // path to a static lookup cache file
    std::string lookup_cache_dynamic; // path to a dynamic lookup cache file

    // draft-model configuration
    struct common_params_model mparams_dft; // where to load the draft model from

    llama_model * model_dft = nullptr; // already-loaded draft model, if any (non-owning — TODO confirm)

    llama_context_params cparams_dft; // context params for the draft model; NOTE: not value-initialized here

    int32_t n_ctx = 0;          // draft context size
    int32_t n_gpu_layers = -1;  // layers to offload for the draft model (-1 presumably = all/auto — TODO confirm)

    ggml_type cache_type_k = GGML_TYPE_F16; // draft KV cache type for K
    ggml_type cache_type_v = GGML_TYPE_F16; // draft KV cache type for V

    struct cpu_params cpuparams;       // CPU settings for draft generation
    struct cpu_params cpuparams_batch; // CPU settings for draft batch processing

    std::vector<ggml_backend_dev_t> devices; // devices to use for the draft model

    std::vector<std::pair<std::string, std::string>> replacements;          // tensor-name replacements — TODO confirm
    std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;    // buffer-type overrides for draft tensors

    // true if a draft model source (path or HF repo) was configured
    bool has_dft() const {
        return !mparams_dft.path.empty() || !mparams_dft.hf_repo.empty();
    }
};
|
|
|
// Parameters for the TTS vocoder model.
struct common_params_vocoder {
    struct common_params_model model; // where to load the vocoder model from

    std::string speaker_file = ""; // path to a speaker embedding file — TODO confirm format

    bool use_guide_tokens = false; // use guide tokens during generation — TODO confirm semantics
};
|
|
|
// Parameters for diffusion-based generation.
struct common_params_diffusion {
    int32_t steps = 128;      // number of diffusion steps
    bool visual_mode = false; // visualize intermediate output — TODO confirm

    float eps = 0;            // epsilon — TODO confirm semantics
    int32_t block_length = 0; // block length (0 presumably = auto/disabled — TODO confirm)

    int32_t algorithm = 4;    // algorithm id — TODO confirm the mapping
    float alg_temp = 0.0f;    // algorithm temperature

    float cfg_scale = 0;              // classifier-free guidance scale (0 = off — TODO confirm)
    bool add_gumbel_noise = false;    // add Gumbel noise during sampling
};
|
|
|
|
|
|
|
// How reasoning ("thinking") content is extracted/formatted in chat output.
enum common_reasoning_format {
    COMMON_REASONING_FORMAT_NONE,             // leave reasoning inline, untouched
    COMMON_REASONING_FORMAT_AUTO,             // pick a format automatically — TODO confirm basis
    COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY,  // legacy DeepSeek-style extraction
    COMMON_REASONING_FORMAT_DEEPSEEK,         // DeepSeek-style extraction (current default)
};
|
|
|
|
|
// Learning-rate schedule and related finetuning hyperparameters.
struct lr_opt {
    float lr0 = 1e-5;         // initial learning rate
    float lr_min = -1;        // floor for the decayed LR (-1 presumably = unset — TODO confirm)
    float decay_epochs = -1;  // epochs over which LR decays (-1 presumably = unset — TODO confirm)
    float scale_epoch = 0;    // epoch scaling factor — TODO confirm semantics
    float wd = 0;             // weight decay — TODO confirm
    unsigned epochs = 2;      // total training epochs

    unsigned epoch;           // current epoch; NOTE: uninitialized until init()/caller sets it

    float get_lr(float e) const;                          // LR at (fractional) epoch e (in the .cpp)
    float get_lr() const { return get_lr(epoch); }        // LR at the current epoch

    void init(); // validate/derive the schedule fields (in the .cpp)
};
|
|
|
| struct ggml_opt_optimizer_params common_opt_lr_pars(void * userdata);
|
|
|
// Aggregated runtime parameters shared by all examples/tools. Grouped by
// feature area; defaults reflect the CLI defaults. Semantics noted here are
// inferred from field names where the code alone does not show them.
struct common_params {
    // --- generation & context
    int32_t n_predict = -1;         // tokens to generate (-1 presumably = until EOG — TODO confirm)
    int32_t n_ctx = 0;              // context size (0 presumably = from model — TODO confirm)
    int32_t n_batch = 2048;         // logical batch size
    int32_t n_ubatch = 512;         // physical (micro) batch size
    int32_t n_keep = 0;             // tokens kept from the initial prompt on context overflow
    int32_t n_chunks = -1;          // max chunks to process (-1 = unlimited — TODO confirm)
    int32_t n_parallel = 1;         // parallel sequences
    int32_t n_sequences = 1;        // sequences to decode
    int32_t grp_attn_n = 1;         // group-attention factor
    int32_t grp_attn_w = 512;       // group-attention width
    int32_t n_print = -1;           // print token count every n tokens (-1 = off — TODO confirm)
    float rope_freq_base = 0.0f;    // RoPE base frequency (0 = from model)
    float rope_freq_scale = 0.0f;   // RoPE frequency scale (0 = from model)
    float yarn_ext_factor = -1.0f;  // YaRN extrapolation mix (-1 presumably = from model — TODO confirm)
    float yarn_attn_factor = -1.0f; // YaRN attention magnitude scale
    float yarn_beta_fast = -1.0f;   // YaRN low-correction dim
    float yarn_beta_slow = -1.0f;   // YaRN high-correction dim
    int32_t yarn_orig_ctx = 0;      // YaRN original context size

    // --- devices & offload
    std::vector<ggml_backend_dev_t> devices; // backend devices to use (empty = all — TODO confirm)

    int32_t n_gpu_layers = -1;         // layers to offload to GPU (-1 presumably = auto/all — TODO confirm)
    int32_t main_gpu = 0;              // primary GPU index
    float tensor_split[128] = {0};     // per-device tensor split proportions
    bool fit_params = true;            // auto-fit parameters to available memory — TODO confirm
    int32_t fit_params_min_ctx = 4096; // minimum context when auto-fitting

    // per-device memory target for auto-fitting (bytes; default 1 GiB each)
    std::vector<size_t> fit_params_target = std::vector<size_t>(llama_max_devices(), 1024 * 1024*1024);

    enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs

    struct cpu_params cpuparams;       // CPU settings for generation
    struct cpu_params cpuparams_batch; // CPU settings for batch processing

    ggml_backend_sched_eval_callback cb_eval = nullptr; // optional eval callback
    void * cb_eval_user_data = nullptr;                 // user data for cb_eval

    ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED; // NUMA strategy

    enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
    enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED;       // pooling for embeddings
    enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention for embeddings
    enum llama_flash_attn_type flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO;     // FlashAttention usage

    // --- sub-parameter bundles
    struct common_params_sampling sampling;
    struct common_params_speculative speculative;
    struct common_params_vocoder vocoder;
    struct common_params_diffusion diffusion;

    struct common_params_model model; // main model source

    std::set<std::string> model_alias; // aliases for the model
    std::set<std::string> model_tags;  // tags for the model
    std::string hf_token = "";         // Hugging Face access token
    std::string prompt = "";
    std::string system_prompt = "";
    std::string prompt_file = "";        // file to read the prompt from
    std::string path_prompt_cache = "";  // prompt-cache save/load path
    std::string input_prefix = "";       // prepended to user input
    std::string input_suffix = "";       // appended to user input
    std::string logits_file = "";        // file to dump logits to

    // --- logits dumping
    std::string logits_output_dir = "data";
    bool save_logits = false;
    std::vector<std::string> tensor_filter; // tensor-name filter — TODO confirm matching rules

    std::vector<std::string> in_files;   // extra input files
    std::vector<std::string> antiprompt; // strings that halt generation — TODO confirm
    std::vector<llama_model_kv_override> kv_overrides;
    std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;

    bool lora_init_without_apply = false;               // load LoRAs but don't apply at init
    std::vector<common_adapter_lora_info> lora_adapters;

    std::vector<common_control_vector_load_info> control_vectors;

    int32_t verbosity = 3;                   // log verbosity level
    int32_t control_vector_layer_start = -1; // layer range for control vectors
    int32_t control_vector_layer_end = -1;
    bool offline = false;                    // disallow network access (downloads) — TODO confirm

    // --- perplexity / evaluation
    int32_t ppl_stride = 0;      // stride for perplexity calculation
    int32_t ppl_output_type = 0; // perplexity output format

    bool hellaswag = false;          // run HellaSwag scoring
    size_t hellaswag_tasks = 400;

    bool winogrande = false;         // run Winogrande scoring
    size_t winogrande_tasks = 0;

    bool multiple_choice = false;    // run multiple-choice scoring
    size_t multiple_choice_tasks = 0;

    bool kl_divergence = false;      // compute KL divergence against saved logits

    // --- CLI behavior
    bool usage = false;            // print usage and exit
    bool completion = false;       // shell-completion output — TODO confirm
    bool use_color = false;        // colorize console output
    bool special = false;          // render special tokens in output
    bool interactive = false;      // interactive mode
    bool interactive_first = false;// wait for user input before generating
    bool prompt_cache_all = false; // cache user input and generations too
    bool prompt_cache_ro = false;  // open the prompt cache read-only

    bool escape = true;            // process escape sequences in the prompt
    bool multiline_input = false;
    bool simple_io = false;        // plain I/O (no console niceties)
    bool cont_batching = true;     // continuous batching
    bool no_perf = false;          // disable perf measurement
    bool show_timings = true;      // print timings at exit
    bool ctx_shift = false;        // context shifting on overflow
    bool swa_full = false;         // full-size SWA cache — TODO confirm
    bool kv_unified = false;       // unified KV cache across sequences — TODO confirm

    bool input_prefix_bos = false; // prefix BOS before user input
    bool use_mmap = true;          // mmap the model file
    bool use_direct_io = false;    // O_DIRECT-style reads — TODO confirm
    bool use_mlock = false;        // lock model memory
    bool verbose_prompt = false;   // print the prompt tokens before generation
    bool display_prompt = true;    // echo the prompt
    bool no_kv_offload = false;    // keep the KV cache on the CPU
    bool warmup = true;            // run a warmup decode
    bool check_tensors = false;    // validate tensor data on load
    bool no_op_offload = false;    // disable op offload — TODO confirm
    bool no_extra_bufts = false;   // disable extra buffer types — TODO confirm
    bool no_host = false;          // disable host buffers — TODO confirm

    bool single_turn = false;      // exit after one chat turn

    ggml_type cache_type_k = GGML_TYPE_F16; // KV cache type for K
    ggml_type cache_type_v = GGML_TYPE_F16; // KV cache type for V

    common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;

    // --- multimodal
    struct common_params_model mmproj; // multimodal projector source
    bool mmproj_use_gpu = true;
    bool no_mmproj = false;
    std::vector<std::string> image;  // image file(s) to process
    int image_min_tokens = -1;
    int image_max_tokens = -1;

    // --- finetuning
    struct lr_opt lr;
    enum ggml_opt_optimizer_type optimizer = GGML_OPT_OPTIMIZER_TYPE_ADAMW;
    float val_split = 0.05f; // fraction of data used for validation

    // --- embeddings
    bool embedding = false;       // run in embedding mode
    int32_t embd_normalize = 2;   // normalization mode — TODO confirm the mapping
    std::string embd_out = "";    // embedding output format
    std::string embd_sep = "\n";  // separator between embeddings
    std::string cls_sep = "\t";   // separator for classification output

    // --- server
    int32_t port = 8080;
    int32_t timeout_read = 600;               // read timeout, seconds
    int32_t timeout_write = timeout_read;     // write timeout, seconds
    int32_t n_threads_http = -1;              // HTTP threads (-1 presumably = auto — TODO confirm)
    int32_t n_cache_reuse = 0;                // min chunk size for cache reuse — TODO confirm
    bool cache_prompt = true;                 // reuse cached prompt prefixes
    int32_t n_ctx_checkpoints = 8;            // context checkpoints per slot — TODO confirm
    int32_t cache_ram_mib = 8192;             // RAM budget for the server cache, MiB

    std::string hostname = "127.0.0.1";
    std::string public_path = "";             // static files path
    std::string api_prefix = "";              // prefix for API endpoints
    std::string chat_template = "";           // custom chat template (empty = model default)
    bool use_jinja = true;                    // use the Jinja chat-template engine
    bool enable_chat_template = true;
    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
    int reasoning_budget = -1;                // reasoning token budget (-1 presumably = unlimited — TODO confirm)
    bool prefill_assistant = true;            // prefill a partial assistant message — TODO confirm
    int sleep_idle_seconds = -1;              // idle sleep (-1 = disabled — TODO confirm)

    std::vector<std::string> api_keys;        // accepted API keys

    std::string ssl_file_key = "";            // TLS private key file
    std::string ssl_file_cert = "";           // TLS certificate file

    std::map<std::string, std::string> default_template_kwargs; // extra chat-template variables

    // "advanced" web UI
    bool webui = true;
    std::string webui_config_json;            // JSON config passed to the web UI — TODO confirm schema

    // server endpoint toggles
    bool endpoint_slots = true;
    bool endpoint_props = false;
    bool endpoint_metrics = false;

    // multi-model serving
    std::string models_dir = "";
    std::string models_preset = "";
    int models_max = 4;                       // max models loaded at once
    bool models_autoload = true;

    bool log_json = false;                    // JSON-formatted logs

    std::string slot_save_path;               // where slot states are saved
    std::string media_path;                   // where media files are stored — TODO confirm

    float slot_prompt_similarity = 0.1f;      // min similarity to reuse a slot's cache — TODO confirm

    // --- batched-bench
    bool is_pp_shared = false;                // shared prompt across sequences
    bool is_tg_separate = false;              // separate text generation phase — TODO confirm

    std::vector<int32_t> n_pp;                // prompt-processing sizes to benchmark
    std::vector<int32_t> n_tg;                // text-generation sizes to benchmark
    std::vector<int32_t> n_pl;                // parallel levels to benchmark

    // --- retrieval
    std::vector<std::string> context_files;   // files to chunk for retrieval

    int32_t chunk_size = 64;                  // chunk size in tokens — TODO confirm units

    std::string chunk_separator = "\n";       // separator inserted between chunks

    // --- passkey
    int32_t n_junk = 250;                     // junk lines around the passkey
    int32_t i_pos = -1;                       // passkey position (-1 presumably = random — TODO confirm)

    // --- imatrix
    int32_t n_out_freq = 10;                  // output frequency, in chunks — TODO confirm
    int32_t n_save_freq = 0;                  // save frequency (0 = off — TODO confirm)
    int32_t i_chunk = 0;                      // starting chunk
    int8_t imat_dat = 0;                      // legacy imatrix.dat format flag — TODO confirm

    bool process_output = false;              // include output tensors in the imatrix
    bool compute_ppl = true;                  // also compute perplexity
    bool show_statistics = false;
    bool parse_special = false;               // parse special tokens in the input

    // --- cvector-generator
    int n_pca_batch = 100;
    int n_pca_iterations = 1000;
    dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
    std::string cvector_positive_file = "tools/cvector-generator/positive.txt";
    std::string cvector_negative_file = "tools/cvector-generator/negative.txt";

    bool spm_infill = false; // suffix-prefix-middle infill ordering — TODO confirm

    // --- batched-bench output
    bool batched_bench_output_jsonl = false;

    // generic output file used by several tools
    std::string out_file;

    // optional model-load progress callback
    llama_progress_callback load_progress_callback = NULL;
    void * load_progress_callback_user_data = NULL;
};
|
|
|
|
|
|
|
| void common_init();
|
|
|
| std::string common_params_get_system_info(const common_params & params);
|
|
|
| bool parse_cpu_range(const std::string & range, bool(&boolmask)[GGML_MAX_N_THREADS]);
|
| bool parse_cpu_mask(const std::string & mask, bool(&boolmask)[GGML_MAX_N_THREADS]);
|
| void postprocess_cpu_params(cpu_params & cpuparams, const cpu_params * role_model = nullptr);
|
| bool set_process_priority(enum ggml_sched_priority prio);
|
|
|
|
|
|
|
|
|
|
|
// Enable compile-time printf-format checking on GCC/Clang.
// On MinGW GCC (but not clang) use gnu_printf so format strings are checked
// against the GNU rules rather than MSVCRT's; elsewhere it expands to nothing.
#ifdef __GNUC__
# if defined(__MINGW32__) && !defined(__clang__)
#  define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
# else
#  define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
# endif
#else
# define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
#endif
|
|
|
| LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
|
| std::string string_format(const char * fmt, ...);
|
|
|
| std::string string_strip(const std::string & str);
|
| std::string string_get_sortable_timestamp();
|
|
|
| std::string string_join(const std::vector<std::string> & values, const std::string & separator);
|
| std::vector<std::string> string_split(const std::string & str, const std::string & delimiter);
|
| std::string string_repeat(const std::string & str, size_t n);
|
|
|
| void string_replace_all(std::string & s, const std::string & search, const std::string & replace);
|
|
|
| std::string regex_escape(const std::string & s);
|
|
|
// Split `str` on `delim` and parse each piece into T via operator>>.
// An empty input yields an empty vector; unparsable pieces yield
// value-initialized-then-extracted Ts (stream extraction semantics).
template<class T>
static std::vector<T> string_split(const std::string & str, char delim) {
    static_assert(!std::is_same<T, std::string>::value, "Please use the specialized version for std::string");
    std::vector<T> values;
    std::istringstream stream(str);
    for (std::string piece; std::getline(stream, piece, delim); ) {
        std::istringstream piece_stream(piece);
        T parsed;
        piece_stream >> parsed;
        values.push_back(parsed);
    }
    return values;
}

// Specialization for std::string: plain substring split, no stream parsing.
// Always returns at least one element (an empty input yields {""}).
template<>
inline std::vector<std::string> string_split<std::string>(const std::string & str, char delim)
{
    std::vector<std::string> parts;
    size_t start = 0;
    for (size_t pos = str.find(delim); pos != std::string::npos; pos = str.find(delim, start)) {
        parts.push_back(str.substr(start, pos - start));
        start = pos + 1;
    }
    parts.push_back(str.substr(start));
    return parts;
}
|
|
|
|
|
// True if `str` begins with `prefix` (an empty prefix always matches).
inline bool string_starts_with(std::string_view str, std::string_view prefix) {
    if (prefix.size() > str.size()) {
        return false;
    }
    return str.substr(0, prefix.size()) == prefix;
}
|
|
|
|
|
// True if `str` ends with `suffix` (an empty suffix always matches).
inline bool string_ends_with(std::string_view str, std::string_view suffix) {
    if (suffix.size() > str.size()) {
        return false;
    }
    return str.substr(str.size() - suffix.size()) == suffix;
}

// If `str` ends with `suffix`, chop the suffix off in place and return true;
// otherwise leave `str` untouched and return false.
inline bool string_remove_suffix(std::string & str, std::string_view suffix) {
    const bool matched = string_ends_with(str, suffix);
    if (matched) {
        str.resize(str.size() - suffix.size());
    }
    return matched;
}

// Find the longest proper prefix of `stop` that `str` ends with, and return
// the index in `str` where that partial match begins; npos if there is none.
// Used to hold back output that might be the start of a stop string.
inline size_t string_find_partial_stop(std::string_view str, std::string_view stop) {
    if (str.empty() || stop.empty()) {
        return std::string::npos;
    }
    const size_t longest = std::min(str.size(), stop.size());
    const char tail = str.back();
    for (size_t len = longest; len > 0; --len) {
        // cheap reject on the final character before the full suffix compare
        if (stop[len - 1] != tail) {
            continue;
        }
        if (string_ends_with(str, stop.substr(0, len))) {
            return str.size() - len;
        }
    }
    return std::string::npos;
}
|
|
|
| bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
|
| void string_process_escapes(std::string & input);
|
|
|
| std::string string_from(bool value);
|
| std::string string_from(const std::vector<int> & values);
|
| std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens);
|
| std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch);
|
|
|
|
|
|
|
|
|
|
|
| bool fs_validate_filename(const std::string & filename, bool allow_subdirs = false);
|
| bool fs_create_directory_with_parents(const std::string & path);
|
| bool fs_is_directory(const std::string & path);
|
|
|
| std::string fs_get_cache_directory();
|
| std::string fs_get_cache_file(const std::string & filename);
|
|
|
// One directory entry returned by fs_list().
struct common_file_info {
    std::string path;    // full path of the entry
    std::string name;    // base name of the entry
    size_t size = 0;     // file size in bytes (presumably 0 for directories — TODO confirm)
    bool is_dir = false; // true if the entry is a directory
};
|
| std::vector<common_file_info> fs_list(const std::string & path, bool include_directories);
|
|
|
|
|
|
|
|
|
|
|
|
|
| bool tty_can_use_colors();
|
|
|
|
|
|
|
|
|
|
|
| struct common_sampler;
|
|
|
|
|
// Owns everything created from common_params — model, context, per-sequence
// samplers and LoRA adapters — behind a pimpl (implementation in the .cpp).
struct common_init_result {
    common_init_result(common_params & params);
    ~common_init_result();

    llama_model * model();     // presumably non-owning accessor — TODO confirm against the .cpp
    llama_context * context(); // presumably non-owning accessor — TODO confirm against the .cpp

    common_sampler * sampler(llama_seq_id seq_id); // sampler for the given sequence
    void reset_samplers();                         // reset all per-sequence samplers

    std::vector<llama_adapter_lora_ptr> & lora();  // loaded LoRA adapters

private:
    struct impl;
    std::unique_ptr<impl> pimpl; // hides the actual state from this header
};
|
|
|
| using common_init_result_ptr = std::unique_ptr<common_init_result>;
|
|
|
| common_init_result_ptr common_init_from_params(common_params & params);
|
|
|
| struct llama_model_params common_model_params_to_llama ( common_params & params);
|
| struct llama_context_params common_context_params_to_llama(const common_params & params);
|
| struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);
|
|
|
|
|
| void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);
|
|
|
| std::string get_model_endpoint();
|
|
|
|
|
|
|
|
|
|
|
| void common_batch_clear(struct llama_batch & batch);
|
|
|
| void common_batch_add(
|
| struct llama_batch & batch,
|
| llama_token id,
|
| llama_pos pos,
|
| const std::vector<llama_seq_id> & seq_ids,
|
| bool logits);
|
|
|
|
|
|
|
|
|
|
|
|
|
| bool common_prompt_batch_decode(
|
| struct llama_context * ctx,
|
| const std::vector<llama_token> & embd,
|
| int & n_past,
|
| int n_batch,
|
| std::string_view state_path,
|
| bool save_state);
|
|
|
|
|
|
|
| bool common_replay_last_token(struct llama_context * ctx, llama_token last_token, int32_t pos);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| std::vector<llama_token> common_tokenize(
|
| const struct llama_context * ctx,
|
| const std::string & text,
|
| bool add_special,
|
| bool parse_special = false);
|
|
|
| std::vector<llama_token> common_tokenize(
|
| const struct llama_vocab * vocab,
|
| const std::string & text,
|
| bool add_special,
|
| bool parse_special = false);
|
|
|
|
|
|
|
| std::string common_token_to_piece(
|
| const struct llama_context * ctx,
|
| llama_token token,
|
| bool special = true);
|
|
|
| std::string common_token_to_piece(
|
| const struct llama_vocab * vocab,
|
| llama_token token,
|
| bool special = true);
|
|
|
|
|
|
|
|
|
| std::string common_detokenize(
|
| const struct llama_context * ctx,
|
| const std::vector<llama_token> & tokens,
|
| bool special = true);
|
|
|
| std::string common_detokenize(
|
| const struct llama_vocab * vocab,
|
| const std::vector<llama_token> & tokens,
|
| bool special = true);
|
|
|
|
|
|
|
|
|
|
|
|
|
| void common_embd_normalize(const float * inp, float * out, int n, int embd_norm);
|
|
|
| float common_embd_similarity_cos(const float * embd1, const float * embd2, int n);
|
|
|
|
|
|
|
|
|
|
|
// A loaded control vector: per-layer direction data laid out contiguously.
struct common_control_vector_data {
    int n_embd; // embedding dimension; NOTE: not default-initialized

    // flat buffer; presumably n_embd floats per layer, layers concatenated — TODO confirm layout
    std::vector<float> data;
};
|
|
|
// A control vector to load: file plus the strength to apply it with.
struct common_control_vector_load_info {
    float strength; // application strength; NOTE: not default-initialized

    std::string fname; // path of the control vector file
};
|
|
|
|
|
|
|
| common_control_vector_data common_control_vector_load(const std::vector<common_control_vector_load_info> & load_infos);
|
|
|
|
|
|
|
|
|
|
|
// GGUF split-file metadata keys (shared by split/merge tooling).
// These were previously in an unnamed namespace, which gives every
// translation unit that includes this header its own internal copy (an
// anti-pattern in headers); C++17 `inline constexpr` variables provide a
// single shared definition with the same usage at call sites.
inline constexpr const char * LLM_KV_SPLIT_NO            = "split.no";
inline constexpr const char * LLM_KV_SPLIT_COUNT         = "split.count";
inline constexpr const char * LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
|
|
|
|
|
|
|
|
|
|
|
// Regex fragment matching expert FFN tensor names (ffn_up/down/gate, with or
// without the "ch" infix). `const char * const` at namespace scope has
// internal linkage (one copy per TU); C++17 `inline constexpr` gives a single
// shared definition with identical usage.
inline constexpr const char * LLM_FFN_EXPS_REGEX = "\\.ffn_(up|down|gate)_(ch|)exps";
|
|
|
// Build the expert-FFN tensor regex restricted to layer (block) `idx`,
// e.g. "blk\.3\.ffn_(up|down|gate)_(ch|)exps" for idx == 3.
inline std::string llm_ffn_exps_block_regex(int idx) {
    return string_format("blk\\.%d%s", idx, LLM_FFN_EXPS_REGEX);
}
|
|
|
// Buffer-type override that keeps all expert FFN tensors in CPU memory
// (pattern applies to every layer, since it is not anchored to a block).
inline llama_model_tensor_buft_override llm_ffn_exps_cpu_override() {
    return { LLM_FFN_EXPS_REGEX, ggml_backend_cpu_buffer_type() };
}
|
|
|
|
|
|
|
|
|
|
|
| ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector<llama_token> & tokens, int64_t stride);
|
|
|
|
|
| enum ggml_opt_optimizer_type common_opt_get_optimizer(const char *);
|
|
|