Spaces:

aelitta
/

BioMistral_gradio

Running

App Files Files Community

BioMistral_gradio / llama-cpp-python /vendor /llama.cpp /common /ngram-cache.h

aelitta

Upload folder using huggingface_hub

4bdb245 verified 3 months ago

raw

history blame contribute delete

3.73 kB

	#pragma once

	#include "llama.h"

	#include <unordered_map>
	#include <string>
	#include <vector>

	// Size constants for the n-gram cache.
	// NOTE(review): LLAMA_NGRAM_MIN is not referenced in this header; presumably the smallest
	// n-gram size callers are expected to use - confirm against the implementation.
	#define LLAMA_NGRAM_MIN 1
	// Capacity of llama_ngram::tokens: an n-gram holds at most this many tokens.
	#define LLAMA_NGRAM_MAX 4
	// NOTE(review): not referenced in this header; presumably the n-gram size used for the
	// static cache (nc_static) - confirm against the implementation.
	#define LLAMA_NGRAM_STATIC 2

	// Data structures to map n-grams to empirical token probabilities:

	struct llama_ngram {
	llama_token tokens[LLAMA_NGRAM_MAX];

	llama_ngram() {
	for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
	tokens[i] = -1;
	}
	}

	llama_ngram(const llama_token * input, const int ngram_size) {
	for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
	tokens[i] = i < ngram_size ? input[i] : -1;
	}
	}

	bool operator==(const llama_ngram & other) const {
	for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
	if (tokens[i] != other.tokens[i]) {
	return false;
	}
	}
	return true;
	}
	};

	// Hash functor that lets llama_ngram be used as an std::unordered_map key.
	//
	// All LLAMA_NGRAM_MAX slots (including -1 padding) are mixed in order, so the
	// result depends on token positions. A plain XOR of the per-token hashes is
	// order-insensitive and lets equal tokens cancel each other out: e.g. every
	// n-gram {t, t, -1, -1} would hash to 0 regardless of t, and any permutation
	// of the same tokens would collide, degrading lookups in the cache.
	//
	// Equal n-grams (as defined by llama_ngram::operator==) still produce equal hashes.
	struct llama_ngram_hash_function {
	    size_t operator()(const llama_ngram & ngram) const {
	        size_t hash = 0;
	        for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
	            // Order-sensitive combine step (same scheme as boost::hash_combine):
	            // the running value is shifted into the mix so position matters.
	            hash ^= std::hash<llama_token>{}(ngram.tokens[i]) + 0x9e3779b9 + (hash << 6) + (hash >> 2);
	        }
	        return hash;
	    }
	};

	// Occurrence counts per token: token -> how many times that token has been seen.
	using llama_ngram_cache_part = std::unordered_map<llama_token, int32_t>;

	// The cache itself: n-gram -> empirical distribution (per-token counts) of the tokens following it.
	using llama_ngram_cache = std::unordered_map<llama_ngram, llama_ngram_cache_part, llama_ngram_hash_function>;


	// Update an ngram cache with tokens.
	// ngram_cache: the cache to modify (updated in place; nothing is returned).
	// ngram_min/ngram_max: the min/max size of the ngrams to extract from inp_data.
	//     NOTE(review): llama_ngram stores at most LLAMA_NGRAM_MAX tokens, so ngram_max
	//     presumably must not exceed it - confirm against the implementation.
	// inp_data: the token sequence with which to update ngram_cache.
	// nnew: how many new tokens have been appended to inp_data since the last call to this function.
	// print_progress: whether to print progress to stderr.
	//
	// In order to get correct results inp_data can ONLY BE APPENDED TO.
	// Changes in the middle need a complete rebuild.
	void llama_ngram_cache_update(
	llama_ngram_cache & ngram_cache, int ngram_min, int ngram_max, std::vector<llama_token> & inp_data, int nnew, bool print_progress);

	// Try to draft tokens from ngram caches.
	// inp: the tokens generated so far.
	// draft: the token sequence to draft. Expected to initially contain the previously sampled token;
	//     drafted tokens are added to it (nothing is returned).
	// n_draft: maximum number of tokens to add to draft.
	// ngram_min/ngram_max: the min/max size of the ngrams in nc_context and nc_dynamic.
	// nc_context: ngram cache based on current context.
	// nc_dynamic: ngram cache based on previous user generations.
	// nc_static: ngram cache generated from a large text corpus, used for validation.
	void llama_ngram_cache_draft(
	std::vector<llama_token> & inp, std::vector<llama_token> & draft, int n_draft, int ngram_min, int ngram_max,
	llama_ngram_cache & nc_context, llama_ngram_cache & nc_dynamic, llama_ngram_cache & nc_static);

	// Save an ngram cache to a file.
	// ngram_cache: the ngram cache to save.
	// filename: the path under which to save the ngram cache.
	//     Taken by non-const reference, so callers must pass an lvalue std::string
	//     (a temporary or a string literal will not bind).
	void llama_ngram_cache_save(llama_ngram_cache & ngram_cache, std::string & filename);

	// Load an ngram cache saved with llama_ngram_cache_save.
	// filename: the path from which to load the ngram cache.
	//     Taken by non-const reference, so callers must pass an lvalue std::string
	//     (a temporary or a string literal will not bind).
	// returns: an ngram cache (by value) containing the information saved to filename.
	// NOTE(review): behavior when the file is missing or malformed is not visible from
	// this declaration - check the implementation before relying on it.
	llama_ngram_cache llama_ngram_cache_load(std::string & filename);

	// Merge two ngram caches.
	// ngram_cache_target: the ngram cache to which to add the information from ngram_cache_add
	//     (modified in place; nothing is returned).
	// ngram_cache_add: the ngram cache to add to ngram_cache_target.
	//     NOTE(review): passed by non-const reference; presumably only read - confirm
	//     against the implementation.
	void llama_ngram_cache_merge(llama_ngram_cache & ngram_cache_target, llama_ngram_cache & ngram_cache_add);