namespace lm { namespace builder { | |
class Output; | |
struct PipelineConfig { | |
std::size_t order; | |
util::stream::SortConfig sort; | |
InitialProbabilitiesConfig initial_probs; | |
util::stream::ChainConfig read_backoffs; | |
// Estimated vocabulary size. Used for sizing CorpusCount memory and | |
// initial probing hash table sizing, also in CorpusCount. | |
lm::WordIndex vocab_estimate; | |
// Minimum block size to tolerate. | |
std::size_t minimum_block; | |
// Number of blocks to use. This will be overridden to 1 if everything fits. | |
std::size_t block_count; | |
// n-gram count thresholds for pruning. 0 values means no pruning for | |
// corresponding n-gram order | |
std::vector<uint64_t> prune_thresholds; //mjd | |
bool prune_vocab; | |
std::string prune_vocab_file; | |
/* Renumber the vocabulary the way the trie likes it? */ | |
bool renumber_vocabulary; | |
// What to do with discount failures. | |
DiscountConfig discount; | |
// Compute collapsed q values instead of probability and backoff | |
bool output_q; | |
/* Computing the perplexity of LMs with different vocabularies is hard. For | |
* example, the lowest perplexity is attained by a unigram model that | |
* predicts p(<unk>) = 1 and has no other vocabulary. Also, linearly | |
* interpolated models will sum to more than 1 because <unk> is duplicated | |
* (SRI just pretends p(<unk>) = 0 for these purposes, which makes it sum to | |
* 1 but comes with its own problems). This option will make the vocabulary | |
* a particular size by replicating <unk> multiple times for purposes of | |
* computing vocabulary size. It has no effect if the actual vocabulary is | |
* larger. This parameter serves the same purpose as IRSTLM's "dub". | |
*/ | |
uint64_t vocab_size_for_unk; | |
/* What to do the first time <s>, </s>, or <unk> appears in the input. If | |
* this is anything but THROW_UP, then the symbol will always be treated as | |
* whitespace. | |
*/ | |
WarningAction disallowed_symbol_action; | |
const std::string &TempPrefix() const { return sort.temp_prefix; } | |
std::size_t TotalMemory() const { return sort.total_memory; } | |
}; | |
// Takes ownership of text_file and out_arpa. | |
void Pipeline(PipelineConfig &config, int text_file, Output &output); | |
}} // namespaces | |