#ifndef LM_BUILDER_PIPELINE_H #define LM_BUILDER_PIPELINE_H #include "adjust_counts.hh" #include "initial_probabilities.hh" #include "header_info.hh" #include "../lm_exception.hh" #include "../word_index.hh" #include "../../util/stream/config.hh" #include "../../util/file_piece.hh" #include #include namespace lm { namespace builder { class Output; struct PipelineConfig { std::size_t order; util::stream::SortConfig sort; InitialProbabilitiesConfig initial_probs; util::stream::ChainConfig read_backoffs; // Estimated vocabulary size. Used for sizing CorpusCount memory and // initial probing hash table sizing, also in CorpusCount. lm::WordIndex vocab_estimate; // Minimum block size to tolerate. std::size_t minimum_block; // Number of blocks to use. This will be overridden to 1 if everything fits. std::size_t block_count; // n-gram count thresholds for pruning. 0 values means no pruning for // corresponding n-gram order std::vector prune_thresholds; //mjd bool prune_vocab; std::string prune_vocab_file; /* Renumber the vocabulary the way the trie likes it? */ bool renumber_vocabulary; // What to do with discount failures. DiscountConfig discount; // Compute collapsed q values instead of probability and backoff bool output_q; /* Computing the perplexity of LMs with different vocabularies is hard. For * example, the lowest perplexity is attained by a unigram model that * predicts p() = 1 and has no other vocabulary. Also, linearly * interpolated models will sum to more than 1 because is duplicated * (SRI just pretends p() = 0 for these purposes, which makes it sum to * 1 but comes with its own problems). This option will make the vocabulary * a particular size by replicating multiple times for purposes of * computing vocabulary size. It has no effect if the actual vocabulary is * larger. This parameter serves the same purpose as IRSTLM's "dub". */ uint64_t vocab_size_for_unk; /* What to do the first time , , or appears in the input. If * this is anything but THROW_UP, then the symbol will always be treated as * whitespace. */ WarningAction disallowed_symbol_action; const std::string &TempPrefix() const { return sort.temp_prefix; } std::size_t TotalMemory() const { return sort.total_memory; } }; // Takes ownership of text_file and out_arpa. void Pipeline(PipelineConfig &config, int text_file, Output &output); }} // namespaces #endif // LM_BUILDER_PIPELINE_H