File size: 2,155 Bytes
1ce325b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 |
#ifndef LM_COMMON_MODEL_BUFFER_H
#define LM_COMMON_MODEL_BUFFER_H
/* Format with separate files in suffix order. Each file contains
* n-grams of the same order.
*/
#include "../word_index.hh"
#include "../../util/file.hh"
#include "../../util/fixed_array.hh"
#include "../../util/string_piece.hh"
#include <string>
#include <vector>
// Forward declarations: this header only names these types in reference
// parameters (see ModelBuffer::Sink and ModelBuffer::Source), so their full
// definitions are not needed here.
namespace util { namespace stream {
class Chains;
class Chain;
}} // namespaces
namespace lm {
// Forward declaration: State is only used by reference in
// ModelBuffer::SlowQuery, so its definition is not required in this header.
namespace ngram { class State; }
class ModelBuffer {
public:
// Construct for writing. Must call VocabFile() and fill it with null-delimited vocab words.
ModelBuffer(StringPiece file_base, bool keep_buffer, bool output_q);
// Load from file.
explicit ModelBuffer(StringPiece file_base);
// Must call VocabFile and populate before calling this function.
void Sink(util::stream::Chains &chains, const std::vector<uint64_t> &counts);
// Read files and write to the given chains. If fewer chains are provided,
// only do the lower orders.
void Source(util::stream::Chains &chains);
void Source(std::size_t order_minus_1, util::stream::Chain &chain);
// The order of the n-gram model that is associated with the model buffer.
std::size_t Order() const { return counts_.size(); }
// Requires Sink or load from file.
const std::vector<uint64_t> &Counts() const {
assert(!counts_.empty());
return counts_;
}
int VocabFile() const { return vocab_file_.get(); }
int RawFile(std::size_t order_minus_1) const {
return files_[order_minus_1].get();
}
bool Keep() const { return keep_buffer_; }
// Slowly execute a language model query with binary search.
// This is used by interpolation to gather tuning probabilities rather than
// scanning the files.
float SlowQuery(const ngram::State &context, WordIndex word, ngram::State &out) const;
private:
const std::string file_base_;
const bool keep_buffer_;
bool output_q_;
std::vector<uint64_t> counts_;
util::scoped_fd vocab_file_;
util::FixedArray<util::scoped_fd> files_;
};
} // namespace lm
#endif // LM_COMMON_MODEL_BUFFER_H
|