File size: 2,155 Bytes
1ce325b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
#ifndef LM_COMMON_MODEL_BUFFER_H
#define LM_COMMON_MODEL_BUFFER_H

/* Format with separate files in suffix order.  Each file contains
 * n-grams of the same order.
 */
#include "../word_index.hh"
#include "../../util/file.hh"
#include "../../util/fixed_array.hh"
#include "../../util/string_piece.hh"

#include <cassert>
#include <cstdint>
#include <string>
#include <vector>

namespace util { namespace stream {
class Chains;
class Chain;
}} // namespaces

namespace lm {

namespace ngram { class State; }

// Reads and writes an n-gram model stored as one raw file per order (plus a
// vocabulary file).  Wraps the file descriptors and per-order counts; actual
// n-gram data moves through util::stream chains via Sink/Source.
class ModelBuffer {
  public:
    // Construct for writing.  Must call VocabFile() and fill it with null-delimited vocab words.
    // file_base is the prefix for the on-disk file names; keep_buffer controls
    // whether the files are retained (see Keep()); output_q presumably selects
    // writing quantized values — TODO confirm against the .cc implementation.
    ModelBuffer(StringPiece file_base, bool keep_buffer, bool output_q);

    // Load from file.
    explicit ModelBuffer(StringPiece file_base);

    // Receive n-grams from the chains and write them out.
    // Must call VocabFile and populate before calling this function.
    void Sink(util::stream::Chains &chains, const std::vector<uint64_t> &counts);

    // Read files and write to the given chains.  If fewer chains are provided,
    // only do the lower orders.
    void Source(util::stream::Chains &chains);

    // Stream a single order (zero-based: order_minus_1 == 0 is unigrams) into
    // one chain.
    void Source(std::size_t order_minus_1, util::stream::Chain &chain);

    // The order of the n-gram model that is associated with the model buffer.
    std::size_t Order() const { return counts_.size(); }
    // N-gram counts indexed by order - 1.
    // Requires Sink or load from file; asserts if neither has happened yet.
    const std::vector<uint64_t> &Counts() const {
      assert(!counts_.empty());
      return counts_;
    }

    // Raw descriptor of the vocabulary file (null-delimited words).  The fd
    // remains owned by this object.
    int VocabFile() const { return vocab_file_.get(); }

    // Raw descriptor of the data file for the given order (zero-based).  The
    // fd remains owned by this object.
    int RawFile(std::size_t order_minus_1) const {
      return files_[order_minus_1].get();
    }

    // Whether the on-disk buffer files should be kept around (as set at
    // construction for writing).
    bool Keep() const { return keep_buffer_; }

    // Slowly execute a language model query with binary search.
    // This is used by interpolation to gather tuning probabilities rather than
    // scanning the files.
    float SlowQuery(const ngram::State &context, WordIndex word, ngram::State &out) const;

  private:
    const std::string file_base_;       // Prefix for on-disk file names.
    const bool keep_buffer_;            // Returned by Keep().
    bool output_q_;                     // Set by the writing constructor; see its comment.
    std::vector<uint64_t> counts_;      // Per-order n-gram counts; empty until Sink/load.

    util::scoped_fd vocab_file_;        // Owns the vocabulary fd exposed by VocabFile().
    util::FixedArray<util::scoped_fd> files_;  // Owns one data fd per order.
};

} // namespace lm

#endif // LM_COMMON_MODEL_BUFFER_H