File size: 1,701 Bytes
1ce325b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
#ifndef LM_BUILDER_CORPUS_COUNT_H
#define LM_BUILDER_CORPUS_COUNT_H

#include "../lm_exception.hh"
#include "../word_index.hh"
#include "../../util/scoped.hh"

#include <cstddef>
#include <string>
#include <stdint.h>
#include <vector>

// Forward declarations.  This header only refers to these types by reference,
// so their complete definitions are not required here.
namespace util {
class FilePiece;
namespace stream {
class ChainPosition;
} // namespace stream
} // namespace util

namespace lm {
namespace builder {

// Corpus counting pass of the language model builder.  Only the interface is
// declared in this header; every member function is defined elsewhere.
class CorpusCount {
  public:
    // Multiplier used in the memory estimate for this pass:
    //   DedupeMultiplier(order) * block_size + total_chain_size + vocab_hash_size
    // The last term (vocab_hash_size) is unknown.
    static float DedupeMultiplier(std::size_t order);

    // Memory the vocabulary will take, derived from an estimate of the vocabulary size.
    static std::size_t VocabUsage(std::size_t vocab_estimate);

    // token_count is an output parameter.
    // type_count is the vocabulary size: initialize it to an estimate; it is
    // overwritten with the exact value.
    CorpusCount(
        util::FilePiece &from,
        int vocab_write,
        bool dynamic_vocab,
        uint64_t &token_count,
        WordIndex &type_count,
        std::vector<bool> &prune_words,
        const std::string &prune_vocab_filename,
        std::size_t entries_per_block,
        WarningAction disallowed_symbol);

    void Run(const util::stream::ChainPosition &position);

  private:
    // Same as Run, but parameterized on the vocabulary type.
    // NOTE(review): presumably Run dispatches here based on dynamic_vocab_ -- confirm in the .cc.
    template <class Vocab>
    void RunWithVocab(const util::stream::ChainPosition &position, Vocab &vocab);

    // Several members below are references, so the referenced objects must
    // outlive this instance.  Declaration order is kept as-is because it
    // determines member initialization order.
    util::FilePiece &from_;
    int vocab_write_;
    bool dynamic_vocab_;
    uint64_t &token_count_;
    WordIndex &type_count_;
    std::vector<bool> &prune_words_;
    const std::string prune_vocab_filename_;

    // Size of the dedupe buffer and the buffer itself.
    std::size_t dedupe_mem_size_;
    util::scoped_malloc dedupe_mem_;

    WarningAction disallowed_symbol_action_;
};

} // namespace builder
} // namespace lm
#endif // LM_BUILDER_CORPUS_COUNT_H