File size: 1,235 Bytes
1ce325b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
#include "split_worker.hh"
#include "../common/ngram.hh"

namespace lm {
namespace interpolate {

// Constructs a worker that handles n-gram records of the given order.
//
//   order          stored in order_ and later passed to NGram when each
//                  input record is read in Run().
//   backoff_chain  chain that backoff_input_ is hooked onto via operator>>.
//   sort_chain     chain that sort_input_ is hooked onto via operator>>.
SplitWorker::SplitWorker(
    std::size_t order,
    util::stream::Chain &backoff_chain,
    util::stream::Chain &sort_chain)
    : order_(order) {
  // Hook up the backoff stream first, then the sort stream.
  backoff_chain >> backoff_input_;
  sort_chain >> sort_input_;
}

// Splits every n-gram record read from `position` into two outgoing records:
//   * sort_input_    receives the n-gram's word ids immediately followed by
//                    its probability (a float);
//   * backoff_input_ receives the n-gram's backoff weight (a float) alone.
// After the last input record has been handled, both outgoing streams are
// poisoned, sort_input_ first.
void SplitWorker::Run(const util::stream::ChainPosition &position) {
  for (util::stream::Stream input(position); input; ++input) {
    // View the current input record as an n-gram of order_ words carrying a
    // ProbBackoff payload.
    NGram<ProbBackoff> record(input.Get(), order_);
    const ProbBackoff &weights = record.Value();

    // Sort record: the word ids, one after another...
    lm::WordIndex *cursor =
        reinterpret_cast<lm::WordIndex *>(sort_input_.Get());
    for (const lm::WordIndex *word = record.begin(); word != record.end();
         ++word, ++cursor) {
      *cursor = *word;
    }
    // ...then the probability in the slot right after the last word id.
    float *prob_slot = reinterpret_cast<float *>(cursor);
    *prob_slot = weights.prob;
    ++sort_input_;

    // Backoff record: just the backoff weight.
    float *backoff_slot = reinterpret_cast<float *>(backoff_input_.Get());
    *backoff_slot = weights.backoff;
    ++backoff_input_;
  }

  // No more input: mark both outgoing streams as finished.
  sort_input_.Poison();
  backoff_input_.Poison();
}

}
}