File size: 1,890 Bytes
1ce325b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
#include "print.hh"

#include "ngram_stream.hh"
#include "../../util/file_stream.hh"
#include "../../util/file.hh"
#include "../../util/mmap.hh"
#include "../../util/scoped.hh"

#include <cstring>
#include <sstream>
#include <stdexcept>

namespace lm {

// Loads the vocabulary file referred to by fd and indexes the strings in it.
//
// The file contents are treated as a sequence of NUL-terminated strings laid
// end to end.  map_ receives a pointer to the start of each string, in file
// order, followed by one extra pointer just past the final string.
//
// fd: descriptor of the vocabulary file; its size is obtained with
//     util::SizeOrThrow and its contents are loaded into memory_ with
//     util::MapRead.
//
// Throws std::runtime_error when the data does not end with a NUL byte.  The
// previous strlen-based scan would have read beyond the mapped region in that
// case.
VocabReconstitute::VocabReconstitute(int fd) {
  uint64_t size = util::SizeOrThrow(fd);
  util::MapRead(util::POPULATE_OR_READ, fd, 0, size, memory_);
  const char *const start = static_cast<const char*>(memory_.get());
  const char *const end = start + size;
  const char *i = start;
  while (i != end) {
    // Bounded search: never look past the bytes that were actually mapped.
    const void *const terminator = std::memchr(i, '\0', static_cast<std::size_t>(end - i));
    if (!terminator) {
      throw std::runtime_error("Vocabulary file is not terminated by a NUL byte");
    }
    map_.push_back(i);
    i = static_cast<const char*>(terminator) + 1;
  }
  // Last one for LookupPiece.
  map_.push_back(i);
}

namespace {
template <class Payload> void PrintLead(const VocabReconstitute &vocab, ProxyStream<Payload> &stream, util::FileStream &out) {
  out << stream->Value().prob << '\t' << vocab.Lookup(*stream->begin());
  for (const WordIndex *i = stream->begin() + 1; i != stream->end(); ++i) {
    out << ' ' << vocab.Lookup(*i);
  }
}
} // namespace

// Writes a complete ARPA-style text dump to out_fd_.
//
// positions: one chain position per n-gram order, lowest order first.  Every
//            order except the last is read as NGram<ProbBackoff> entries and
//            printed with a trailing backoff column; the last order is read
//            as NGram<Prob> entries and printed without one.
//
// Words are turned back into strings through a VocabReconstitute built from
// vocab_fd_, and the per-order totals in the header come from counts_.
void PrintARPA::Run(const util::stream::ChainPositions &positions) {
  typedef ProxyStream<NGram<ProbBackoff> > BackoffStream;
  typedef ProxyStream<NGram<Prob> > HighestStream;

  VocabReconstitute vocab(vocab_fd_);
  util::FileStream out(out_fd_);

  // Header section: one "ngram N=count" line per order.
  out << "\\data\\\n";
  for (size_t index = 0; index < positions.size(); ++index) {
    out << "ngram " << (index + 1) << '=' << counts_[index] << '\n';
  }
  out << '\n';

  // Every order below the highest carries a backoff weight after the words.
  for (unsigned order = 1; order < positions.size(); ++order) {
    out << "\\" << order << "-grams:" << '\n';
    for (BackoffStream entry(positions[order - 1], NGram<ProbBackoff>(NULL, order)); entry; ++entry) {
      PrintLead(vocab, entry, out);
      out << '\t' << entry->Value().backoff << '\n';
    }
    out << '\n';
  }

  // The highest order has no backoff column.
  out << "\\" << positions.size() << "-grams:" << '\n';
  for (HighestStream entry(positions.back(), NGram<Prob>(NULL, positions.size())); entry; ++entry) {
    PrintLead(vocab, entry, out);
    out << '\n';
  }
  out << '\n' << "\\end\\\n";
}

} // namespace lm