File size: 2,816 Bytes
1ce325b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
#ifndef LM_FILTER_ARPA_IO_H
#define LM_FILTER_ARPA_IO_H
/* Input and output for ARPA format language model files.
 */
#include "../read_arpa.hh"
#include "../../util/exception.hh"
#include "../../util/file_stream.hh"
#include "../../util/string_piece.hh"
#include "../../util/tokenize_piece.hh"

#include <boost/noncopyable.hpp>
#include <boost/scoped_array.hpp>

#include <fstream>
#include <string>
#include <vector>

#include <cstring>
#include <stdint.h>

namespace util { class FilePiece; }

namespace lm {

/* Exception type for malformed ARPA input, derived from util::Exception.
 * Within this header it is thrown by ReadNGrams when an n-gram line yields no
 * tab-delimited token ("blank line") or has no second tab-delimited field
 * ("no tab").  The constructors and destructor are only declared here.
 */
class ARPAInputException : public util::Exception {
  public:
    // message: short description of what was wrong with the input.
    explicit ARPAInputException(const StringPiece &message) throw();
    // message: short description of the problem.
    // line: the input line involved; ReadNGrams passes the line it was parsing.
    explicit ARPAInputException(const StringPiece &message, const StringPiece &line) throw();
    virtual ~ARPAInputException() throw();
};

// Handling for the counts of n-grams at the beginning of ARPA files.
// number holds the per-order n-gram counts: ReadARPA treats number[i] as the
// count for order i + 1.  The returned size is what ReadARPA forwards to
// Output::ReserveForCounts.  Presumably this is the number of bytes the counts
// header occupies -- the definition is not in this header, so confirm there.
size_t SizeNeededForCounts(const std::vector<uint64_t> &number);

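/* Writes an ARPA file.  This has to be seekable so the counts can be written
 * at the end.  Hence, it owns its output (a util::scoped_fd and a
 * util::FileStream member) instead of accepting a separately held
 * std::ostream.  TODO: use the fast one from estimation (possibly already
 * addressed by the use of util::FileStream; confirm before removing this note).
 */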
class ARPAOutput : boost::noncopyable {
  public:
    // name: handed to the out-of-line constructor; presumably the path of the
    // ARPA file to create (confirm against the definition).
    // buffer_size: presumably the output buffer size in bytes (confirm).
    explicit ARPAOutput(const char *name, size_t buffer_size = 65536);

    // ReadARPA calls this once, before any n-grams are added, with the value
    // returned by SizeNeededForCounts.
    void ReserveForCounts(std::streampos reserve);

    // ReadNGrams calls this once per n-gram length, before that length's
    // first AddNGram call.
    void BeginLength(unsigned int length);

    // Writes line followed by a newline and bumps the running n-gram counter.
    void AddNGram(const StringPiece &line) {
      file_ << line << '\n';
      ++fast_counter_;
    }

    // Overload matching the call made by ReadNGrams (n-gram field, full line).
    // Only the full line is written; ngram is accepted for interface
    // compatibility with other Output types and otherwise ignored.
    void AddNGram(const StringPiece &ngram, const StringPiece &line) {
      // Explicitly discard so includers built with -Wunused-parameter stay clean.
      static_cast<void>(ngram);
      AddNGram(line);
    }

    // Overload taking an iterator range plus the full line.  Only the full
    // line is written; the range is ignored.
    template <class Iterator> void AddNGram(const Iterator &begin, const Iterator &end, const StringPiece &line) {
      // Explicitly discard so includers built with -Wunused-parameter stay clean.
      static_cast<void>(begin);
      static_cast<void>(end);
      AddNGram(line);
    }

    // ReadNGrams calls this once per n-gram length, after that length's last
    // AddNGram call.
    void EndLength(unsigned int length);

    // ReadARPA calls this last, after the end of the input has been read.
    void Finish();

  private:
    // NOTE(review): declared before file_, presumably so the descriptor is
    // constructed first and destroyed last relative to the stream -- confirm
    // against the constructor before reordering members.
    util::scoped_fd file_backing_;
    // Stream that AddNGram writes each line to.
    util::FileStream file_;
    // Incremented once per AddNGram call; initialised and reset out of line.
    uint64_t fast_counter_;
    // Presumably the per-length totals collected for the counts header
    // (maintained by the out-of-line members; confirm).
    std::vector<uint64_t> counts_;
};


/* Reads the n-grams of one length from in and hands each one to out.
 *
 * in:     input positioned at the section for this length; ReadNGramHeader is
 *         called on it first.
 * length: n-gram length of the section, passed to ReadNGramHeader and to
 *         out.BeginLength / out.EndLength.
 * number: how many lines to read from the section.
 * out:    receives BeginLength(length), then AddNGram(field, line) for every
 *         line, then EndLength(length).  field is the second tab-delimited
 *         token of the line.
 *
 * Throws ARPAInputException if a line has no tab-delimited token at all
 * ("blank line") or no second one ("no tab").
 */
template <class Output> void ReadNGrams(util::FilePiece &in, unsigned int length, uint64_t number, Output &out) {
  ReadNGramHeader(in, length);
  out.BeginLength(length);

  for (uint64_t remaining = number; remaining != 0; --remaining) {
    StringPiece entry = in.ReadLine();
    util::TokenIter<util::SingleCharacter> field(entry, '\t');

    // The first token must exist before we can step past it.
    if (!field) {
      throw ARPAInputException("blank line", entry);
    }

    // Step to the second token; the line needs at least one tab for it to exist.
    ++field;
    if (!field) {
      throw ARPAInputException("no tab", entry);
    }

    out.AddNGram(*field, entry);
  }

  out.EndLength(length);
}

/* Drives a full pass over an ARPA file, feeding everything to out.
 *
 * Steps, in order: read the per-length counts with ReadARPACounts, tell out
 * how much to reserve for them (SizeNeededForCounts), run ReadNGrams for each
 * length starting at 1, call ReadEnd on the input, and finally out.Finish().
 *
 * in_lm: the ARPA input.
 * out:   any type providing ReserveForCounts, Finish, and the members
 *        ReadNGrams uses.
 */
template <class Output> void ReadARPA(util::FilePiece &in_lm, Output &out) {
  std::vector<uint64_t> counts;
  ReadARPACounts(in_lm, counts);
  out.ReserveForCounts(SizeNeededForCounts(counts));

  // counts[0] belongs to length 1, counts[1] to length 2, and so on.
  unsigned int order = 1;
  for (std::vector<uint64_t>::const_iterator it = counts.begin(); it != counts.end(); ++it, ++order) {
    ReadNGrams(in_lm, order, *it, out);
  }

  ReadEnd(in_lm);
  out.Finish();
}

} // namespace lm

#endif // LM_FILTER_ARPA_IO_H