#ifndef LM_FILTER_ARPA_IO_H #define LM_FILTER_ARPA_IO_H /* Input and output for ARPA format language model files. */ #include "../read_arpa.hh" #include "../../util/exception.hh" #include "../../util/file_stream.hh" #include "../../util/string_piece.hh" #include "../../util/tokenize_piece.hh" #include #include #include #include #include #include #include namespace util { class FilePiece; } namespace lm { class ARPAInputException : public util::Exception { public: explicit ARPAInputException(const StringPiece &message) throw(); explicit ARPAInputException(const StringPiece &message, const StringPiece &line) throw(); virtual ~ARPAInputException() throw(); }; // Handling for the counts of n-grams at the beginning of ARPA files. size_t SizeNeededForCounts(const std::vector &number); /* Writes an ARPA file. This has to be seekable so the counts can be written * at the end. Hence, I just have it own a std::fstream instead of accepting * a separately held std::ostream. TODO: use the fast one from estimation. */ class ARPAOutput : boost::noncopyable { public: explicit ARPAOutput(const char *name, size_t buffer_size = 65536); void ReserveForCounts(std::streampos reserve); void BeginLength(unsigned int length); void AddNGram(const StringPiece &line) { file_ << line << '\n'; ++fast_counter_; } void AddNGram(const StringPiece &ngram, const StringPiece &line) { AddNGram(line); } template void AddNGram(const Iterator &begin, const Iterator &end, const StringPiece &line) { AddNGram(line); } void EndLength(unsigned int length); void Finish(); private: util::scoped_fd file_backing_; util::FileStream file_; uint64_t fast_counter_; std::vector counts_; }; template void ReadNGrams(util::FilePiece &in, unsigned int length, uint64_t number, Output &out) { ReadNGramHeader(in, length); out.BeginLength(length); for (uint64_t i = 0; i < number; ++i) { StringPiece line = in.ReadLine(); util::TokenIter tabber(line, '\t'); if (!tabber) throw ARPAInputException("blank line", line); if (!++tabber) throw ARPAInputException("no tab", line); out.AddNGram(*tabber, line); } out.EndLength(length); } template void ReadARPA(util::FilePiece &in_lm, Output &out) { std::vector number; ReadARPACounts(in_lm, number); out.ReserveForCounts(SizeNeededForCounts(number)); for (unsigned int i = 0; i < number.size(); ++i) { ReadNGrams(in_lm, i + 1, number[i], out); } ReadEnd(in_lm); out.Finish(); } } // namespace lm #endif // LM_FILTER_ARPA_IO_H