#ifndef LM_FILTER_FORMAT_H #define LM_FILTER_FORMAT_H #include "arpa_io.hh" #include "count_io.hh" #include #include #include namespace lm { template class MultipleOutput { private: typedef boost::ptr_vector Singles; typedef typename Singles::iterator SinglesIterator; public: MultipleOutput(const char *prefix, size_t number) { files_.reserve(number); std::string tmp; for (unsigned int i = 0; i < number; ++i) { tmp = prefix; tmp += boost::lexical_cast(i); files_.push_back(new Single(tmp.c_str())); } } void AddNGram(const StringPiece &line) { for (SinglesIterator i = files_.begin(); i != files_.end(); ++i) i->AddNGram(line); } template void AddNGram(const Iterator &begin, const Iterator &end, const StringPiece &line) { for (SinglesIterator i = files_.begin(); i != files_.end(); ++i) i->AddNGram(begin, end, line); } void SingleAddNGram(size_t offset, const StringPiece &line) { files_[offset].AddNGram(line); } template void SingleAddNGram(size_t offset, const Iterator &begin, const Iterator &end, const StringPiece &line) { files_[offset].AddNGram(begin, end, line); } protected: Singles files_; }; class MultipleARPAOutput : public MultipleOutput { public: MultipleARPAOutput(const char *prefix, size_t number) : MultipleOutput(prefix, number) {} void ReserveForCounts(std::streampos reserve) { for (boost::ptr_vector::iterator i = files_.begin(); i != files_.end(); ++i) i->ReserveForCounts(reserve); } void BeginLength(unsigned int length) { for (boost::ptr_vector::iterator i = files_.begin(); i != files_.end(); ++i) i->BeginLength(length); } void EndLength(unsigned int length) { for (boost::ptr_vector::iterator i = files_.begin(); i != files_.end(); ++i) i->EndLength(length); } void Finish() { for (boost::ptr_vector::iterator i = files_.begin(); i != files_.end(); ++i) i->Finish(); } }; template class DispatchInput { public: DispatchInput(Filter &filter, Output &output) : filter_(filter), output_(output) {} /* template void AddNGram(const Iterator &begin, const Iterator &end, const StringPiece &line) { filter_.AddNGram(begin, end, line, output_); }*/ void AddNGram(const StringPiece &ngram, const StringPiece &line) { filter_.AddNGram(ngram, line, output_); } protected: Filter &filter_; Output &output_; }; template class DispatchARPAInput : public DispatchInput { private: typedef DispatchInput B; public: DispatchARPAInput(Filter &filter, Output &output) : B(filter, output) {} void ReserveForCounts(std::streampos reserve) { B::output_.ReserveForCounts(reserve); } void BeginLength(unsigned int length) { B::output_.BeginLength(length); } void EndLength(unsigned int length) { B::filter_.Flush(); B::output_.EndLength(length); } void Finish() { B::output_.Finish(); } }; struct ARPAFormat { typedef ARPAOutput Output; typedef MultipleARPAOutput Multiple; static void Copy(util::FilePiece &in, Output &out) { ReadARPA(in, out); } template static void RunFilter(util::FilePiece &in, Filter &filter, Out &output) { DispatchARPAInput dispatcher(filter, output); ReadARPA(in, dispatcher); } }; struct CountFormat { typedef CountOutput Output; typedef MultipleOutput Multiple; static void Copy(util::FilePiece &in, Output &out) { ReadCount(in, out); } template static void RunFilter(util::FilePiece &in, Filter &filter, Out &output) { DispatchInput dispatcher(filter, output); ReadCount(in, dispatcher); } }; /* For multithreading, the buffer classes hold batches of filter inputs and * outputs in memory. The strings get reused a lot, so keep them around * instead of clearing each time. */ class InputBuffer { public: InputBuffer() : actual_(0) {} void Reserve(size_t size) { lines_.reserve(size); } template void AddNGram(const StringPiece &ngram, const StringPiece &line, Output &output) { if (lines_.size() == actual_) lines_.resize(lines_.size() + 1); // TODO avoid this copy. std::string &copied = lines_[actual_].line; copied.assign(line.data(), line.size()); lines_[actual_].ngram.set(copied.data() + (ngram.data() - line.data()), ngram.size()); ++actual_; } template void CallFilter(Filter &filter, Output &output) const { for (std::vector::const_iterator i = lines_.begin(); i != lines_.begin() + actual_; ++i) { filter.AddNGram(i->ngram, i->line, output); } } void Clear() { actual_ = 0; } bool Empty() { return actual_ == 0; } size_t Size() { return actual_; } private: struct Line { std::string line; StringPiece ngram; }; size_t actual_; std::vector lines_; }; class BinaryOutputBuffer { public: BinaryOutputBuffer() {} void Reserve(size_t size) { lines_.reserve(size); } void AddNGram(const StringPiece &line) { lines_.push_back(line); } template void Flush(Output &output) { for (std::vector::const_iterator i = lines_.begin(); i != lines_.end(); ++i) { output.AddNGram(*i); } lines_.clear(); } private: std::vector lines_; }; class MultipleOutputBuffer { public: MultipleOutputBuffer() : last_(NULL) {} void Reserve(size_t size) { annotated_.reserve(size); } void AddNGram(const StringPiece &line) { annotated_.resize(annotated_.size() + 1); annotated_.back().line = line; } void SingleAddNGram(size_t offset, const StringPiece &line) { if ((line.data() == last_.data()) && (line.length() == last_.length())) { annotated_.back().systems.push_back(offset); } else { annotated_.resize(annotated_.size() + 1); annotated_.back().systems.push_back(offset); annotated_.back().line = line; last_ = line; } } template void Flush(Output &output) { for (std::vector::const_iterator i = annotated_.begin(); i != annotated_.end(); ++i) { if (i->systems.empty()) { output.AddNGram(i->line); } else { for (std::vector::const_iterator j = i->systems.begin(); j != i->systems.end(); ++j) { output.SingleAddNGram(*j, i->line); } } } annotated_.clear(); } private: struct Annotated { // If this is empty, send to all systems. // A filter should never send to all systems and send to a single one. std::vector systems; StringPiece line; }; StringPiece last_; std::vector annotated_; }; } // namespace lm #endif // LM_FILTER_FORMAT_H