// Step of trie builder: create sorted files. #ifndef LM_TRIE_SORT_H #define LM_TRIE_SORT_H #include "max_order.hh" #include "word_index.hh" #include "../util/file.hh" #include "../util/scoped.hh" #include #include #include #include #include namespace util { class FilePiece; } // namespace util namespace lm { class PositiveProbWarn; namespace ngram { class SortedVocabulary; struct Config; namespace trie { class EntryCompare : public std::binary_function { public: explicit EntryCompare(unsigned char order) : order_(order) {} bool operator()(const void *first_void, const void *second_void) const { const WordIndex *first = static_cast(first_void); const WordIndex *second = static_cast(second_void); const WordIndex *end = first + order_; for (; first != end; ++first, ++second) { if (*first < *second) return true; if (*first > *second) return false; } return false; } private: unsigned char order_; }; class RecordReader { public: RecordReader() : remains_(true) {} void Init(FILE *file, std::size_t entry_size); void *Data() { return data_.get(); } const void *Data() const { return data_.get(); } RecordReader &operator++() { std::size_t ret = fread(data_.get(), entry_size_, 1, file_); if (!ret) { UTIL_THROW_IF(!feof(file_), util::ErrnoException, "Error reading temporary file"); remains_ = false; } return *this; } operator bool() const { return remains_; } void Rewind(); std::size_t EntrySize() const { return entry_size_; } void Overwrite(const void *start, std::size_t amount); private: FILE *file_; util::scoped_malloc data_; bool remains_; std::size_t entry_size_; }; class SortedFiles { public: // Build from ARPA SortedFiles(const Config &config, util::FilePiece &f, std::vector &counts, std::size_t buffer, const std::string &file_prefix, SortedVocabulary &vocab); int StealUnigram() { return unigram_.release(); } FILE *Full(unsigned char order) { return full_[order - 2].get(); } FILE *Context(unsigned char of_order) { return context_[of_order - 2].get(); } private: void ConvertToSorted(util::FilePiece &f, const SortedVocabulary &vocab, const std::vector &counts, const std::string &prefix, unsigned char order, PositiveProbWarn &warn, void *mem, std::size_t mem_size); util::scoped_fd unigram_; util::scoped_FILE full_[KENLM_MAX_ORDER - 1], context_[KENLM_MAX_ORDER - 1]; }; } // namespace trie } // namespace ngram } // namespace lm #endif // LM_TRIE_SORT_H