#ifndef LM_BUILDER_SORT_H #define LM_BUILDER_SORT_H #include "lm/builder/ngram_stream.hh" #include "lm/builder/ngram.hh" #include "lm/word_index.hh" #include "util/stream/sort.hh" #include "util/stream/timer.hh" #include #include namespace lm { namespace builder { /** * Abstract parent class for defining custom n-gram comparators. */ template class Comparator : public std::binary_function { public: /** * Constructs a comparator capable of comparing two n-grams. * * @param order Number of words in each n-gram */ explicit Comparator(std::size_t order) : order_(order) {} /** * Applies the comparator using the Compare method that must be defined in any class that inherits from this class. * * @param lhs A pointer to the n-gram on the left-hand side of the comparison * @param rhs A pointer to the n-gram on the right-hand side of the comparison * * @see ContextOrder::Compare * @see PrefixOrder::Compare * @see SuffixOrder::Compare */ inline bool operator()(const void *lhs, const void *rhs) const { return static_cast(this)->Compare(static_cast(lhs), static_cast(rhs)); } /** Gets the n-gram order defined for this comparator. */ std::size_t Order() const { return order_; } protected: std::size_t order_; }; /** * N-gram comparator that compares n-grams according to their reverse (suffix) order. * * This comparator compares n-grams lexicographically, one word at a time, * beginning with the last word of each n-gram and ending with the first word of each n-gram. * * Some examples of n-gram comparisons as defined by this comparator: * - a b c == a b c * - a b c < a b d * - a b c > a d b * - a b c > a b b * - a b c > x a c * - a b c < x y z */ class SuffixOrder : public Comparator { public: /** * Constructs a comparator capable of comparing two n-grams. * * @param order Number of words in each n-gram */ explicit SuffixOrder(std::size_t order) : Comparator(order) {} /** * Compares two n-grams lexicographically, one word at a time, * beginning with the last word of each n-gram and ending with the first word of each n-gram. * * @param lhs A pointer to the n-gram on the left-hand side of the comparison * @param rhs A pointer to the n-gram on the right-hand side of the comparison */ inline bool Compare(const WordIndex *lhs, const WordIndex *rhs) const { for (std::size_t i = order_ - 1; i != 0; --i) { if (lhs[i] != rhs[i]) return lhs[i] < rhs[i]; } return lhs[0] < rhs[0]; } static const unsigned kMatchOffset = 1; }; /** * N-gram comparator that compares n-grams according to the reverse (suffix) order of the n-gram context. * * This comparator compares n-grams lexicographically, one word at a time, * beginning with the penultimate word of each n-gram and ending with the first word of each n-gram; * finally, this comparator compares the last word of each n-gram. * * Some examples of n-gram comparisons as defined by this comparator: * - a b c == a b c * - a b c < a b d * - a b c < a d b * - a b c > a b b * - a b c > x a c * - a b c < x y z */ class ContextOrder : public Comparator { public: /** * Constructs a comparator capable of comparing two n-grams. * * @param order Number of words in each n-gram */ explicit ContextOrder(std::size_t order) : Comparator(order) {} /** * Compares two n-grams lexicographically, one word at a time, * beginning with the penultimate word of each n-gram and ending with the first word of each n-gram; * finally, this comparator compares the last word of each n-gram. * * @param lhs A pointer to the n-gram on the left-hand side of the comparison * @param rhs A pointer to the n-gram on the right-hand side of the comparison */ inline bool Compare(const WordIndex *lhs, const WordIndex *rhs) const { for (int i = order_ - 2; i >= 0; --i) { if (lhs[i] != rhs[i]) return lhs[i] < rhs[i]; } return lhs[order_ - 1] < rhs[order_ - 1]; } }; /** * N-gram comparator that compares n-grams according to their natural (prefix) order. * * This comparator compares n-grams lexicographically, one word at a time, * beginning with the first word of each n-gram and ending with the last word of each n-gram. * * Some examples of n-gram comparisons as defined by this comparator: * - a b c == a b c * - a b c < a b d * - a b c < a d b * - a b c > a b b * - a b c < x a c * - a b c < x y z */ class PrefixOrder : public Comparator { public: /** * Constructs a comparator capable of comparing two n-grams. * * @param order Number of words in each n-gram */ explicit PrefixOrder(std::size_t order) : Comparator(order) {} /** * Compares two n-grams lexicographically, one word at a time, * beginning with the first word of each n-gram and ending with the last word of each n-gram. * * @param lhs A pointer to the n-gram on the left-hand side of the comparison * @param rhs A pointer to the n-gram on the right-hand side of the comparison */ inline bool Compare(const WordIndex *lhs, const WordIndex *rhs) const { for (std::size_t i = 0; i < order_; ++i) { if (lhs[i] != rhs[i]) return lhs[i] < rhs[i]; } return false; } static const unsigned kMatchOffset = 0; }; // Sum counts for the same n-gram. struct AddCombiner { bool operator()(void *first_void, const void *second_void, const SuffixOrder &compare) const { NGram first(first_void, compare.Order()); // There isn't a const version of NGram. NGram second(const_cast(second_void), compare.Order()); if (memcmp(first.begin(), second.begin(), sizeof(WordIndex) * compare.Order())) return false; first.Count() += second.Count(); return true; } }; // The combiner is only used on a single chain, so I didn't bother to allow // that template. /** * Represents an @ref util::FixedArray "array" capable of storing @ref util::stream::Sort "Sort" objects. * * In the anticipated use case, an instance of this class will maintain one @ref util::stream::Sort "Sort" object * for each n-gram order (ranging from 1 up to the maximum n-gram order being processed). * Use in this manner would enable the n-grams each n-gram order to be sorted, in parallel. * * @tparam Compare An @ref Comparator "ngram comparator" to use during sorting. */ template class Sorts : public util::FixedArray > { private: typedef util::stream::Sort S; typedef util::FixedArray P; public: /** * Constructs, but does not initialize. * * @ref util::FixedArray::Init() "Init" must be called before use. * * @see util::FixedArray::Init() */ Sorts() {} /** * Constructs an @ref util::FixedArray "array" capable of storing a fixed number of @ref util::stream::Sort "Sort" objects. * * @param number The maximum number of @ref util::stream::Sort "sorters" that can be held by this @ref util::FixedArray "array" * @see util::FixedArray::FixedArray() */ explicit Sorts(std::size_t number) : util::FixedArray >(number) {} /** * Constructs a new @ref util::stream::Sort "Sort" object which is stored in this @ref util::FixedArray "array". * * The new @ref util::stream::Sort "Sort" object is constructed using the provided @ref util::stream::SortConfig "SortConfig" and @ref Comparator "ngram comparator"; * once constructed, a new worker @ref util::stream::Thread "thread" (owned by the @ref util::stream::Chain "chain") will sort the n-gram data stored * in the @ref util::stream::Block "blocks" of the provided @ref util::stream::Chain "chain". * * @see util::stream::Sort::Sort() * @see util::stream::Chain::operator>>() */ void push_back(util::stream::Chain &chain, const util::stream::SortConfig &config, const Compare &compare) { new (P::end()) S(chain, config, compare); // use "placement new" syntax to initalize S in an already-allocated memory location P::Constructed(); } }; } // namespace builder } // namespace lm #endif // LM_BUILDER_SORT_H