#include "normalize.hh" #include "../common/compare.hh" #include "../common/ngram_stream.hh" #include "backoff_matrix.hh" #include "bounded_sequence_encoding.hh" #include "interpolate_info.hh" #include "merge_probabilities.hh" #include "../weights.hh" #include "../word_index.hh" #include "../../util/fixed_array.hh" #include "../../util/scoped.hh" #include "../../util/stream/stream.hh" #include "../../util/stream/rewindable_stream.hh" #include #include #include namespace lm { namespace interpolate { namespace { class BackoffQueueEntry { public: BackoffQueueEntry(float &entry, const util::stream::ChainPosition &position) : entry_(entry), stream_(position) { entry_ = 0.0; } operator bool() const { return stream_; } NGramHeader operator*() const { return *stream_; } const NGramHeader *operator->() const { return &*stream_; } void Enter() { entry_ = stream_->Value().backoff; } BackoffQueueEntry &Next() { entry_ = 0.0; ++stream_; return *this; } private: float &entry_; NGramStream stream_; }; struct PtrGreater : public std::binary_function { bool operator()(const BackoffQueueEntry *first, const BackoffQueueEntry *second) const { return SuffixLexicographicLess()(**second, **first); } }; class EntryOwner : public util::FixedArray { public: void push_back(float &entry, const util::stream::ChainPosition &position) { new (end()) BackoffQueueEntry(entry, position); Constructed(); } }; std::size_t MaxOrder(const util::FixedArray &model) { std::size_t ret = 0; for (const util::stream::ChainPositions *m = model.begin(); m != model.end(); ++m) { ret = std::max(ret, m->size()); } return ret; } class BackoffManager { public: explicit BackoffManager(const util::FixedArray &models) : entered_(MaxOrder(models)), matrix_(models.size(), MaxOrder(models)), skip_write_(MaxOrder(models)) { std::size_t total = 0; for (const util::stream::ChainPositions *m = models.begin(); m != models.end(); ++m) { total += m->size(); } for (std::size_t i = 0; i < MaxOrder(models); ++i) { entered_.push_back(models.size()); } owner_.Init(total); for (const util::stream::ChainPositions *m = models.begin(); m != models.end(); ++m) { for (const util::stream::ChainPosition *j = m->begin(); j != m->end(); ++j) { owner_.push_back(matrix_.Backoff(m - models.begin(), j - m->begin()), *j); if (owner_.back()) { queue_.push(&owner_.back()); } } } } void SetupSkip(std::size_t order, util::stream::Stream &stream) { skip_write_[order - 2] = &stream; } // Move up the backoffs for the given n-gram. The n-grams must be provided // in suffix lexicographic order. void Enter(const NGramHeader &to) { // Check that we exited properly. for (std::size_t i = to.Order() - 1; i < entered_.size(); ++i) { assert(entered_[i].empty()); } SuffixLexicographicLess less; while (!queue_.empty() && less(**queue_.top(), to)) SkipRecord(); while (TopMatches(to)) { BackoffQueueEntry *matches = queue_.top(); entered_[to.Order() - 1].push_back(matches); matches->Enter(); queue_.pop(); } } void Exit(std::size_t order_minus_1) { for (BackoffQueueEntry **i = entered_[order_minus_1].begin(); i != entered_[order_minus_1].end(); ++i) { if ((*i)->Next()) queue_.push(*i); } entered_[order_minus_1].clear(); } float Get(std::size_t model, std::size_t order_minus_1) const { return matrix_.Backoff(model, order_minus_1); } void Finish() { while (!queue_.empty()) SkipRecord(); } private: void SkipRecord() { BackoffQueueEntry *top = queue_.top(); queue_.pop(); // Is this the last instance of the n-gram? if (!TopMatches(**top)) { // An n-gram is being skipped. 

typedef long double Accum;

// Handles n-grams of the same order, using recursion to call another instance
// for higher orders.
class Recurse {
  public:
    Recurse(
        const InterpolateInfo &info, // Must stay alive the entire time.
        std::size_t order,
        const util::stream::ChainPosition &merged_probs,
        const util::stream::ChainPosition &prob_out,
        const util::stream::ChainPosition &backoff_out,
        BackoffManager &backoffs,
        Recurse *higher) // higher is null for the highest order.
      : order_(order),
        encoding_(MakeEncoder(info, order)),
        input_(merged_probs, PartialProbGamma(order, encoding_.EncodedLength())),
        prob_out_(prob_out),
        backoff_out_(backoff_out),
        backoffs_(backoffs),
        lambdas_(&*info.lambdas.begin()),
        higher_(higher),
        decoded_backoffs_(info.Models()),
        extended_context_(order - 1) {
      // This is only for bigrams and above.  Summing unigrams is a much easier case.
      assert(order >= 2);
    }

    // context = w_1^{n-1}
    // z_lower = Z(w_2^{n-1})
    // Input:
    //   Merged probabilities without backoff applied in input_.
    //   Backoffs via backoffs_.
    // Calculates:
    //   Z(w_1^{n-1}): intermediate only.
    //   p_I(x | w_1^{n-1}) for all x: w_1^{n-1}x exists: Written to prob_out_.
    //   b_I(w_1^{n-1}): Written to backoff_out_.
    void SameContext(const NGramHeader &context, Accum z_lower) {
      assert(context.size() == order_ - 1);
      backoffs_.Enter(context);
      prob_out_.Mark();

      // This is the backoff term that applies when one assumes everything backs off:
      // \prod_i b_i(w_1^{n-1})^{\lambda_i}.
      Accum backoff_once = 0.0;
      for (std::size_t m = 0; m < decoded_backoffs_.size(); ++m) {
        backoff_once += lambdas_[m] * backoffs_.Get(m, order_ - 2);
      }

      Accum z_delta = 0.0;
      std::size_t count = 0;
      for (; input_ && std::equal(context.begin(), context.end(), input_->begin());
           ++input_, ++prob_out_, ++count) {
        // Apply backoffs to probabilities.
        // TODO: change bounded sequence encoding to have an iterator for decoding
        // instead of doing a copy here.
        encoding_.Decode(input_->FromBegin(), &*decoded_backoffs_.begin());
        for (std::size_t m = 0; m < NumModels(); ++m) {
          // Apply the backoffs as instructed for model m.
          float accumulated = 0.0;
          // Change backoffs for [order it backed off to, order - 1) except
          // with 0-indexing.  There is still the potential to charge backoff
          // for order - 1, which is done later.  The backoffs charged here
          // are b_m(w_{n-1}^{n-1}) ... b_m(w_2^{n-1}).
          for (unsigned char backed_to = decoded_backoffs_[m]; backed_to < order_ - 2; ++backed_to) {
            accumulated += backoffs_.Get(m, backed_to);
          }
          float lambda = lambdas_[m];
          // Lower p(x | w_2^{n-1}) gets all the backoffs except the highest.
          input_->LowerProb() += accumulated * lambda;
          // Charge the backoff b(w_1^{n-1}) if applicable, but only to attain p(x | w_1^{n-1}).
          if (decoded_backoffs_[m] < order_ - 1) {
            accumulated += backoffs_.Get(m, order_ - 2);
          }
          input_->Prob() += accumulated * lambda;
        }
        // TODO: better precision/fewer operations here.
        z_delta += pow(10.0, input_->Prob()) - pow(10.0, input_->LowerProb() + backoff_once);

        // Write unnormalized probability record.
        std::copy(input_->begin(), input_->end(), reinterpret_cast<WordIndex*>(prob_out_.Get()));
        ProbWrite() = input_->Prob();
      }
      // TODO: numerical precision.
      Accum z = log10(pow(10.0, z_lower + backoff_once) + z_delta);

      // Normalize.
      prob_out_.Rewind();
      for (std::size_t i = 0; i < count; ++i, ++prob_out_) {
        ProbWrite() -= z;
      }
      // This allows the stream to release data.
      prob_out_.Mark();

      // Output backoff.
      *reinterpret_cast<float*>(backoff_out_.Get()) = z_lower + backoff_once - z;
      ++backoff_out_;

      if (higher_.get()) higher_->ExtendContext(context, z);

      backoffs_.Exit(order_ - 2);
    }
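
    // In equations, with everything in log base 10, SameContext computes:
    //   backoff_once = \sum_m \lambda_m \log b_m(w_1^{n-1})
    //   Z(w_1^{n-1}) = \log [ 10^{Z(w_2^{n-1}) + backoff_once}
    //     + \sum_{x : w_1^{n-1}x exists} ( 10^{p(x | w_1^{n-1})} - 10^{p(x | w_2^{n-1}) + backoff_once} ) ]
    //   b_I(w_1^{n-1}) = Z(w_2^{n-1}) + backoff_once - Z(w_1^{n-1})
    // The first term assumes every word backs off; the summation corrects for
    // the words that explicitly follow this context.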

    // Call is given a context and z(context).
    // Evaluates y context x for all y, x.
    void ExtendContext(const NGramHeader &middle, Accum z_lower) {
      assert(middle.size() == order_ - 2);
      // Copy because the input will advance.  TODO: avoid this copy by sharing amongst classes.
      std::copy(middle.begin(), middle.end(), extended_context_.begin() + 1);
      while (input_ && std::equal(middle.begin(), middle.end(), input_->begin() + 1)) {
        *extended_context_.begin() = *input_->begin();
        SameContext(NGramHeader(&*extended_context_.begin(), order_ - 1), z_lower);
      }
    }

    void Finish() {
      assert(!input_);
      prob_out_.Poison();
      backoff_out_.Poison();
      if (higher_.get()) higher_->Finish();
    }

    // The BackoffManager class also injects backoffs when it skips ahead e.g. b(<unk>) = 1.
    util::stream::Stream &BackoffStream() { return backoff_out_; }

  private:
    // Write the probability to the correct place in prob_out_.
    // Should use a proxy but currently incompatible with RewindableStream.
    float &ProbWrite() {
      return *reinterpret_cast<float*>(reinterpret_cast<uint8_t*>(prob_out_.Get()) + order_ * sizeof(WordIndex));
    }

    std::size_t NumModels() const { return decoded_backoffs_.size(); }

    const std::size_t order_;

    const BoundedSequenceEncoding encoding_;

    ProxyStream<PartialProbGamma> input_;
    util::stream::RewindableStream prob_out_;
    util::stream::Stream backoff_out_;

    BackoffManager &backoffs_;
    const float *const lambdas_;

    // Higher order instance of this same class.
    util::scoped_ptr<Recurse> higher_;

    // Temporary in SameContext.
    std::vector<unsigned char> decoded_backoffs_;
    // Temporary in ExtendContext.
    std::vector<WordIndex> extended_context_;
};

class Thread {
  public:
    Thread(const InterpolateInfo &info, util::FixedArray<util::stream::ChainPositions> &models_by_order,
           util::stream::Chains &prob_out, util::stream::Chains &backoff_out)
      : info_(info), models_by_order_(models_by_order),
        prob_out_(prob_out), backoff_out_(backoff_out) {}

    void Run(const util::stream::ChainPositions &merged_probabilities) {
      // Unigrams do not have encoded backoff info.
      ProxyStream<PartialProbGamma> in(merged_probabilities[0], PartialProbGamma(1, 0));
      util::stream::RewindableStream prob_write(prob_out_[0]);
      Accum z = 0.0;
      prob_write.Mark();
      WordIndex count = 0;
      for (; in; ++in, ++prob_write, ++count) {
        // Note assumption that probability comes first.
        memcpy(prob_write.Get(), in.Get(), sizeof(WordIndex) + sizeof(float));
        z += pow(10.0, in->Prob());
      }
      // TODO HACK TODO: lmplz outputs p(<s>) = 1 to get q to compute nicely.
      // That will always result in 1.0 more than it should be.
      z -= 1.0;
      float log_z = log10(z);
      prob_write.Rewind();
      // Normalize unigram probabilities.
      for (WordIndex i = 0; i < count; ++i, ++prob_write) {
        *reinterpret_cast<float*>(reinterpret_cast<uint8_t*>(prob_write.Get()) + sizeof(WordIndex)) -= log_z;
      }
      prob_write.Poison();
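
      // The setup below builds one Recurse instance per order, highest order
      // first, so each instance owns a scoped_ptr to the next order up.
      // SetupSkip hands BackoffManager each order's backoff output stream so
      // skipped contexts receive b = 1 (0.0 in log10 space).  Calling
      // ExtendContext on the bigram instance with the empty context and log_z
      // then drives the entire recursion.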
      // Now setup the higher orders.
      util::scoped_ptr<Recurse> higher_order;
      BackoffManager backoffs(models_by_order_);
      std::size_t max_order = merged_probabilities.size();
      for (std::size_t order = max_order; order >= 2; --order) {
        higher_order.reset(new Recurse(info_, order, merged_probabilities[order - 1],
                                       prob_out_[order - 1], backoff_out_[order - 2],
                                       backoffs, higher_order.release()));
        backoffs.SetupSkip(order, higher_order->BackoffStream());
      }
      if (max_order > 1) {
        higher_order->ExtendContext(NGramHeader(NULL, 0), log_z);
        backoffs.Finish();
        higher_order->Finish();
      }
    }

  private:
    const InterpolateInfo info_;
    util::FixedArray<util::stream::ChainPositions> &models_by_order_;
    util::stream::ChainPositions prob_out_;
    util::stream::ChainPositions backoff_out_;
};

} // namespace

void Normalize(const InterpolateInfo &info,
               util::FixedArray<util::stream::ChainPositions> &models_by_order,
               util::stream::Chains &merged_probabilities,
               util::stream::Chains &prob_out, util::stream::Chains &backoff_out) {
  assert(prob_out.size() == backoff_out.size() + 1);
  // Arbitrarily put the thread on the merged_probabilities Chains.
  merged_probabilities >> Thread(info, models_by_order, prob_out, backoff_out);
}

}} // namespaces
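
/* A minimal calling sketch (illustrative only; the chains, positions, and
 * InterpolateInfo are assumed to be configured elsewhere in the pipeline and
 * are not spelled out by this file):
 *
 *   InterpolateInfo info = ...;  // info.lambdas and info.Models() filled in
 *   util::FixedArray<util::stream::ChainPositions> models_by_order(...);
 *   util::stream::Chains merged_probabilities(...), prob_out(...), backoff_out(...);
 *   // prob_out holds one chain per order; backoff_out holds one fewer since
 *   // the highest order has no backoff.
 *   lm::interpolate::Normalize(info, models_by_order, merged_probabilities,
 *                              prob_out, backoff_out);
 */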