#include "tune_derivatives.hh"

#include "tune_instances.hh"
#include "tune_matrix.hh"
#include "../../util/stream/chain.hh"
#include "../../util/stream/typed_stream.hh"

#include <Eigen/Core>

namespace lm { namespace interpolate {

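// Compute the gradient and Hessian, with respect to the interpolation weights,
// of the negative log-likelihood of the tuning data under the interpolated model.
// The return value is exp of the average negative log-probability per instance,
// i.e. the perplexity of the tuning data (see the return statement at the bottom).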
Accum Derivatives(Instances &in, const Vector &weights, Vector &gradient, Matrix &hessian) {
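  // CorrectGradientTerm()(i) is -\sum_n ln p_i(w_n | context_n) (the neg_correct_sum_
  // mentioned below); the per-context normalizer terms are added inside the loop.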
  gradient = in.CorrectGradientTerm();
  hessian = Matrix::Zero(weights.rows(), weights.rows());

  // TODO: loop instead to force low-memory evaluation?
  // Compute p_I(x) * Z_epsilon, i.e. the unnormalized unigram probabilities.
  Vector weighted_uni((in.LNUnigrams() * weights).array().exp());
  // Even -inf doesn't work for <s> because weights can be negative.  Manually set it to zero.
  weighted_uni(in.BOS()) = 0.0;
  Accum Z_epsilon = weighted_uni.sum();
  // unigram_cross(i) = \sum_{all x} p_I(x) ln p_i(x)
  Vector unigram_cross(in.LNUnigrams().transpose() * weighted_uni / Z_epsilon);

  Accum sum_B_I = 0.0;
  Accum sum_ln_Z_context = 0.0;

  // Temporaries used each cycle of the loop.
  Matrix convolve;
  Vector full_cross;
  Matrix hessian_missing_Z_context;
  // Backed-off log probability ln(p_i(x) b_i(context)) for each model i.
  Vector ln_p_i_backed;
  // Full ln p_i(x | context)
  Vector ln_p_i_full;

  // TODO: make the memory size configurable.
  util::stream::Chain chain(util::stream::ChainConfig(in.ReadExtensionsEntrySize(), 2, 64 << 20));
  chain.ActivateProgress();
  in.ReadExtensions(chain);
  util::stream::TypedStream<Extension> extensions(chain.Add());
  chain >> util::stream::kRecycle;
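  // Each extension record carries (instance, word, model, ln_prob).  The nested loops
  // below consume the stream sequentially, so it is assumed to be sorted by instance,
  // then by word.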

  // Loop over instances (words in the tuning data).
  for (InstanceIndex n = 0; n < in.NumInstances(); ++n) {
    assert(extensions);
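    // weighted_backoffs = exp(\sum_i w_i ln b_i(context)) = \prod_i b_i(context)^{w_i},
    // the weighted product of backoffs; scaled by Z_epsilon / Z_context below it becomes B_I.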
    Accum weighted_backoffs = exp(in.LNBackoffs(n).dot(weights));

    // Compute \sum_{x: model does not back off to unigram} p_I(x) Z_epsilon
    Accum unnormalized_sum_x_p_I = 0.0;
    // Compute \sum_{x: model does not back off to unigram} p_I(x | context)Z(context)
    Accum unnormalized_sum_x_p_I_full = 0.0;

    // This should be divided by Z_context then added to the Hessian.
    hessian_missing_Z_context = Matrix::Zero(weights.rows(), weights.rows());

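    // full_cross accumulates, for words with extensions, the difference between the
    // full-context term and the presumed-unigram term; after the loop it is completed
    // into \sum_x p_I(x | context) ln p_i(x | context).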
    full_cross = Vector::Zero(weights.rows());

    // Loop over the words within this instance for which an extension exists.  An extension occurs when any model matches more than a unigram for the tuning instance.
    while (extensions && extensions->instance == n) {
      const WordIndex word = extensions->word;
      unnormalized_sum_x_p_I += weighted_uni(word);

      ln_p_i_backed = in.LNUnigrams().row(word) + in.LNBackoffs(n);

      // Calculate ln_p_i_full(i) = ln p_i(word | context) by starting from the backed-off values and overwriting the entries of models that have extensions.
      ln_p_i_full = ln_p_i_backed;
      // Loop over all models that have an extension for the same word, i.e. models for which p_i(word | context) matches at least a bigram.
      for (; extensions && extensions->word == word && extensions->instance == n; ++extensions) {
        ln_p_i_full(extensions->model) = extensions->ln_prob;
      }

      // This is the weighted product of probabilities.  In other words, p_I(word | context) * Z(context) = exp(\sum_i w_i * ln p_i(word | context)).
      Accum weighted = exp(ln_p_i_full.dot(weights));
      unnormalized_sum_x_p_I_full += weighted;

      // These aren't normalized by Z_context yet (that happens after the loop).
      full_cross.noalias() +=
        weighted * ln_p_i_full
        - weighted_uni(word) * weighted_backoffs /* we'll divide by Z_context later to form B_I */ * in.LNUnigrams().row(word).transpose();

      // This will get divided by Z_context then added to the Hessian.
      hessian_missing_Z_context.noalias() +=
        // Replacement terms.
        weighted * ln_p_i_full * ln_p_i_full.transpose()
        // Presumed unigrams.  Z_epsilon * weighted_backoffs will turn into B_I once all of this is divided by Z_context.
        - weighted_uni(word) * weighted_backoffs * ln_p_i_backed * ln_p_i_backed.transpose();
    }

    Accum Z_context =
      weighted_backoffs * (Z_epsilon - unnormalized_sum_x_p_I) // Back off and unnormalize the unigrams for which there is no extension.
      + unnormalized_sum_x_p_I_full; // Add the extensions.
    sum_ln_Z_context += log(Z_context);
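    // B_I(context) = Z_epsilon * \prod_i b_i(context)^{w_i} / Z(context).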
    Accum B_I = Z_epsilon / Z_context * weighted_backoffs;
    sum_B_I += B_I;

    // This is the gradient term for this instance, except for -ln p_i(w_n | w_1^{n-1}), which was already accounted for as part of neg_correct_sum_.
    // full_cross(i) is \sum_{all x} p_I(x | context) ln p_i(x | context)
    // The terms accumulated above were not divided by Z_context because it was not known at the time.
    full_cross /= Z_context;
    full_cross +=
      // Uncorrected term
      B_I * (in.LNBackoffs(n).transpose() + unigram_cross)
      // Subtract values that should not have been charged.
      - unnormalized_sum_x_p_I / Z_epsilon * B_I * in.LNBackoffs(n).transpose();
    gradient += full_cross;

    convolve = unigram_cross * in.LNBackoffs(n);
    // There's one missing term here, which is independent of context and done at the end.
    hessian.noalias() +=
      // First term of Hessian, assuming all models back off to unigram.
      B_I * (convolve + convolve.transpose() + in.LNBackoffs(n).transpose() * in.LNBackoffs(n))
      // Error in the first term, correcting from unigram to full probabilities.
      + hessian_missing_Z_context / Z_context
      // Second term of Hessian, with correct full probabilities.
      - full_cross * full_cross.transpose();
  }

  for (Matrix::Index x = 0; x < weighted_uni.rows(); ++x) {
    // \sum_{contexts} B_I(context) \sum_x p_I(x) ln p_i(x) ln p_j(x)
    // TODO: can this be optimized?  It sums over the entire vocabulary, which should be a matrix operation.
    hessian.noalias() += sum_B_I * weighted_uni(x) / Z_epsilon * in.LNUnigrams().row(x).transpose() * in.LNUnigrams().row(x);
  }
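  // ln p_I(w_n | context_n) = \sum_i w_i ln p_i(w_n | context_n) - ln Z(context_n), so this
  // is exp of the average negative log-probability of the tuning data, i.e. its perplexity.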
  return exp((in.CorrectGradientTerm().dot(weights) + sum_ln_Z_context) / static_cast<double>(in.NumInstances()));
}

}} // namespaces