#include <cassert>
#include <cmath>
#include <limits>
#include <list>

#include <boost/unordered_set.hpp>

#include "util/file_piece.hh"
#include "util/tokenize_piece.hh"

#include "BleuScorer.h"
#include "ForestRescore.h"

using namespace std;

namespace MosesTuning
{

std::ostream& operator<<(std::ostream& out, const WordVec& wordVec)
{
  out << "[";
  for (size_t i = 0; i < wordVec.size(); ++i) {
    out << wordVec[i]->first;
    if (i + 1 < wordVec.size()) out << " ";
  }
  out << "]";
  return out;
}

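// Load the reference files, one file per reference set; line i of each file
// is a reference translation of sentence i.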
void ReferenceSet::Load(const vector<string>& files, Vocab& vocab)
{
  for (size_t i = 0; i < files.size(); ++i) {
    util::FilePiece fh(files[i].c_str());
    size_t sentenceId = 0;
    while (true) {
      StringPiece line;
      try {
        line = fh.ReadLine();
      } catch (util::EndOfFileException &e) {
        break;
      }
      AddLine(sentenceId, line, vocab);
      ++sentenceId;
    }
  }
}

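// Count the ngrams (up to kBleuNgramOrder) in one reference line, then merge
// them into the per-sentence totals and update the reference length.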
void ReferenceSet::AddLine(size_t sentenceId, const StringPiece& line, Vocab& vocab)
{
  NgramCounter ngramCounts;
  list<WordVec> openNgrams;
  size_t length = 0;

  // Tokenize on spaces, keeping a sliding window of open ngrams
  for (util::TokenIter<util::SingleCharacter, true> j(line, util::SingleCharacter(' ')); j; ++j) {
    const Vocab::Entry* nextTok = &(vocab.FindOrAdd(*j));
    ++length;
    openNgrams.push_front(WordVec());
    for (list<WordVec>::iterator k = openNgrams.begin(); k != openNgrams.end(); ++k) {
      k->push_back(nextTok);
      ++ngramCounts[*k];
    }
    if (openNgrams.size() >= kBleuNgramOrder) openNgrams.pop_back();
  }

  // Merge into the overall ngram map. For each ngram the map stores a pair:
  // the clipped count (maximum over references) and the total count.
  for (NgramCounter::const_iterator ni = ngramCounts.begin();
       ni != ngramCounts.end(); ++ni) {
    size_t count = ni->second;
    if (ngramCounts_.size() <= sentenceId) ngramCounts_.resize(sentenceId+1);
    NgramMap::iterator totalsIter = ngramCounts_[sentenceId].find(ni->first);
    if (totalsIter == ngramCounts_[sentenceId].end()) {
      ngramCounts_[sentenceId][ni->first] = pair<size_t,size_t>(count,count);
    } else {
      ngramCounts_[sentenceId][ni->first].first = max(count, ngramCounts_[sentenceId][ni->first].first);
      ngramCounts_[sentenceId][ni->first].second += count;
    }
  }

  // Reference length: keep the minimum across references
  if (lengths_.size() <= sentenceId) lengths_.resize(sentenceId+1);
  if (!lengths_[sentenceId]) {
    lengths_[sentenceId] = length;
  } else {
    lengths_[sentenceId] = min(length, lengths_[sentenceId]);
  }
}

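// How often the given ngram occurs in the references for this sentence,
// either clipped (maximum count in any single reference) or summed across
// all references.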
size_t ReferenceSet::NgramMatches(size_t sentenceId, const WordVec& ngram, bool clip) const
{
  const NgramMap& ngramCounts = ngramCounts_.at(sentenceId);
  NgramMap::const_iterator ngi = ngramCounts.find(ngram);
  if (ngi == ngramCounts.end()) return 0;
  return clip ? ngi->second.first : ngi->second.second;
}

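// BLEU stats are laid out as kBleuNgramOrder (matches, total) pairs followed
// by one slot for the effective reference length, so a vertex must hold
// kBleuNgramOrder*2+1 values.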
VertexState::VertexState(): bleuStats(kBleuNgramOrder*2+1), targetLength(0) {}

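// Fold an edge's ngram counts into the running BLEU stats, matching them
// against the unclipped reference counts.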
void HgBleuScorer::UpdateMatches(const NgramCounter& counts, vector<FeatureStatsType>& bleuStats) const
{
  for (NgramCounter::const_iterator ngi = counts.begin(); ngi != counts.end(); ++ngi) {
    size_t order = ngi->first.size();
    size_t count = ngi->second;
    bleuStats[(order-1)*2 + 1] += count;
    bleuStats[(order-1)*2] += min(count, references_.NgramMatches(sentenceId_, ngi->first, false));
  }
}

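// Target length of the derivation rooted at this edge: its own terminal
// words plus the cached lengths of its child vertices.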
size_t HgBleuScorer::GetTargetLength(const Edge& edge) const
{
  size_t targetLength = 0;
  for (size_t i = 0; i < edge.Words().size(); ++i) {
    const Vocab::Entry* word = edge.Words()[i];
    if (word) ++targetLength;
  }
  for (size_t i = 0; i < edge.Children().size(); ++i) {
    const VertexState& state = vertexStates_[edge.Children()[i]];
    targetLength += state.targetLength;
  }
  return targetLength;
}

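// Approximate BLEU score of an edge. NULL entries in the target side stand
// for child vertices; the scorer walks through each child's stored left and
// right contexts so that only ngrams introduced by this edge (including
// those crossing into a child) are counted, then adds the children's cached
// stats and sets the final slot to the effective reference length.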
FeatureStatsType HgBleuScorer::Score(const Edge& edge, const Vertex& head, vector<FeatureStatsType>& bleuStats)
{
  NgramCounter ngramCounts;
  size_t childId = 0;
  size_t wordId = 0;
  size_t contextId = 0; // position within the child's left or right context
  const VertexState* vertexState = NULL;
  bool inLeftContext = false;
  bool inRightContext = false;
  list<WordVec> openNgrams;
  const Vocab::Entry* currentWord = NULL;
  while (wordId < edge.Words().size()) {
    currentWord = edge.Words()[wordId];
    if (currentWord != NULL) {
      ++wordId;
    } else {
      if (!inLeftContext && !inRightContext) {
        // Entering a child vertex
        assert(!vertexState);
        vertexState = &(vertexStates_[edge.Children()[childId]]);
        ++childId;
        if (vertexState->leftContext.size()) {
          inLeftContext = true;
          contextId = 0;
          currentWord = vertexState->leftContext[contextId];
        } else {
          // Empty context
          vertexState = NULL;
          ++wordId;
          continue;
        }
      } else {
        // Already within a child vertex
        ++contextId;
        if (inLeftContext && contextId < vertexState->leftContext.size()) {
          // Still in the left context
          currentWord = vertexState->leftContext[contextId];
        } else if (inLeftContext) {
          // At the end of the left context
          if (vertexState->leftContext.size() == kBleuNgramOrder-1) {
            // Full-size context: jump to the right context
            openNgrams.clear();
            inLeftContext = false;
            inRightContext = true;
            contextId = 0;
            currentWord = vertexState->rightContext[contextId];
          } else {
            // Short context, so the right context holds the same words: skip it
            inLeftContext = false;
            vertexState = NULL;
            ++wordId;
            continue;
          }
        } else {
          // In the right context
          if (contextId < vertexState->rightContext.size()) {
            currentWord = vertexState->rightContext[contextId];
          } else {
            // Leaving the child vertex
            inRightContext = false;
            vertexState = NULL;
            ++wordId;
            continue;
          }
        }
      }
    }
    assert(currentWord);
    if (graph_.IsBoundary(currentWord)) continue;
    openNgrams.push_front(WordVec());
    openNgrams.front().reserve(kBleuNgramOrder);
    for (list<WordVec>::iterator k = openNgrams.begin(); k != openNgrams.end(); ++k) {
      k->push_back(currentWord);
      // Only count ngrams that this edge introduces, ie those not entirely
      // contained within a child's context
      if (!vertexState || (inLeftContext && k->size() > contextId+1)) ++ngramCounts[*k];
    }
    if (openNgrams.size() >= kBleuNgramOrder) openNgrams.pop_back();
  }

  // Collect matches for the ngrams introduced by this edge
  UpdateMatches(ngramCounts, bleuStats);

  // Add in the cached stats of the child vertices
  for (size_t i = 0; i < edge.Children().size(); ++i) {
    for (size_t j = 0; j < bleuStats.size(); ++j) {
      bleuStats[j] += vertexStates_[edge.Children()[i]].bleuStats[j];
    }
  }

  // Scale the reference length by the proportion of the source covered
  FeatureStatsType sourceLength = head.SourceCovered();
  size_t referenceLength = references_.Length(sentenceId_);
  FeatureStatsType effectiveReferenceLength =
    sourceLength / totalSourceLength_ * referenceLength;

  bleuStats[bleuStats.size()-1] = effectiveReferenceLength;

  FeatureStatsType bleu = sentenceLevelBackgroundBleu(bleuStats, backgroundBleu_);

  return bleu;
}

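// After a winning edge is chosen for a vertex, cache its state: the first
// and last (kBleuNgramOrder-1) target words as left/right contexts, the
// target length, and the winning BLEU stats.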
void HgBleuScorer::UpdateState(const Edge& winnerEdge, size_t vertexId, const vector<FeatureStatsType>& bleuStats)
{
  VertexState& vertexState = vertexStates_[vertexId];

  // Left context: walk the winning edge from the left
  int wi = 0;
  const VertexState* childState = NULL;
  int contexti = 0; // index within the child's context
  int childi = 0;
  while (vertexState.leftContext.size() < (kBleuNgramOrder-1)) {
    if ((size_t)wi >= winnerEdge.Words().size()) break;
    const Vocab::Entry* word = winnerEdge.Words()[wi];
    if (word != NULL) {
      vertexState.leftContext.push_back(word);
      ++wi;
    } else {
      if (childState == NULL) {
        // Start of a child state
        childState = &(vertexStates_[winnerEdge.Children()[childi++]]);
        contexti = 0;
      }
      if ((size_t)contexti < childState->leftContext.size()) {
        vertexState.leftContext.push_back(childState->leftContext[contexti++]);
      } else {
        // End of the child context
        childState = NULL;
        ++wi;
      }
    }
  }

  // Right context: walk the winning edge from the right, then reverse
  wi = winnerEdge.Words().size() - 1;
  childState = NULL;
  childi = winnerEdge.Children().size() - 1;
  while (vertexState.rightContext.size() < (kBleuNgramOrder-1)) {
    if (wi < 0) break;
    const Vocab::Entry* word = winnerEdge.Words()[wi];
    if (word != NULL) {
      vertexState.rightContext.push_back(word);
      --wi;
    } else {
      if (childState == NULL) {
        // Start (ie rhs) of a child state
        childState = &(vertexStates_[winnerEdge.Children()[childi--]]);
        contexti = childState->rightContext.size()-1;
      }
      if (contexti >= 0) {
        vertexState.rightContext.push_back(childState->rightContext[contexti--]);
      } else {
        // End (ie lhs) of the child context
        childState = NULL;
        --wi;
      }
    }
  }
  reverse(vertexState.rightContext.begin(), vertexState.rightContext.end());

  // Length and counts
  vertexState.targetLength = GetTargetLength(winnerEdge);
  vertexState.bleuStats = bleuStats;
}

typedef pair<const Edge*,FeatureStatsType> BackPointer;

/**
 * Recurse through the back pointers, appending terminal words and expanding
 * child vertices to reconstruct the best hypothesis.
 **/
static void GetBestHypothesis(size_t vertexId, const Graph& graph, const vector<BackPointer>& bps,
                              HgHypothesis* bestHypo)
{
  if (!bps[vertexId].first) return;
  const Edge* prevEdge = bps[vertexId].first;
  bestHypo->featureVector += *(prevEdge->Features().get());
  size_t childId = 0;
  for (size_t i = 0; i < prevEdge->Words().size(); ++i) {
    if (prevEdge->Words()[i] != NULL) {
      // Terminal word
      bestHypo->text.push_back(prevEdge->Words()[i]);
    } else {
      // NULL marks a child vertex: expand it recursively
      size_t childVertexId = prevEdge->Children()[childId++];
      HgHypothesis childHypo;
      GetBestHypothesis(childVertexId, graph, bps, &childHypo);
      bestHypo->text.insert(bestHypo->text.end(), childHypo.text.begin(), childHypo.text.end());
      bestHypo->featureVector += childHypo.featureVector;
    }
  }
}

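// Viterbi search through the hypergraph, assuming vertices are topologically
// sorted. Each edge is scored by its model score, optionally interpolated
// with approximate sentence BLEU weighted by bleuWeight. A hypothetical
// driver (all names from this file) might look like:
//
//   HgHypothesis bestHypo;
//   Viterbi(graph, weights, 1.0f, references, sentenceId,
//           backgroundBleu, &bestHypo);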
void Viterbi(const Graph& graph, const SparseVector& weights, float bleuWeight, const ReferenceSet& references, size_t sentenceId, const std::vector<FeatureStatsType>& backgroundBleu, HgHypothesis* bestHypo)
{
  BackPointer init((const Edge*) NULL, kMinScore);
  vector<BackPointer> backPointers(graph.VertexSize(), init);
  HgBleuScorer bleuScorer(references, graph, sentenceId, backgroundBleu);
  vector<FeatureStatsType> winnerStats(kBleuNgramOrder*2+1);
  for (size_t vi = 0; vi < graph.VertexSize(); ++vi) {
    FeatureStatsType winnerScore = kMinScore;
    const Vertex& vertex = graph.GetVertex(vi);
    const vector<const Edge*>& incoming = vertex.GetIncoming();
    if (!incoming.size()) {
      // A vertex with no incoming edges is a dead end
      backPointers[vi].first = NULL;
      backPointers[vi].second = kMinScore;
    } else {
      // Find the best incoming edge
      for (size_t ei = 0; ei < incoming.size(); ++ei) {
        FeatureStatsType incomingScore = incoming[ei]->GetScore(weights);
        for (size_t i = 0; i < incoming[ei]->Children().size(); ++i) {
          size_t childId = incoming[ei]->Children()[i];
          // Clamp at kMinScore so dead-end children do not underflow the sum
          incomingScore = max(incomingScore + backPointers[childId].second, kMinScore);
        }
        vector<FeatureStatsType> bleuStats(kBleuNgramOrder*2+1);

        FeatureStatsType totalScore = incomingScore;
        if (bleuWeight) {
          FeatureStatsType bleuScore = bleuScorer.Score(*(incoming[ei]), vertex, bleuStats);
          if (isnan(bleuScore)) {
            cerr << "WARN: bleu score undefined" << endl;
            cerr << "\tVertex id : " << vi << endl;
            cerr << "\tBleu stats : ";
            for (size_t i = 0; i < bleuStats.size(); ++i) {
              cerr << bleuStats[i] << ",";
            }
            cerr << endl;
            bleuScore = 0;
          }
          totalScore += bleuWeight * bleuScore;
        }
        if (totalScore >= winnerScore) {
          // Ties are broken in favour of the later edge
          winnerScore = totalScore;
          backPointers[vi].first = incoming[ei];
          backPointers[vi].second = incomingScore;
          winnerStats = bleuStats;
        }
      }
      // Cache the BLEU context state of the winning edge
      if (backPointers[vi].first) {
        bleuScorer.UpdateState(*(backPointers[vi].first), vi, winnerStats);
      }
    }
  }

  // Expand the back pointers from the final (root) vertex
  GetBestHypothesis(graph.VertexSize()-1, graph, backPointers, bestHypo);

  // Recalculate the BLEU stats of the winning hypothesis from scratch,
  // this time with clipped ngram matches
  bestHypo->bleuStats.resize(kBleuNgramOrder*2+1);
  NgramCounter counts;
  list<WordVec> openNgrams;
  for (size_t i = 0; i < bestHypo->text.size(); ++i) {
    const Vocab::Entry* entry = bestHypo->text[i];
    if (graph.IsBoundary(entry)) continue;
    openNgrams.push_front(WordVec());
    for (list<WordVec>::iterator k = openNgrams.begin(); k != openNgrams.end(); ++k) {
      k->push_back(entry);
      ++counts[*k];
    }
    if (openNgrams.size() >= kBleuNgramOrder) openNgrams.pop_back();
  }
  for (NgramCounter::const_iterator ngi = counts.begin(); ngi != counts.end(); ++ngi) {
    size_t order = ngi->first.size();
    size_t count = ngi->second;
    bestHypo->bleuStats[(order-1)*2 + 1] += count;
    bestHypo->bleuStats[(order-1)*2] += min(count, references.NgramMatches(sentenceId, ngi->first, true));
  }
  bestHypo->bleuStats[kBleuNgramOrder*2] = references.Length(sentenceId);
}

}  // namespace MosesTuning