|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#ifndef MERT_FOREST_RESCORE_H |
|
#define MERT_FOREST_RESCORE_H |
|
|
|
#include <cstddef>
#include <cstring>
#include <valarray>
#include <vector>

#include <boost/unordered_map.hpp>
#include <boost/unordered_set.hpp>

#include "BleuScorer.h"
#include "Hypergraph.h"
|
|
|
namespace MosesTuning |
|
{ |
|
|
|
/**
 * Stream-insertion operator for an n-gram / word sequence.
 *
 * Only the declaration lives here; the output format is decided by the
 * definition in the implementation file.
 *
 * @param out     stream to write to
 * @param wordVec word sequence to write
 * @return a std::ostream reference (presumably `out`, to allow chaining --
 *         confirm against the definition)
 */
std::ostream& operator<<(
  std::ostream& out,
  const WordVec& wordVec);
|
|
|
struct NgramHash : public std::unary_function<const WordVec&, std::size_t> { |
|
std::size_t operator()(const WordVec& ngram) const { |
|
return util::MurmurHashNative(&(ngram[0]), ngram.size() * sizeof(WordVec::value_type)); |
|
} |
|
}; |
|
|
|
struct NgramEquals : public std::binary_function<const WordVec&, const WordVec&, bool> { |
|
bool operator()(const WordVec& first, const WordVec& second) const { |
|
if (first.size() != second.size()) return false; |
|
return memcmp(&(first[0]), &(second[0]), first.size() * sizeof(WordVec::value_type)) == 0; |
|
} |
|
}; |
|
|
|
/**
 * Hash map from an n-gram (WordVec) to a size_t value, keyed with
 * NgramHash / NgramEquals. Presumably the value is an occurrence count,
 * as the name suggests -- confirm against the users of this type.
 */
typedef boost::unordered_map<
  WordVec,
  size_t,
  NgramHash,
  NgramEquals
> NgramCounter;
|
|
|
|
|
class ReferenceSet |
|
{ |
|
|
|
|
|
public: |
|
|
|
void AddLine(size_t sentenceId, const StringPiece& line, Vocab& vocab); |
|
|
|
void Load(const std::vector<std::string>& files, Vocab& vocab); |
|
|
|
size_t NgramMatches(size_t sentenceId, const WordVec&, bool clip) const; |
|
|
|
size_t Length(size_t sentenceId) const { |
|
return lengths_[sentenceId]; |
|
} |
|
|
|
private: |
|
|
|
typedef boost::unordered_map<WordVec, std::pair<std::size_t,std::size_t>, NgramHash,NgramEquals> NgramMap; |
|
std::vector<NgramMap> ngramCounts_; |
|
std::vector<size_t> lengths_; |
|
|
|
}; |
|
|
|
struct VertexState { |
|
VertexState(); |
|
|
|
std::vector<FeatureStatsType> bleuStats; |
|
WordVec leftContext; |
|
WordVec rightContext; |
|
size_t targetLength; |
|
}; |
|
|
|
|
|
|
|
|
|
class HgBleuScorer |
|
{ |
|
public: |
|
HgBleuScorer(const ReferenceSet& references, const Graph& graph, size_t sentenceId, const std::vector<FeatureStatsType>& backgroundBleu): |
|
references_(references), sentenceId_(sentenceId), graph_(graph), backgroundBleu_(backgroundBleu), |
|
backgroundRefLength_(backgroundBleu[kBleuNgramOrder*2]) { |
|
vertexStates_.resize(graph.VertexSize()); |
|
totalSourceLength_ = graph.GetVertex(graph.VertexSize()-1).SourceCovered(); |
|
} |
|
|
|
FeatureStatsType Score(const Edge& edge, const Vertex& head, std::vector<FeatureStatsType>& bleuStats) ; |
|
|
|
void UpdateState(const Edge& winnerEdge, size_t vertexId, const std::vector<FeatureStatsType>& bleuStats); |
|
|
|
|
|
private: |
|
const ReferenceSet& references_; |
|
std::vector<VertexState> vertexStates_; |
|
size_t sentenceId_; |
|
size_t totalSourceLength_; |
|
const Graph& graph_; |
|
std::vector<FeatureStatsType> backgroundBleu_; |
|
FeatureStatsType backgroundRefLength_; |
|
|
|
void UpdateMatches(const NgramCounter& counter, std::vector<FeatureStatsType>& bleuStats) const; |
|
size_t GetTargetLength(const Edge& edge) const; |
|
}; |
|
|
|
struct HgHypothesis { |
|
SparseVector featureVector; |
|
WordVec text; |
|
std::vector<FeatureStatsType> bleuStats; |
|
}; |
|
|
|
/**
 * Search the hypergraph for the best hypothesis and store it in *bestHypo.
 * Only the declaration lives here; see the implementation file for the exact
 * search and scoring behaviour.
 *
 * @param graph          hypergraph to search
 * @param weights        feature weights
 * @param bleuWeight     weight given to the BLEU component (presumably 0
 *                       disables it -- confirm in the definition)
 * @param references     reference data used for BLEU
 * @param sentenceId     id of the sentence being decoded
 * @param backgroundBleu background BLEU statistics
 * @param bestHypo       output: receives the best hypothesis (presumably
 *                       must be non-null -- confirm in the definition)
 */
void Viterbi(
  const Graph& graph,
  const SparseVector& weights,
  float bleuWeight,
  const ReferenceSet& references,
  size_t sentenceId,
  const std::vector<FeatureStatsType>& backgroundBleu,
  HgHypothesis* bestHypo);
|
|
|
}; |
|
|
|
#endif |
|
|