| | #include "CderScorer.h" |
| |
|
| | #include <algorithm> |
| | #include <fstream> |
| | #include <stdexcept> |
| |
|
| | using namespace std; |
| |
|
| | namespace |
| | { |
| |
|
| | inline int CalcDistance(int word1, int word2) |
| | { |
| | return word1 == word2 ? 0 : 1; |
| | } |
| |
|
| | } |
| |
|
| | namespace MosesTuning |
| | { |
| |
|
| |
|
| | CderScorer::CderScorer(const string& config, bool allowed_long_jumps) |
| | : StatisticsBasedScorer(allowed_long_jumps ? "CDER" : "WER", config), |
| | m_allowed_long_jumps(allowed_long_jumps) {} |
| |
|
| | CderScorer::~CderScorer() {} |
| |
|
| | void CderScorer::setReferenceFiles(const vector<string>& referenceFiles) |
| | { |
| | |
| | m_ref_sentences.clear(); |
| |
|
| | |
| | for (size_t rid = 0; rid < referenceFiles.size(); ++rid) { |
| | ifstream refin(referenceFiles[rid].c_str()); |
| | if (!refin) { |
| | throw runtime_error("Unable to open: " + referenceFiles[rid]); |
| | } |
| | m_ref_sentences.push_back(vector<sent_t>()); |
| | string line; |
| | while (getline(refin,line)) { |
| | line = this->preprocessSentence(line); |
| | sent_t encoded; |
| | TokenizeAndEncode(line, encoded); |
| | m_ref_sentences[rid].push_back(encoded); |
| | } |
| | } |
| | } |
| |
|
| | void CderScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry) |
| | { |
| | string sentence = this->preprocessSentence(text); |
| |
|
| | vector<ScoreStatsType> stats; |
| | prepareStatsVector(sid, sentence, stats); |
| | entry.set(stats); |
| | } |
| |
|
| | void CderScorer::prepareStatsVector(size_t sid, const string& text, vector<ScoreStatsType>& stats) |
| | { |
| | sent_t cand; |
| | TokenizeAndEncode(text, cand); |
| |
|
| | float max = -2; |
| | vector<ScoreStatsType> tmp; |
| | for (size_t rid = 0; rid < m_ref_sentences.size(); ++rid) { |
| | const sent_t& ref = m_ref_sentences[rid][sid]; |
| | tmp.clear(); |
| | computeCD(cand, ref, tmp); |
| | int score = calculateScore(tmp); |
| | if (rid == 0) { |
| | stats = tmp; |
| | max = score; |
| | } else if (score > max) { |
| | stats = tmp; |
| | max = score; |
| | } |
| | } |
| | } |
| |
|
| | float CderScorer::calculateScore(const vector<ScoreStatsType>& comps) const |
| | { |
| | if (comps.size() != 2) { |
| | throw runtime_error("Size of stat vector for CDER is not 2"); |
| | } |
| | if (comps[1] == 0) return 1.0f; |
| | return 1.0f - (comps[0] / static_cast<float>(comps[1])); |
| | } |
| |
|
| | void CderScorer::computeCD(const sent_t& cand, const sent_t& ref, |
| | vector<ScoreStatsType>& stats) const |
| | { |
| | int I = cand.size() + 1; |
| | int L = ref.size() + 1; |
| |
|
| | int l = 0; |
| | |
| | vector<int>* row = new vector<int>(I); |
| |
|
| | |
| | for (int i = 0; i < I; ++i) (*row)[i] = i; |
| |
|
| | |
| | if (m_allowed_long_jumps) { |
| | for (int i = 1; i < I; ++i) (*row)[i] = 1; |
| | } |
| |
|
| | |
| | while (++l < L) { |
| | vector<int>* nextRow = new vector<int>(I); |
| | for (int i = 0; i < I; ++i) { |
| | vector<int> possibleCosts; |
| | if (i > 0) { |
| | possibleCosts.push_back((*nextRow)[i-1] + 1); |
| | possibleCosts.push_back((*row)[i-1] + CalcDistance(ref[l-1], cand[i-1])); |
| | } |
| | possibleCosts.push_back((*row)[i] + 1); |
| | (*nextRow)[i] = *min_element(possibleCosts.begin(), possibleCosts.end()); |
| | } |
| |
|
| | if (m_allowed_long_jumps) { |
| | |
| | int LJ = 1 + *min_element(nextRow->begin(), nextRow->end()); |
| |
|
| | for (int i = 0; i < I; ++i) { |
| | (*nextRow)[i] = min((*nextRow)[i], LJ); |
| | } |
| | } |
| |
|
| | delete row; |
| | row = nextRow; |
| | } |
| |
|
| | stats.resize(2); |
| | stats[0] = *(row->rbegin()); |
| | stats[1] = ref.size(); |
| |
|
| | delete row; |
| | } |
| |
|
| | } |
| |
|