| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | #include <sstream> |
| | #include "moses/Util.h" |
| | #include "AlignedSentence.h" |
| | #include "Parameter.h" |
| |
|
| | using namespace std; |
| |
|
| |
|
| | |
| | AlignedSentence::AlignedSentence(int lineNum, |
| | const std::string &source, |
| | const std::string &target, |
| | const std::string &alignment) |
| | :m_lineNum(lineNum) |
| | { |
| | PopulateWordVec(m_source, source); |
| | PopulateWordVec(m_target, target); |
| | PopulateAlignment(alignment); |
| | } |
| |
|
| | AlignedSentence::~AlignedSentence() |
| | { |
| | Moses::RemoveAllInColl(m_source); |
| | Moses::RemoveAllInColl(m_target); |
| | } |
| |
|
| | void AlignedSentence::PopulateWordVec(Phrase &vec, const std::string &line) |
| | { |
| | std::vector<string> toks; |
| | Moses::Tokenize(toks, line); |
| |
|
| | vec.resize(toks.size()); |
| | for (size_t i = 0; i < vec.size(); ++i) { |
| | const string &tok = toks[i]; |
| | Word *word = new Word(i, tok); |
| | vec[i] = word; |
| | } |
| | } |
| |
|
| | void AlignedSentence::PopulateAlignment(const std::string &line) |
| | { |
| | vector<string> alignStr; |
| | Moses::Tokenize(alignStr, line); |
| |
|
| | for (size_t i = 0; i < alignStr.size(); ++i) { |
| | vector<int> alignPair; |
| | Moses::Tokenize(alignPair, alignStr[i], "-"); |
| | assert(alignPair.size() == 2); |
| |
|
| | int sourcePos = alignPair[0]; |
| | int targetPos = alignPair[1]; |
| |
|
| | if (sourcePos >= m_source.size()) { |
| | cerr << "ERROR1:AlignedSentence=" << Debug() << endl; |
| | cerr << "m_source=" << m_source.size() << endl; |
| | abort(); |
| | } |
| | assert(sourcePos < m_source.size()); |
| | assert(targetPos < m_target.size()); |
| | Word *sourceWord = m_source[sourcePos]; |
| | Word *targetWord = m_target[targetPos]; |
| |
|
| | sourceWord->AddAlignment(targetWord); |
| | targetWord->AddAlignment(sourceWord); |
| | } |
| | } |
| |
|
| | std::string AlignedSentence::Debug() const |
| | { |
| | stringstream out; |
| | out << "m_lineNum:"; |
| | out << m_lineNum; |
| | out << endl; |
| |
|
| | out << "m_source:"; |
| | out << m_source.Debug(); |
| | out << endl; |
| |
|
| | out << "m_target:"; |
| | out << m_target.Debug(); |
| | out << endl; |
| |
|
| | out << "consistent phrases:" << endl; |
| | out << m_consistentPhrases.Debug(); |
| | out << endl; |
| |
|
| | return out.str(); |
| | } |
| |
|
| | std::vector<int> AlignedSentence::GetSourceAlignmentCount() const |
| | { |
| | vector<int> ret(m_source.size()); |
| |
|
| | for (size_t i = 0; i < m_source.size(); ++i) { |
| | const Word &word = *m_source[i]; |
| | ret[i] = word.GetAlignmentIndex().size(); |
| | } |
| | return ret; |
| | } |
| |
|
| | void AlignedSentence::Create(const Parameter ¶ms) |
| | { |
| | CreateConsistentPhrases(params); |
| | m_consistentPhrases.AddHieroNonTerms(params); |
| | } |
| |
|
| | void AlignedSentence::CreateConsistentPhrases(const Parameter ¶ms) |
| | { |
| | int countT = m_target.size(); |
| | int countS = m_source.size(); |
| |
|
| | m_consistentPhrases.Initialize(countS); |
| |
|
| | |
| | for(int lengthT=1; |
| | lengthT <= params.maxSpan && lengthT <= countT; |
| | lengthT++) { |
| | for(int startT=0; startT < countT-(lengthT-1); startT++) { |
| |
|
| | |
| | int endT = startT + lengthT - 1; |
| |
|
| | |
| | |
| | int minS = 9999; |
| | int maxS = -1; |
| | vector< int > usedS = GetSourceAlignmentCount(); |
| | for(int ti=startT; ti<=endT; ti++) { |
| | const Word &word = *m_target[ti]; |
| | const std::set<int> &alignment = word.GetAlignmentIndex(); |
| |
|
| | std::set<int>::const_iterator iterAlign; |
| | for(iterAlign = alignment.begin(); iterAlign != alignment.end(); ++iterAlign) { |
| | int si = *iterAlign; |
| | if (si<minS) { |
| | minS = si; |
| | } |
| | if (si>maxS) { |
| | maxS = si; |
| | } |
| | usedS[ si ]--; |
| | } |
| | } |
| |
|
| | |
| | if( maxS == -1 ) |
| | continue; |
| |
|
| | |
| | size_t width = maxS - minS + 1; |
| |
|
| | if( width < params.minSpan ) |
| | continue; |
| |
|
| | if( width > params.maxSpan ) |
| | continue; |
| |
|
| | |
| | bool out_of_bounds = false; |
| | for(int si=minS; si<=maxS && !out_of_bounds; si++) |
| | if (usedS[si]>0) { |
| | out_of_bounds = true; |
| | } |
| |
|
| | |
| | if (out_of_bounds) |
| | continue; |
| |
|
| | |
| | |
| | for(int startS=minS; |
| | (startS>=0 && |
| | startS>maxS - params.maxSpan && |
| | (startS==minS || m_source[startS]->GetAlignment().size()==0)); |
| | startS--) { |
| | |
| | for(int endS=maxS; |
| | (endS<countS && endS<startS + params.maxSpan && |
| | (endS==maxS || m_source[endS]->GetAlignment().size()==0)); |
| | endS++) { |
| |
|
| | |
| | m_consistentPhrases.Add(startS, endS, startT, endT, params); |
| | } |
| | } |
| | } |
| | } |
| | } |
| |
|