| | #include "dsgHyp.h" |
| | #include <sstream> |
| | #include <boost/algorithm/string.hpp> |
| | #include <algorithm> |
| | #include <cstdlib> |
| | #include <math.h> |
| | #include <map> |
| |
|
| |
|
| | using namespace std; |
| | using namespace lm::ngram; |
| |
|
| | namespace Moses |
| | { |
| | dsgState::dsgState(const State & val) |
| | { |
| | lmState = val; |
| | } |
| |
|
| | void dsgState::saveState( std::vector<std::string> danglingTok, std::vector<int> srcSpans,float deltaValue) |
| | { |
| | buffer = danglingTok; |
| | span=srcSpans; |
| | delta=deltaValue; |
| | } |
| |
|
| |
|
| | size_t dsgState::hash() const |
| | { |
| |
|
| | size_t ret = 0; |
| | boost::hash_combine(ret, lmState); |
| |
|
| | |
| | |
| | |
| | |
| | |
| | } |
| |
|
| | bool dsgState::operator==(const FFState& otherBase) const |
| | { |
| | const dsgState &other = static_cast<const dsgState&>(otherBase); |
| |
|
| | if (lmState < other.lmState) return false; |
| | if (lmState == other.lmState) return true; |
| | return false; |
| | } |
| |
|
| | |
| |
|
| | std::string dsgState :: getName() const |
| | { |
| | return "done"; |
| | } |
| |
|
| | dsgHypothesis :: dsgHypothesis() |
| | { |
| | lmProb = 0; |
| | discontig0 = 0; |
| | discontig1 = 0; |
| | discontig2 = 0; |
| | UnsegWP = 0; |
| | m_buffer.clear(); |
| | } |
| |
|
| | void dsgHypothesis :: setState(const FFState* prev_state) |
| | { |
| | if(prev_state != NULL) { |
| | m_buffer = static_cast <const dsgState *> (prev_state)->getBuffer(); |
| | m_span = static_cast <const dsgState *> (prev_state)->getSpan(); |
| | lmState = static_cast <const dsgState *> (prev_state)->getLMState(); |
| | delta = static_cast <const dsgState *> (prev_state)->getDelta(); |
| | } |
| | } |
| |
|
| | dsgState * dsgHypothesis :: saveState() |
| | { |
| | dsgState * statePtr = new dsgState(lmState); |
| | statePtr->saveState(m_buffer, m_span, delta); |
| | return statePtr; |
| | } |
| |
|
| | void dsgHypothesis :: populateScores(vector <float> & scores , const int numFeatures) |
| | { |
| | scores.clear(); |
| | scores.push_back(lmProb); |
| |
|
| | if (numFeatures == 1) |
| | return; |
| | scores.push_back(discontig0); |
| | scores.push_back(discontig1); |
| | scores.push_back(discontig2); |
| | scores.push_back(UnsegWP); |
| | } |
| |
|
| |
|
| |
|
| | bool dsgHypothesis::isPrefix(const std::string &tok) |
| | { |
| | if ((tok.at(tok.size() - 1) == '+' )&& (tok != "+")) { |
| | return true; |
| | } else { |
| | return false; |
| | }; |
| | } |
| |
|
| | bool dsgHypothesis::isSuffix(const std::string &tok) |
| | { |
| | if ((tok.at(0) == '+' )&& (tok != "+")) { |
| | return true; |
| | } else { |
| | return false; |
| | }; |
| | } |
| |
|
| | bool dsgHypothesis::isStem(const std::string &tok) |
| | { |
| | if ((tok.at(0) != '+') && (tok.at(tok.size() - 1) != '+')) { |
| | return true; |
| | } else { |
| | return false; |
| | }; |
| | } |
| |
|
| |
|
| |
|
| | |
| | |
| | |
| | |
| | |
| | bool dsgHypothesis::isValidChain(const std::string &tok, std::vector<std::string> &chain) |
| | { |
| | std::string last_tok; |
| | if (chain.size() >= 1) { |
| | last_tok = chain[chain.size() - 1]; |
| | } else { |
| | last_tok = "NULL"; |
| | } |
| | if(tok=="+") { |
| | return false; |
| | } |
| | if (isPrefix(tok) && (chain.size() == 0 || isPrefix(last_tok))) { |
| | return true; |
| | } else if (isSuffix(tok) && (chain.size() != 0 && ( isStem(last_tok) || isPrefix(last_tok)))) { |
| | return true; |
| | } |
| | |
| | else if (isStem(tok) && (chain.size() == 0 || isPrefix(last_tok))) { |
| | return true; |
| | } else { |
| | return false; |
| | } |
| | } |
| |
|
| | |
| | |
| | |
| | vector<string> dsgHypothesis::grouper(std::vector<std::string> &phr_vec,vector<vector<int> > &allchain_ids, int sourceOffset,const AlignmentInfo &align, bool isolation) |
| | { |
| |
|
| | std::vector<std::string> chain; |
| | std::vector<int> chain_ids; |
| | std::vector<std::string> allchains; |
| | chain_ids=m_span; |
| |
|
| | if (!m_buffer.empty() && !isolation) { |
| | for (int i = 0; i < m_buffer.size(); i++) { |
| | chain.push_back(m_buffer[i]); |
| | } |
| | } |
| |
|
| | for (int i = 0; i < phr_vec.size(); i++) { |
| | std::set<std::size_t> sourcePosSet = align.GetAlignmentsForTarget(i); |
| |
|
| | if (isValidChain(phr_vec[i], chain)) { |
| | chain.push_back(phr_vec[i]); |
| | if (sourcePosSet.empty()==false) { |
| | for (std::set<size_t>::iterator it(sourcePosSet.begin()); it != sourcePosSet.end(); it++) { |
| | int cur=*it; |
| | chain_ids.push_back(cur+sourceOffset); |
| | } |
| | } |
| | } |
| |
|
| | else if (chain.size() == 0) { |
| | allchains.push_back(phr_vec[i]); |
| | allchain_ids.push_back(chain_ids); |
| | chain_ids.clear(); |
| | } |
| |
|
| | else { |
| | std::string joined = boost::algorithm::join(chain, " "); |
| | allchains.push_back(joined); |
| | allchain_ids.push_back(chain_ids); |
| |
|
| | chain.clear(); |
| | chain_ids.clear(); |
| |
|
| | chain.push_back(phr_vec[i]); |
| | if (sourcePosSet.empty()==false) { |
| | for (std::set<size_t>::iterator it(sourcePosSet.begin()); it != sourcePosSet.end(); it++) { |
| | int cur=*it; |
| | chain_ids.push_back(cur+sourceOffset); |
| | } |
| | } |
| |
|
| | } |
| |
|
| | } |
| |
|
| | if (!chain.empty()) { |
| | std::string joined = boost::algorithm::join(chain, " "); |
| | allchains.push_back(joined); |
| | allchain_ids.push_back(chain_ids); |
| | } |
| | return allchains; |
| | } |
| |
|
| |
|
| |
|
| | void dsgHypothesis :: calculateDsgProbinIsol(DsgLM & ptrDsgLM, Desegmenter &desegT, const AlignmentInfo &align ) |
| | { |
| | lmProb = 0; |
| | State currState = lmState; |
| | State temp; |
| | string desegmented=""; |
| | vector <string> words; |
| | vector <string> currFVec; |
| |
|
| | discontig0=0; |
| | discontig1=0; |
| | discontig2=0; |
| | UnsegWP=0; |
| |
|
| | currFVec = m_buffer; |
| | currFVec.insert( currFVec.end(), m_curr_phr.begin(), m_curr_phr.end() ); |
| |
|
| | int vecSize=currFVec.size(); |
| |
|
| | |
| | if (currFVec.size()>0 && isPrefix (currFVec.back())) { |
| | UnsegWP-=0.5; |
| | } |
| | if (currFVec.size()>0 && isSuffix (currFVec.front())) { |
| | UnsegWP-=0.5; |
| | } |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | vector<vector<int> > chain_ids; |
| | words = grouper(currFVec,chain_ids,0,align,1); |
| |
|
| | for (int i = 0; i<words.size(); i++) { |
| | UnsegWP+=1; |
| | temp = currState; |
| | if (words[i].find(" ")!=std::string::npos) { |
| | desegmented=desegT.Search(words[i])[0]; |
| | lmProb += ptrDsgLM.Score(temp,desegmented,currState); |
| | } else { |
| | boost::replace_all(words[i], "-LRB-", "("); |
| | boost::replace_all(words[i], "-RRB-", ")"); |
| | lmProb += ptrDsgLM.Score(temp,words[i],currState); |
| | } |
| | } |
| | lmState = currState; |
| | } |
| |
|
| | void dsgHypothesis :: calculateDsgProb(DsgLM& ptrDsgLM, Desegmenter &desegT, bool isCompleted , const AlignmentInfo &align, int sourceOffset, bool optimistic) |
| | { |
| | lmProb = 0; |
| | discontig0=0; |
| | discontig1=0; |
| | discontig2=0; |
| | UnsegWP=0; |
| |
|
| | State currState = lmState; |
| | State temp; |
| | string desegmented=""; |
| | vector <string> words; |
| | vector <string> currFVec; |
| | bool completePhraseSuffixEnd = false; |
| | vector<vector<int> > all_chain_ids; |
| | double pscore; |
| | currFVec=m_curr_phr; |
| |
|
| | |
| | if (isSuffix (currFVec.back()) && (currFVec.back()!="+")) { |
| | completePhraseSuffixEnd=true; |
| | } |
| |
|
| | words = grouper(currFVec,all_chain_ids,sourceOffset,align,0); |
| |
|
| | for (int i = 0; i < words.size(); i++) { |
| | temp = currState; |
| |
|
| | if (i==words.size()-1) { |
| | if (completePhraseSuffixEnd) { |
| | m_buffer.clear(); |
| | m_span.clear(); |
| | } else if (!isCompleted) { |
| | m_buffer.clear(); |
| | if (optimistic == 1) { |
| | if ( isPrefix (currFVec.back())) { |
| | |
| | lmProb -= delta; |
| | delta = 0.0; |
| | } |
| |
|
| | else if (words[i].find(" ")!=std::string::npos) { |
| | desegmented=desegT.Search(words[i])[0]; |
| | pscore=ptrDsgLM.Score(temp,desegmented,currState); |
| | lmProb = lmProb + pscore - delta; |
| | delta=pscore; |
| | currState=temp; |
| | } else { |
| | boost::replace_all(words[i], "-LRB-", "("); |
| | boost::replace_all(words[i], "-RRB-", ")"); |
| | pscore=ptrDsgLM.Score(temp,words[i],currState); |
| | lmProb = lmProb + pscore - delta; |
| | delta=pscore; |
| | currState=temp; |
| | } |
| | } |
| |
|
| | m_buffer.push_back(words.back()); |
| | m_span=all_chain_ids.back(); |
| | break; |
| | } |
| | } |
| |
|
| | |
| | if (words[i].find(" ")!=std::string::npos) { |
| | UnsegWP+=1; |
| | desegmented=desegT.Search(words[i])[0]; |
| | std::set<int> cur_chain_ids(all_chain_ids[i].begin(),all_chain_ids[i].end()); |
| | if (cur_chain_ids.size()>1) { |
| | vector<int> dsc; |
| | for (std::set<int>::iterator it(cur_chain_ids.begin()), next(it); it != cur_chain_ids.end() && ++next != cur_chain_ids.end(); it = next) { |
| | int cur=*it; |
| | int mynext=*next; |
| | if (std::abs(cur - mynext)>= 3) { |
| | dsc.push_back(3); |
| | } else if (std::abs(cur - mynext)== 2) { |
| | dsc.push_back(2); |
| | } else if (std::abs(cur - mynext)<= 1) { |
| | dsc.push_back(1); |
| | } |
| | } |
| | int mymax=*std::max_element(dsc.begin(),dsc.end()); |
| | if (mymax==3) { |
| | discontig2+=1; |
| | } else if (mymax==2) { |
| | discontig1+=1; |
| | } else { |
| | discontig0+=1; |
| | } |
| | } else { |
| | discontig0 += 1; |
| | } |
| |
|
| | lmProb += ptrDsgLM.Score(temp,desegmented,currState); |
| | } else { |
| | UnsegWP+=1; |
| | boost::replace_all(words[i], "-LRB-", "("); |
| | boost::replace_all(words[i], "-RRB-", ")"); |
| | lmProb += ptrDsgLM.Score(temp,words[i],currState); |
| | } |
| | } |
| |
|
| | if (isCompleted) { |
| | temp = currState; |
| | lmProb = lmProb + ptrDsgLM.ScoreEndSentence(temp,currState) - delta; |
| | } |
| | lmState = currState; |
| | } |
| |
|
| |
|
| | void dsgHypothesis :: print() |
| | {} |
| |
|
| |
|
| | } |
| |
|