| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| #include <iostream> |
| #include <iomanip> |
| #include <sstream> |
|
|
| #include "phrase-extract.h" |
| #include "ISS.h" |
| |
| #include "SafeGetline.h" |
|
|
|
|
| #define LINE_MAX_LENGTH 60000 |
|
|
|
|
| |
|
|
| |
// One output record: a phrase pair together with the frequency the lossy
// counter reports for it (see processSortedOutput, which builds these from
// phraseIter.item() and phraseIter.frequency()).
| typedef std::pair<indexed_phrases_pair_t, PhrasePairsLossyCounter::frequency_t> output_pair_t; |
// Buffer of output records; processSortedOutput fills and sorts one of these.
| typedef std::vector<output_pair_t> output_vector_t; |
|
|
| class PhraseComp { |
| |
| bool _inverted; |
|
|
| bool compareAlignments(const indexed_phrases_pair_t& a, const indexed_phrases_pair_t& b); |
|
|
| int comparePhrases(const indexed_phrases_pair_t::phrase_t& a, const indexed_phrases_pair_t::phrase_t& b); |
|
|
| public: |
| PhraseComp(bool inverted): _inverted(inverted) {} |
|
|
| bool operator()(const output_pair_t& a, const output_pair_t& b); |
| }; |
|
|
// Drains all lossy counters, sorts the collected phrase pairs and passes them
// to `processor` twice: in direct order (mode 1) and inverted order (mode -1).
| void processSortedOutput(OutputProcessor& processor); |
|
|
// Passes the content of all lossy counters to `processor` in iteration order
// (mode 0), without sorting.
| void processUnsortedOutput(OutputProcessor& processor); |
|
|
// Converts one indexed phrase pair back to text and hands it to `processor`
// together with its frequency and the given mode.
| void flushPhrasePair(OutputProcessor& processor, const indexed_phrases_pair_t& indexedPhrasePair, PhrasePairsLossyCounter::frequency_t frequency, int mode); |
|
|
|
|
| |
// Not read anywhere in the visible part of this file.
// NOTE(review): presumably set by option parsing elsewhere - confirm.
| bool allModelsOutputFlag = false; |
|
|
// Per-model switches and model types for the three reordering models
// (word-based, phrase-based, hierarchical). extract() reads them to decide
// which orientations to compute.
| bool wordModel = false; |
| REO_MODEL_TYPE wordType = REO_MSD; |
| bool phraseModel = false; |
| REO_MODEL_TYPE phraseType = REO_MSD; |
| bool hierModel = false; |
| REO_MODEL_TYPE hierType = REO_MSD; |
|
|
// maxPhraseLength bounds the phrase spans enumerated in extract().
// orientationFlag enables computation of orientation info in extract().
// sortedOutput selects processSortedOutput over processUnsortedOutput.
// translationFlag is not read in the visible part of this file.
| int maxPhraseLength = 0; |
| bool translationFlag = true; |
| bool orientationFlag = false; |
| bool sortedOutput = false; |
|
|
// Lossy counters, indexed in addPhrase() by max(source length, target length)
// of the phrase pair. Consumers skip index 0 and treat equal neighbouring
// entries as one shared counter.
| LossyCountersVector lossyCounters; |
|
|
// In GET_COUNTS_ONLY builds addPhrase() only increments these per-length
// counters instead of storing phrase pairs.
| #ifdef GET_COUNTS_ONLY |
| std::vector<size_t> phrasePairsCounters; |
| #endif |
|
|
|
|
| |
|
|
// Storage mapping word strings to indices (put) and back (get); addPhrase()
// stores source and target words here.
| IndexedStringsStorage<word_index_t> strings; |
// Storage mapping orientation-info strings to indices (put) and back (get).
| IndexedStringsStorage<orientation_info_index_t> orientations; |
|
|
|
|
| |
|
|
| REO_POS getOrientWordModel(SentenceAlignment & sentence, REO_MODEL_TYPE modelType, |
| bool connectedLeftTop, bool connectedRightTop, |
| int startF, int endF, int startE, int endE, int countF, int zero, int unit, |
| bool (*ge)(int, int), bool (*lt)(int, int) ) |
| { |
|
|
| if( connectedLeftTop && !connectedRightTop) |
| return LEFT; |
| if(modelType == REO_MONO) |
| return UNKNOWN; |
| if (!connectedLeftTop && connectedRightTop) |
| return RIGHT; |
| if(modelType == REO_MSD) |
| return UNKNOWN; |
| for(int indexF=startF-2*unit; (*ge)(indexF, zero) && !connectedLeftTop; indexF=indexF-unit) |
| connectedLeftTop = isAligned(sentence, indexF, startE-unit); |
| for(int indexF=endF+2*unit; (*lt)(indexF,countF) && !connectedRightTop; indexF=indexF+unit) |
| connectedRightTop = isAligned(sentence, indexF, startE-unit); |
| if(connectedLeftTop && !connectedRightTop) |
| return DRIGHT; |
| else if(!connectedLeftTop && connectedRightTop) |
| return DLEFT; |
| return UNKNOWN; |
| } |
|
|
| |
| REO_POS getOrientPhraseModel (SentenceAlignment & sentence, REO_MODEL_TYPE modelType, |
| bool connectedLeftTop, bool connectedRightTop, |
| int startF, int endF, int startE, int endE, int countF, int zero, int unit, |
| bool (*ge)(int, int), bool (*lt)(int, int), |
| const HSentenceVertices & inBottomRight, const HSentenceVertices & inBottomLeft) |
| { |
|
|
| HSentenceVertices::const_iterator it; |
|
|
| if((connectedLeftTop && !connectedRightTop) || |
| |
| |
| ((it = inBottomRight.find(startE - unit)) != inBottomRight.end() && |
| it->second.find(startF-unit) != it->second.end())) |
| return LEFT; |
| if(modelType == REO_MONO) |
| return UNKNOWN; |
| if((!connectedLeftTop && connectedRightTop) || |
| ((it = inBottomLeft.find(startE - unit)) != inBottomLeft.end() && it->second.find(endF + unit) != it->second.end())) |
| return RIGHT; |
| if(modelType == REO_MSD) |
| return UNKNOWN; |
| connectedLeftTop = false; |
| for(int indexF=startF-2*unit; (*ge)(indexF, zero) && !connectedLeftTop; indexF=indexF-unit) |
| if(connectedLeftTop = (it = inBottomRight.find(startE - unit)) != inBottomRight.end() && |
| it->second.find(indexF) != it->second.end()) |
| return DRIGHT; |
| connectedRightTop = false; |
| for(int indexF=endF+2*unit; (*lt)(indexF, countF) && !connectedRightTop; indexF=indexF+unit) |
| if(connectedRightTop = (it = inBottomLeft.find(startE - unit)) != inBottomRight.end() && |
| it->second.find(indexF) != it->second.end()) |
| return DLEFT; |
| return UNKNOWN; |
| } |
|
|
| |
| REO_POS getOrientHierModel (SentenceAlignment & sentence, REO_MODEL_TYPE modelType, |
| bool connectedLeftTop, bool connectedRightTop, |
| int startF, int endF, int startE, int endE, int countF, int zero, int unit, |
| bool (*ge)(int, int), bool (*lt)(int, int), |
| const HSentenceVertices & inBottomRight, const HSentenceVertices & inBottomLeft, |
| const HSentenceVertices & outBottomRight, const HSentenceVertices & outBottomLeft, |
| REO_POS phraseOrient) |
| { |
|
|
| HSentenceVertices::const_iterator it; |
|
|
| if(phraseOrient == LEFT || |
| (connectedLeftTop && !connectedRightTop) || |
| |
| |
| ((it = inBottomRight.find(startE - unit)) != inBottomRight.end() && |
| it->second.find(startF-unit) != it->second.end()) || |
| ((it = outBottomRight.find(startE - unit)) != outBottomRight.end() && |
| it->second.find(startF-unit) != it->second.end())) |
| return LEFT; |
| if(modelType == REO_MONO) |
| return UNKNOWN; |
| if(phraseOrient == RIGHT || |
| (!connectedLeftTop && connectedRightTop) || |
| ((it = inBottomLeft.find(startE - unit)) != inBottomLeft.end() && |
| it->second.find(endF + unit) != it->second.end()) || |
| ((it = outBottomLeft.find(startE - unit)) != outBottomLeft.end() && |
| it->second.find(endF + unit) != it->second.end())) |
| return RIGHT; |
| if(modelType == REO_MSD) |
| return UNKNOWN; |
| if(phraseOrient != UNKNOWN) |
| return phraseOrient; |
| connectedLeftTop = false; |
| for(int indexF=startF-2*unit; (*ge)(indexF, zero) && !connectedLeftTop; indexF=indexF-unit) { |
| if((connectedLeftTop = (it = inBottomRight.find(startE - unit)) != inBottomRight.end() && |
| it->second.find(indexF) != it->second.end()) || |
| (connectedLeftTop = (it = outBottomRight.find(startE - unit)) != outBottomRight.end() && |
| it->second.find(indexF) != it->second.end())) |
| return DRIGHT; |
| } |
| connectedRightTop = false; |
| for(int indexF=endF+2*unit; (*lt)(indexF, countF) && !connectedRightTop; indexF=indexF+unit) { |
| if((connectedRightTop = (it = inBottomLeft.find(startE - unit)) != inBottomRight.end() && |
| it->second.find(indexF) != it->second.end()) || |
| (connectedRightTop = (it = outBottomLeft.find(startE - unit)) != outBottomRight.end() && |
| it->second.find(indexF) != it->second.end())) |
| return DLEFT; |
| } |
| return UNKNOWN; |
| } |
|
|
| void insertVertex( HSentenceVertices & corners, int x, int y ) |
| { |
| std::set<int> tmp; |
| tmp.insert(x); |
| std::pair< HSentenceVertices::iterator, bool > ret = corners.insert( std::pair<int, std::set<int> > (y, tmp) ); |
| if(ret.second == false) { |
| ret.first->second.insert(x); |
| } |
| } |
|
|
| void insertPhraseVertices( |
| HSentenceVertices & topLeft, |
| HSentenceVertices & topRight, |
| HSentenceVertices & bottomLeft, |
| HSentenceVertices & bottomRight, |
| int startF, int startE, int endF, int endE) |
| { |
|
|
| insertVertex(topLeft, startF, startE); |
| insertVertex(topRight, endF, startE); |
| insertVertex(bottomLeft, startF, endE); |
| insertVertex(bottomRight, endF, endE); |
| } |
|
|
| std::string getOrientString(REO_POS orient, REO_MODEL_TYPE modelType) |
| { |
| switch(orient) { |
| case LEFT: |
| return "mono"; |
| break; |
| case RIGHT: |
| return "swap"; |
| break; |
| case DRIGHT: |
| return "dright"; |
| break; |
| case DLEFT: |
| return "dleft"; |
| break; |
| case UNKNOWN: |
| switch(modelType) { |
| case REO_MONO: |
| return "nomono"; |
| break; |
| case REO_MSD: |
| return "other"; |
| break; |
| case REO_MSLR: |
| return "dright"; |
| break; |
| } |
| break; |
| } |
| } |
|
|
/// Returns true when `first` is greater than or equal to `second`.
bool ge(int first, int second)
{
  return !(first < second);
}
|
|
/// Returns true when `first` is less than or equal to `second`.
bool le(int first, int second)
{
  return !(second < first);
}
|
|
/// Returns true when `first` is strictly less than `second`.
bool lt(int first, int second)
{
  return second > first;
}
|
|
| bool isAligned ( SentenceAlignment &sentence, int fi, int ei ) |
| { |
| if (ei == -1 && fi == -1) |
| return true; |
| if (ei <= -1 || fi <= -1) |
| return false; |
| if (ei == sentence.target.size() && fi == sentence.source.size()) |
| return true; |
| if (ei >= sentence.target.size() || fi >= sentence.source.size()) |
| return false; |
| for(int i=0; i<sentence.alignedToT[ei].size(); i++) |
| if (sentence.alignedToT[ei][i] == fi) |
| return true; |
| return false; |
| } |
|
|
| |
|
|
|
|
| |
|
|
| void extract(SentenceAlignment &sentence) { |
|
|
| int countE = sentence.target.size(); |
| int countF = sentence.source.size(); |
|
|
| HPhraseVector inboundPhrases; |
|
|
| HSentenceVertices inTopLeft; |
| HSentenceVertices inTopRight; |
| HSentenceVertices inBottomLeft; |
| HSentenceVertices inBottomRight; |
|
|
| HSentenceVertices outTopLeft; |
| HSentenceVertices outTopRight; |
| HSentenceVertices outBottomLeft; |
| HSentenceVertices outBottomRight; |
|
|
| HSentenceVertices::const_iterator it; |
|
|
| bool relaxLimit = hierModel; |
| bool buildExtraStructure = phraseModel || hierModel; |
|
|
| |
| |
| for (int startE = 0; startE < countE; startE++) { |
| for ( |
| int endE = startE; |
| ((endE < countE) && (relaxLimit || (endE < (startE + maxPhraseLength)))); |
| endE++ |
| ) { |
|
|
| int minF = 9999; |
| int maxF = -1; |
| std::vector< int > usedF = sentence.alignedCountS; |
|
|
| for (int ei = startE; ei <= endE; ei++) { |
| for (int i = 0; i < sentence.alignedToT[ei].size(); i++) { |
| int fi = sentence.alignedToT[ei][i]; |
| if (fi < minF) { |
| minF = fi; |
| } |
| if (fi > maxF) { |
| maxF = fi; |
| } |
| usedF[ fi ]--; |
| } |
| } |
|
|
| if (maxF >= 0 && |
| (relaxLimit || maxF-minF < maxPhraseLength)) { |
|
|
| |
| bool out_of_bounds = false; |
|
|
| for (int fi=minF; fi<=maxF && !out_of_bounds; fi++) { |
| if (usedF[fi]>0) { |
| |
| out_of_bounds = true; |
| } |
| } |
|
|
| |
| if (!out_of_bounds) { |
| |
| for (int startF=minF; |
| (startF>=0 && |
| (relaxLimit || startF>maxF-maxPhraseLength) && |
| (startF==minF || sentence.alignedCountS[startF]==0)); |
| startF-- |
| ) |
| |
| for (int endF=maxF; |
| (endF<countF && |
| (relaxLimit || endF<startF+maxPhraseLength) && |
| (endF==maxF || sentence.alignedCountS[endF]==0)); |
| endF++ |
| ) { |
| if (buildExtraStructure) { |
| if (endE-startE < maxPhraseLength && endF-startF < maxPhraseLength) { |
| inboundPhrases.push_back( |
| HPhrase(HPhraseVertex(startF,startE), HPhraseVertex(endF,endE)) |
| ); |
| insertPhraseVertices( |
| inTopLeft, inTopRight, inBottomLeft, inBottomRight, |
| startF, startE, endF, endE |
| ); |
| } else { |
| insertPhraseVertices( |
| outTopLeft, outTopRight, outBottomLeft, outBottomRight, |
| startF, startE, endF, endE |
| ); |
| } |
| } else { |
| std::string orientationInfo = ""; |
| if (orientationFlag && wordModel) { |
| REO_POS wordPrevOrient, wordNextOrient; |
| bool connectedLeftTopP = isAligned( sentence, startF-1, startE-1 ); |
| bool connectedRightTopP = isAligned( sentence, endF+1, startE-1 ); |
| bool connectedLeftTopN = isAligned( sentence, endF+1, endE+1 ); |
| bool connectedRightTopN = isAligned( sentence, startF-1, endE+1 ); |
| wordPrevOrient = getOrientWordModel(sentence, wordType, connectedLeftTopP, connectedRightTopP, startF, endF, startE, endE, countF, 0, 1, &ge, <); |
| wordNextOrient = getOrientWordModel(sentence, wordType, connectedLeftTopN, connectedRightTopN, endF, startF, endE, startE, 0, countF, -1, <, &ge); |
| orientationInfo += getOrientString(wordPrevOrient, wordType) + " " + getOrientString(wordNextOrient, wordType); |
| } |
| addPhrase(sentence, startE, endE, startF, endF, orientationInfo); |
| } |
| } |
| } |
| } |
| } |
| } |
|
|
| if (buildExtraStructure) { |
| std::string orientationInfo = ""; |
| REO_POS wordPrevOrient, wordNextOrient, phrasePrevOrient, phraseNextOrient, hierPrevOrient, hierNextOrient; |
|
|
| for (int i = 0; i < inboundPhrases.size(); i++) { |
| int startF = inboundPhrases[i].first.first; |
| int startE = inboundPhrases[i].first.second; |
| int endF = inboundPhrases[i].second.first; |
| int endE = inboundPhrases[i].second.second; |
|
|
| if ( orientationFlag ) { |
|
|
| bool connectedLeftTopP = isAligned( sentence, startF-1, startE-1 ); |
| bool connectedRightTopP = isAligned( sentence, endF+1, startE-1 ); |
| bool connectedLeftTopN = isAligned( sentence, endF+1, endE+1 ); |
| bool connectedRightTopN = isAligned( sentence, startF-1, endE+1 ); |
|
|
| if (wordModel) { |
| wordPrevOrient = getOrientWordModel(sentence, wordType, |
| connectedLeftTopP, connectedRightTopP, |
| startF, endF, startE, endE, countF, 0, 1, |
| &ge, <); |
|
|
| wordNextOrient = getOrientWordModel(sentence, wordType, |
| connectedLeftTopN, connectedRightTopN, |
| endF, startF, endE, startE, 0, countF, -1, |
| <, &ge); |
| } |
| if (phraseModel) { |
| phrasePrevOrient = getOrientPhraseModel(sentence, phraseType, |
| connectedLeftTopP, connectedRightTopP, |
| startF, endF, startE, endE, countF-1, 0, 1, &ge, <, inBottomRight, inBottomLeft); |
| phraseNextOrient = getOrientPhraseModel(sentence, phraseType, |
| connectedLeftTopN, connectedRightTopN, |
| endF, startF, endE, startE, 0, countF-1, -1, <, &ge, inBottomLeft, inBottomRight); |
| } else { |
| phrasePrevOrient = phraseNextOrient = UNKNOWN; |
| } |
| if(hierModel) { |
| hierPrevOrient = getOrientHierModel(sentence, hierType, |
| connectedLeftTopP, connectedRightTopP, |
| startF, endF, startE, endE, countF-1, 0, 1, &ge, <, inBottomRight, inBottomLeft, outBottomRight, outBottomLeft, phrasePrevOrient); |
| hierNextOrient = getOrientHierModel(sentence, hierType, |
| connectedLeftTopN, connectedRightTopN, |
| endF, startF, endE, startE, 0, countF-1, -1, <, &ge, inBottomLeft, inBottomRight, outBottomLeft, outBottomRight, phraseNextOrient); |
| } |
|
|
| orientationInfo = ((wordModel)? getOrientString(wordPrevOrient, wordType) + " " + getOrientString(wordNextOrient, wordType) : "") + " | " + |
| ((phraseModel)? getOrientString(phrasePrevOrient, phraseType) + " " + getOrientString(phraseNextOrient, phraseType) : "") + " | " + |
| ((hierModel)? getOrientString(hierPrevOrient, hierType) + " " + getOrientString(hierNextOrient, hierType) : ""); |
| } |
|
|
| addPhrase(sentence, startE, endE, startF, endF, orientationInfo); |
|
|
| } |
|
|
| } |
|
|
| } |
|
|
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| void addPhrase(SentenceAlignment &sentence, int startE, int endE, int startF, int endF, std::string &orientationInfo) { |
|
|
| #ifdef GET_COUNTS_ONLY |
| |
| phrasePairsCounters[std::max(endF - startF, endE - startE) + 1] += 1; |
| #else |
| alignment_t alignment; |
|
|
| |
| for (int ei = startE; ei <= endE; ++ei) { |
| for (int i = 0; i < sentence.alignedToT[ei].size(); ++i) { |
| int fi = sentence.alignedToT[ei][i]; |
| alignment.push_back(alignment_t::value_type(fi-startF, ei-startE)); |
| } |
| } |
|
|
| indexed_phrases_pair_t::phrase_t srcPhraseIndices, tgtPhraseIndices; |
|
|
| |
| for (int fi = startF; fi <= endF; ++fi) { |
| srcPhraseIndices.push_back(strings.put(sentence.source[fi].c_str())); |
| } |
|
|
| |
| for (int ei = startE; ei <= endE; ++ei) { |
| tgtPhraseIndices.push_back(strings.put(sentence.target[ei].c_str())); |
| } |
|
|
| |
| size_t idx = std::max(srcPhraseIndices.size(), tgtPhraseIndices.size()); |
|
|
| |
| lossyCounters[idx]->lossyCounter.add(indexed_phrases_pair_t(srcPhraseIndices, tgtPhraseIndices, orientations.put(orientationInfo.c_str()), alignment)); |
| |
| if ( lossyCounters[idx]->lossyCounter.aboutToPrune() ) { |
| |
| std::cerr << 'P' << idx << std::flush; |
| } |
| #endif |
| } |
|
|
|
|
| |
|
|
| void readInput(std::istream& eFile, std::istream& fFile, std::istream& aFile) { |
|
|
| |
| char englishString[LINE_MAX_LENGTH]; |
| char foreignString[LINE_MAX_LENGTH]; |
| char alignmentString[LINE_MAX_LENGTH]; |
|
|
| int i = 0; |
|
|
| while(true) { |
| |
| if (++i%10000 == 0) std::cerr << "." << std::flush; |
|
|
| SAFE_GETLINE(eFile, englishString, LINE_MAX_LENGTH, '\n', __FILE__); |
| if (eFile.eof()) break; |
| SAFE_GETLINE(fFile, foreignString, LINE_MAX_LENGTH, '\n', __FILE__); |
| SAFE_GETLINE(aFile, alignmentString, LINE_MAX_LENGTH, '\n', __FILE__); |
|
|
| SentenceAlignment sentence; |
|
|
| if (sentence.create(englishString, foreignString, alignmentString, i)) { |
| extract(sentence); |
| } |
| } |
|
|
| } |
|
|
|
|
| void processOutput(OutputProcessor& processor) { |
| if ( sortedOutput ) { |
| processSortedOutput(processor); |
| } |
| else { |
| processUnsortedOutput(processor); |
| } |
| } |
|
|
|
|
| bool PhraseComp::operator()(const output_pair_t& a, const output_pair_t& b) { |
|
|
| int cmp = _inverted ? comparePhrases(a.first.tgtPhrase(), b.first.tgtPhrase()) : comparePhrases(a.first.srcPhrase(), b.first.srcPhrase()); |
|
|
| if ( cmp == 0 ) { |
| |
| cmp = _inverted ? comparePhrases(a.first.srcPhrase(), b.first.srcPhrase()) : comparePhrases(a.first.tgtPhrase(), b.first.tgtPhrase()); |
|
|
| if ( cmp == 0 ) { |
| |
| return compareAlignments(a.first, b.first); |
| } |
| else { |
| return cmp < 0; |
| } |
| } |
| else { |
| return cmp < 0; |
| } |
|
|
| } |
|
|
|
|
| bool PhraseComp::compareAlignments(const indexed_phrases_pair_t& a, const indexed_phrases_pair_t& b) { |
|
|
| size_t aSize = a.alignmentLength(); |
| size_t bSize = b.alignmentLength(); |
| size_t min = std::min(aSize, bSize); |
| const indexed_phrases_pair_t::alignment_point_t * aAlignment = a.alignmentData(); |
| const indexed_phrases_pair_t::alignment_point_t * bAlignment = b.alignmentData(); |
|
|
| int cmp = 0; |
| for ( size_t i = 0; i < min; ++i ) { |
| |
| if ( _inverted ) { |
| |
| cmp = memcmp(aAlignment + i*2 + 1, bAlignment + i*2 + 1, sizeof(indexed_phrases_pair_t::alignment_point_t)); |
| } |
| else{ |
| |
| cmp = memcmp(aAlignment+ i*2, bAlignment + i*2, sizeof(indexed_phrases_pair_t::alignment_point_t)); |
| } |
| if ( cmp == 0 ) { |
| if ( _inverted ) { |
| |
| cmp = memcmp(aAlignment + i*2, bAlignment + i*2, sizeof(indexed_phrases_pair_t::alignment_point_t)); |
| } |
| else{ |
| |
| cmp = memcmp(aAlignment + i*2 + 1, bAlignment + i*2 + 1, sizeof(indexed_phrases_pair_t::alignment_point_t)); |
| } |
| if ( cmp != 0 ) { |
| return cmp < 0; |
| } |
| } |
| else { |
| return cmp < 0; |
| } |
| } |
|
|
| |
| return (cmp == 0) ? (aSize < bSize) : (cmp < 0); |
|
|
| } |
|
|
|
|
| int PhraseComp::comparePhrases(const indexed_phrases_pair_t::phrase_t& a, const indexed_phrases_pair_t::phrase_t& b) { |
|
|
| size_t aSize = a.size(); |
| size_t bSize = b.size(); |
| size_t min = std::min(aSize, bSize); |
| int cmp = 0; |
|
|
| for ( size_t i = 0; i < min; ++i ) { |
| cmp = strcmp(strings.get(a[i]), strings.get(b[i])); |
| if ( cmp != 0 ) { |
| return cmp; |
| } |
| } |
|
|
| if ( aSize == bSize ) { |
| return 0; |
| } |
|
|
| if ( aSize < bSize ) { |
| return strcmp("|||", strings.get(b[min])); |
| } |
| else { |
| return strcmp(strings.get(a[min]), "|||"); |
| } |
|
|
| } |
|
|
|
|
| void processSortedOutput(OutputProcessor& processor) { |
|
|
| output_vector_t output; |
|
|
| LossyCountersVector::value_type current = NULL, prev = NULL; |
|
|
| for ( size_t i = 1; i < lossyCounters.size(); ++i ) { |
| current = lossyCounters[i]; |
| if ( current != prev ) { |
| PhrasePairsLossyCounter& lossyCounter = current->lossyCounter; |
| for ( PhrasePairsLossyCounter::erasing_iterator phraseIter = lossyCounter.beginErase(); phraseIter != lossyCounter.endErase(); ++phraseIter ) { |
| |
| output.push_back(std::make_pair(phraseIter.item(), phraseIter.frequency())); |
| |
| current->outputMass += phraseIter.frequency(); |
| current->outputSize += 1; |
| } |
| |
| prev = current; |
| |
| } |
| } |
|
|
| |
| std::sort(output.begin(), output.end(), PhraseComp(false)); |
|
|
| |
| for ( output_vector_t::const_iterator iter = output.begin(); iter != output.end(); ++iter ) { |
| flushPhrasePair(processor, iter->first, iter->second, 1); |
| } |
|
|
| |
| std::sort(output.begin(), output.end(), PhraseComp(true)); |
|
|
| |
| for ( output_vector_t::const_iterator iter = output.begin(); iter != output.end(); ++iter ) { |
| flushPhrasePair(processor, iter->first, iter->second, -1); |
| } |
|
|
| } |
|
|
|
|
| void processUnsortedOutput(OutputProcessor& processor) { |
|
|
| LossyCountersVector::value_type current = NULL, prev = NULL; |
|
|
| for ( size_t i = 1; i < lossyCounters.size(); ++i ) { |
|
|
| current = lossyCounters[i]; |
|
|
| if ( current != prev ) { |
|
|
| const PhrasePairsLossyCounter& lossyCounter = current->lossyCounter; |
|
|
| for ( PhrasePairsLossyCounter::const_iterator phraseIter = lossyCounter.begin(); phraseIter != lossyCounter.end(); ++phraseIter ) { |
| |
| flushPhrasePair(processor, phraseIter.item(), phraseIter.frequency(), 0); |
| |
| current->outputMass += phraseIter.frequency(); |
| current->outputSize += 1; |
| } |
|
|
| |
| prev = current; |
| } |
| } |
|
|
| } |
|
|
|
|
| void flushPhrasePair(OutputProcessor& processor, const indexed_phrases_pair_t& indexedPhrasePair, PhrasePairsLossyCounter::frequency_t frequency, int mode = 0) { |
|
|
| const indexed_phrases_pair_t::phrase_t srcPhraseIndices = indexedPhrasePair.srcPhrase(); |
| const indexed_phrases_pair_t::phrase_t tgtPhraseIndices = indexedPhrasePair.tgtPhrase(); |
|
|
| std::string srcPhrase, tgtPhrase; |
|
|
| for ( indexed_phrases_pair_t::phrase_t::const_iterator indexIter = srcPhraseIndices.begin(); indexIter != srcPhraseIndices.end(); ++indexIter ) { |
| srcPhrase += std::string(strings.get(*indexIter)) + " "; |
| } |
| srcPhrase.resize(srcPhrase.size() - 1); |
|
|
| for ( indexed_phrases_pair_t::phrase_t::const_iterator indexIter = tgtPhraseIndices.begin(); indexIter != tgtPhraseIndices.end(); ++indexIter ) { |
| tgtPhrase += std::string(strings.get(*indexIter)) + " "; |
| } |
| tgtPhrase.resize(tgtPhrase.size() - 1); |
|
|
| |
| processor(srcPhrase, tgtPhrase, orientations.get(indexedPhrasePair.orientationInfo()), indexedPhrasePair.alignment(), frequency, mode); |
| } |
|
|
|
|
| void printStats(void) { |
|
|
| |
| size_t outputMass = 0, outputSize = 0, N = 0; |
|
|
| const std::string hline = "####################################################################################################################"; |
|
|
| std::cerr << "Lossy Counting Phrase Extraction statistics:" << std::endl; |
|
|
| |
| std::cerr |
| << hline << std::endl |
| << "# length # unique out # total out # total in (N) # out/in (%) # pos. thr. # neg. thr. # max. err. #" << std::endl |
| << hline << std::endl; |
|
|
| LossyCountersVector::value_type current = NULL, prev = NULL; |
| size_t from = 1, to = 1; |
|
|
| for ( size_t i = 1; i <= lossyCounters.size(); ++i ) { |
|
|
| current = (i < lossyCounters.size()) ? lossyCounters[i] : NULL; |
|
|
| if ( (current == NULL) || ((current != prev) && (prev != NULL)) ) { |
| |
| to = i-1; |
|
|
| |
| outputMass += prev->outputMass; |
| outputSize += prev->outputSize; |
| N += prev->lossyCounter.count(); |
|
|
| |
| if ( from == to ) { |
| std::cerr << "# " << std::setw(6) << to << " # "; |
| } |
| else { |
| std::stringstream strStr; |
| strStr << from << "-" << to; |
| std::cerr << "# " << std::setw(6) << strStr.str() << " # "; |
| } |
| |
| std::cerr |
| << std::setw(15) << prev->outputSize << " # " |
| << std::setw(15) << prev->outputMass << " # " |
| << std::setw(15) << prev->lossyCounter.count() << " # " |
| << std::setw(10) << std::setprecision(4) << (static_cast<double>(prev->outputMass) / static_cast<double>(prev->lossyCounter.count())) * 100 << " # " |
| << std::setw(10) << prev->lossyCounter.threshold(true) << " # " |
| << std::setw(10) << prev->lossyCounter.threshold() << " # " |
| << std::setw(10) << prev->lossyCounter.maxError() << " #" |
| << std::endl << hline << std::endl; |
|
|
| from = i; |
| } |
|
|
| prev = current; |
|
|
| } |
|
|
| |
| std::cerr |
| << "# TOTAL # " |
| << std::setw(15) << outputSize << " # " |
| << std::setw(15) << outputMass << " # " |
| << std::setw(15) << N << " # " |
| << std::setw(10) << std::setprecision(4) << (static_cast<double>(outputMass) / static_cast<double>(N)) * 100 << " #" |
| << std::endl |
| << "#############################################################################" << std::endl; |
|
|
| } |
|
|