| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | #include <algorithm> |
| | #include <iostream> |
| | #include <string> |
| | #include <vector> |
| | #include <iterator> |
| | #include <cassert> |
| | #include "moses/InputFileStream.h" |
| | #include "moses/Timer.h" |
| | #include "moses/Util.h" |
| | #include "OnDiskWrapper.h" |
| | #include "SourcePhrase.h" |
| | #include "TargetPhrase.h" |
| | #include "TargetPhraseCollection.h" |
| | #include "Word.h" |
| | #include "Vocab.h" |
| | #include "Main.h" |
| |
|
| | using namespace std; |
| | using namespace OnDiskPt; |
| |
|
| | int main (int argc, char * const argv[]) |
| | { |
| | |
| | Moses::ResetUserTime(); |
| | Moses::PrintUserTime("Starting"); |
| |
|
| | if (argc != 8) { |
| | std::cerr << "Usage: " << argv[0] << " numSourceFactors numTargetFactors numScores tableLimit sortScoreIndex inputPath outputPath" << std::endl; |
| | return 1; |
| | } |
| |
|
| | int numSourceFactors = Moses::Scan<int>(argv[1]) |
| | , numTargetFactors = Moses::Scan<int>(argv[2]) |
| | , numScores = Moses::Scan<int>(argv[3]) |
| | , tableLimit = Moses::Scan<int>(argv[4]); |
| | TargetPhraseCollection::s_sortScoreInd = Moses::Scan<int>(argv[5]); |
| | assert(TargetPhraseCollection::s_sortScoreInd < numScores); |
| |
|
| | const string filePath = argv[6] |
| | ,destPath = argv[7]; |
| |
|
| | Moses::InputFileStream inStream(filePath); |
| |
|
| | OnDiskWrapper onDiskWrapper; |
| | onDiskWrapper.BeginSave(destPath, numSourceFactors, numTargetFactors, numScores); |
| |
|
| | PhraseNode &rootNode = onDiskWrapper.GetRootSourceNode(); |
| | size_t lineNum = 0; |
| | string line; |
| |
|
| | while(getline(inStream, line)) { |
| | lineNum++; |
| | if (lineNum%1000 == 0) cerr << "." << flush; |
| | if (lineNum%10000 == 0) cerr << ":" << flush; |
| | if (lineNum%100000 == 0) cerr << lineNum << flush; |
| | |
| |
|
| | std::vector<float> misc(1); |
| | SourcePhrase sourcePhrase; |
| | TargetPhrase *targetPhrase = new TargetPhrase(numScores); |
| | OnDiskPt::PhrasePtr spShort = Tokenize(sourcePhrase, *targetPhrase, line, onDiskWrapper, numScores, misc); |
| | assert(misc.size() == onDiskWrapper.GetNumCounts()); |
| |
|
| | rootNode.AddTargetPhrase(sourcePhrase, targetPhrase, onDiskWrapper, tableLimit, misc, spShort); |
| | } |
| |
|
| | rootNode.Save(onDiskWrapper, 0, tableLimit); |
| | onDiskWrapper.EndSave(); |
| |
|
| | Moses::PrintUserTime("Finished"); |
| |
|
| | |
| | return 0; |
| |
|
| | } |
| |
|
| | bool Flush(const OnDiskPt::SourcePhrase *prevSourcePhrase, const OnDiskPt::SourcePhrase *currSourcePhrase) |
| | { |
| | if (prevSourcePhrase == NULL) |
| | return false; |
| |
|
| | assert(currSourcePhrase); |
| | bool ret = (*currSourcePhrase > *prevSourcePhrase); |
| | |
| |
|
| | return ret; |
| | } |
| |
|
| | OnDiskPt::PhrasePtr Tokenize(SourcePhrase &sourcePhrase, TargetPhrase &targetPhrase, const std::string &lineStr, OnDiskWrapper &onDiskWrapper, int numScores, vector<float> &misc) |
| | { |
| | char line[lineStr.size() + 1]; |
| | strcpy(line, lineStr.c_str()); |
| |
|
| | stringstream sparseFeatures, property; |
| |
|
| | size_t scoreInd = 0; |
| |
|
| | |
| | size_t stage = 0; |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | char *tok = strtok (line," "); |
| | OnDiskPt::PhrasePtr out(new Phrase()); |
| | while (tok != NULL) { |
| | if (0 == strcmp(tok, "|||")) { |
| | ++stage; |
| | } else { |
| | switch (stage) { |
| | case 0: { |
| | WordPtr w = Tokenize(sourcePhrase, tok, true, true, onDiskWrapper, 1); |
| | if (w != NULL) |
| | out->AddWord(w); |
| |
|
| | break; |
| | } |
| | case 1: { |
| | Tokenize(targetPhrase, tok, false, true, onDiskWrapper, 0); |
| | break; |
| | } |
| | case 2: { |
| | float score = Moses::Scan<float>(tok); |
| | targetPhrase.SetScore(score, scoreInd); |
| | ++scoreInd; |
| | break; |
| | } |
| | case 3: { |
| | |
| | targetPhrase.CreateAlignFromString(tok); |
| | break; |
| | } |
| | case 4: { |
| | |
| | float val = Moses::Scan<float>(tok); |
| | misc[0] = val; |
| | break; |
| | } |
| | case 5: { |
| | |
| | sparseFeatures << tok << " "; |
| | break; |
| | } |
| | case 6: { |
| | property << tok << " "; |
| | break; |
| | } |
| | default: |
| | cerr << "ERROR in line " << line << endl; |
| | assert(false); |
| | break; |
| | } |
| | } |
| |
|
| | tok = strtok (NULL, " "); |
| | } |
| |
|
| | assert(scoreInd == numScores); |
| | targetPhrase.SetSparseFeatures(Moses::Trim(sparseFeatures.str())); |
| | targetPhrase.SetProperty(Moses::Trim(property.str())); |
| | targetPhrase.SortAlign(); |
| | return out; |
| | } |
| |
|
| | OnDiskPt::WordPtr Tokenize(OnDiskPt::Phrase &phrase |
| | , const std::string &token, bool addSourceNonTerm, bool addTargetNonTerm |
| | , OnDiskPt::OnDiskWrapper &onDiskWrapper, int retSourceTarget) |
| | { |
| | |
| |
|
| | bool nonTerm = false; |
| | size_t tokSize = token.size(); |
| | int comStr =token.compare(0, 1, "["); |
| |
|
| | if (comStr == 0) { |
| | comStr = token.compare(tokSize - 1, 1, "]"); |
| | nonTerm = comStr == 0; |
| | } |
| |
|
| | OnDiskPt::WordPtr out; |
| | if (nonTerm) { |
| | |
| | size_t splitPos = token.find_first_of("[", 2); |
| | string wordStr = token.substr(0, splitPos); |
| |
|
| | if (splitPos == string::npos) { |
| | |
| | WordPtr word(new Word()); |
| | word->CreateFromString(wordStr, onDiskWrapper.GetVocab()); |
| | phrase.AddWord(word); |
| | } else { |
| | |
| | if (addSourceNonTerm) { |
| | WordPtr word(new Word()); |
| | word->CreateFromString(wordStr, onDiskWrapper.GetVocab()); |
| | phrase.AddWord(word); |
| |
|
| | if (retSourceTarget == 1) { |
| | out = word; |
| | } |
| | } |
| |
|
| | wordStr = token.substr(splitPos, tokSize - splitPos); |
| | if (addTargetNonTerm) { |
| | WordPtr word(new Word()); |
| | word->CreateFromString(wordStr, onDiskWrapper.GetVocab()); |
| | phrase.AddWord(word); |
| |
|
| | if (retSourceTarget == 2) { |
| | out = word; |
| | } |
| | } |
| |
|
| | } |
| | } else { |
| | |
| | WordPtr word(new Word()); |
| | word->CreateFromString(token, onDiskWrapper.GetVocab()); |
| | phrase.AddWord(word); |
| | out = word; |
| | } |
| |
|
| | return out; |
| | } |
| |
|
| | void InsertTargetNonTerminals(std::vector<std::string> &sourceToks, const std::vector<std::string> &targetToks, const ::AlignType &alignments) |
| | { |
| | for (int ind = alignments.size() - 1; ind >= 0; --ind) { |
| | const ::AlignPair &alignPair = alignments[ind]; |
| | size_t sourcePos = alignPair.first |
| | ,targetPos = alignPair.second; |
| |
|
| | const string &target = targetToks[targetPos]; |
| | sourceToks.insert(sourceToks.begin() + sourcePos + 1, target); |
| |
|
| | } |
| | } |
| |
|
| | class AlignOrderer |
| | { |
| | public: |
| | bool operator()(const ::AlignPair &a, const ::AlignPair &b) const { |
| | return a.first < b.first; |
| | } |
| | }; |
| |
|
| | void SortAlign(::AlignType &alignments) |
| | { |
| | std::sort(alignments.begin(), alignments.end(), AlignOrderer()); |
| | } |
| |
|