| | |
| | |
| | |
| |
|
#include "phrasetable.h"
#include "statistic.h"
#include "timestamp.h"

#include <cassert>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <limits>
#include <new>
#include <sstream>
#include <string>
| |
|
| | |
| |
|
// Storage for the static members of PhraseText.
// NOTE(review): dictionary_ is presumably the word table consulted by
// index_word(), which is not defined in this file -- confirm in phrasetable.h.
| | PhraseText::DictionaryType_ PhraseText::dictionary_; |
// The id counter starts at 1 rather than 0.
| | Count PhraseText::last_id_ = 1; |
| |
|
| | PhraseText::PhraseText(const String &s) |
| | { |
| | IStringStream is(s); |
| | while(is.good()) { |
| | String w; |
| | getline(is, w, ' '); |
| | Count *id = boost::fast_pool_allocator<Count>::allocate(1); |
| | *id = index_word(w); |
| | word_list_.push_back(id); |
| | } |
| | } |
| |
|
| | std::ostream &operator<<(std::ostream &os, const PhraseText &pt) |
| | { |
| | bool print_space = false; |
| | for(PhraseText::const_string_iterator it = pt.string_begin(); it != pt.string_end(); it++) { |
| | if(print_space) |
| | os << ' '; |
| | else |
| | print_space = true; |
| |
|
| | os << *it; |
| | } |
| |
|
| | return os; |
| | } |
| |
|
| | |
| |
|
// Interning tables for alignments, filled by index_alignment():
// alignment_map_ maps a (source length, target length, alignment string)
// tuple to an index, and alignment_vector_ holds the Alignment object for
// each index.
| | PhraseAlignment::Alignment::AlignmentMapType_ PhraseAlignment::Alignment::alignment_map_; |
| | PhraseAlignment::Alignment::AlignmentVectorType_ PhraseAlignment::Alignment::alignment_vector_; |
| |
|
| | PhraseAlignment::Alignment::Alignment(Count slen, Count tlen, const String &alignment) : |
| | slen_(slen), tlen_(tlen), matrix_(slen * tlen, false) |
| | { |
| | assert(slen_ > 0 && slen_ < 10); |
| | IStringStream is(alignment); |
| | while(is.good()) { |
| | String a; |
| | getline(is, a, ' '); |
| | IStringStream ap(a); |
| | Count s, t; |
| | char dash; |
| | ap >> s >> dash >> t; |
| | assert(s < slen && t < tlen); |
| | assert(dash == '-'); |
| | matrix_[t * slen + s] = true; |
| | } |
| | } |
| |
|
| | Count PhraseAlignment::Alignment::index_alignment(Count slen, Count tlen, const String &alignment) |
| | { |
| | AlignmentTuple_ tup = boost::make_tuple(slen, tlen, alignment); |
| | AlignmentMapType_::const_iterator it = alignment_map_.find(tup); |
| |
|
| | if(it == alignment_map_.end()) { |
| | const Alignment *pa = new Alignment(slen, tlen, alignment); |
| | Count index = alignment_vector_.size(); |
| | alignment_map_.insert(std::make_pair(tup, index)); |
| | alignment_vector_.push_back(pa); |
| | return index; |
| | } else |
| | return it->second; |
| | } |
| |
|
| | std::ostream &operator<<(std::ostream &os, const PhraseAlignment::Alignment &pa) |
| | { |
| | bool print_space = false; |
| | for(Count i = 0; i < pa.matrix_.size(); i++) { |
| | if(print_space) |
| | os << ' '; |
| | else |
| | print_space = true; |
| |
|
| | os << (i / pa.slen_) << '-' << (i % pa.slen_); |
| | } |
| |
|
| | return os; |
| | } |
| |
|
| | std::ostream &operator<<(std::ostream &os, const PhraseAlignment &pa) |
| | { |
| | for(Count s = 0; s < pa.get_source_length(); s++) { |
| | os << '('; |
| | bool print_comma = false; |
| | for(Count t = 0; t < pa.get_target_length(); t++) { |
| | if(pa.is_aligned(s, t)) { |
| | if(print_comma) |
| | os << ','; |
| | else |
| | print_comma = true; |
| |
|
| | os << t; |
| | } |
| | } |
| | os << ") "; |
| | } |
| |
|
| | os << "|||"; |
| |
|
| | for(Count t = 0; t < pa.get_target_length(); t++) { |
| | os << " ("; |
| | bool print_comma = false; |
| | for(Count s = 0; s < pa.get_source_length(); s++) { |
| | if(pa.is_aligned(s, t)) { |
| | if(print_comma) |
| | os << ','; |
| | else |
| | print_comma = true; |
| |
|
| | os << s; |
| | } |
| | } |
| | os << ')'; |
| | } |
| |
|
| | return os; |
| | } |
| |
|
| | |
| |
|
| | bool PhrasePairInfo::init_phase_ = true; |
| | Count PhrasePairInfo::data_ncounts_ = COUNT_FREE_IDX; |
| | Count PhrasePairInfo::data_nscores_ = SCORE_FREE_IDX; |
| | const Count PhrasePairInfo::CONTINUATION_BIT = 1 << (std::numeric_limits<Count>::digits - 1); |
| |
|
| | PhrasePairInfo::PhrasePairInfo(Count src, Count tgt, Count alignment, Count count) : |
| | src_(src), tgt_(tgt), data_(NULL), reverse_(false) |
| | { |
| | init_phase_ = false; |
| | realloc_data(1); |
| | count_data(COUNT_COUNT_IDX) = count; |
| | Count *aligd = alignment_data(0); |
| | aligd[0] = alignment; |
| | aligd[1] = count; |
| | } |
| |
|
| | DataIndex PhrasePairInfo::register_score_data(Count size) |
| | { |
| | assert(init_phase_); |
| |
|
| | Count start = data_nscores_; |
| | data_nscores_ += size; |
| | return start; |
| | } |
| |
|
| | DataIndex PhrasePairInfo::register_count_data(Count size) |
| | { |
| | assert(init_phase_); |
| |
|
| | Count start = data_ncounts_; |
| | data_ncounts_ += size; |
| | return start; |
| | } |
| |
|
| | PhrasePairInfo::AlignmentVector PhrasePairInfo::get_alignments() const |
| | { |
| | PhrasePairInfo::AlignmentVector vec; |
| |
|
| | Count i = 0; |
| | bool last; |
| | do { |
| | const Count *aligd = alignment_data(i++); |
| | last = !(aligd[0] & CONTINUATION_BIT); |
| | Count alig = aligd[0] & ~CONTINUATION_BIT; |
| | vec.push_back(std::make_pair(PhraseAlignment(alig, reverse_), aligd[1])); |
| | } while(!last); |
| |
|
| | return vec; |
| | } |
| |
|
| | void PhrasePairInfo::add_alignment(Count new_alignment) |
| | { |
| | Count i = 0; |
| | bool last; |
| | do { |
| | Count *aligd = alignment_data(i++); |
| | last = !(aligd[0] & CONTINUATION_BIT); |
| | Count alig = aligd[0] & ~CONTINUATION_BIT; |
| | if(alig == new_alignment) { |
| | aligd[1]++; |
| | return; |
| | } |
| | } while(!last); |
| |
|
| | realloc_data(i + 1); |
| |
|
| | Count *last_aligd = alignment_data(i - 1); |
| | last_aligd[0] |= CONTINUATION_BIT; |
| |
|
| | Count *this_aligd = alignment_data(i); |
| | this_aligd[0] = new_alignment; |
| | this_aligd[1] = 1; |
| | } |
| |
|
| | void PhrasePairInfo::realloc_data(Count nalignments) |
| | { |
| | static boost::pool<> *pool[3] = { NULL, NULL, NULL }; |
| |
|
| | size_t fixed_size = data_nscores_ * sizeof(Score) + data_ncounts_ * sizeof(Count); |
| | size_t new_data_size = fixed_size + COUNTS_PER_ALIGNMENT * nalignments * sizeof(Count); |
| |
|
| | PhrasePairData new_data; |
| | if(nalignments <= 3) { |
| | if(!pool[nalignments - 1]) |
| | pool[nalignments - 1] = new boost::pool<>(new_data_size); |
| |
|
| | new_data = reinterpret_cast<PhrasePairData>(pool[nalignments - 1]->malloc()); |
| | } else |
| | new_data = new char[new_data_size]; |
| |
|
| | if(data_) { |
| | memcpy(new_data, data_, fixed_size); |
| | Count i = 0; |
| | Count *old_aligd, *new_aligd; |
| | do { |
| | assert(i < nalignments); |
| | old_aligd = alignment_data(data_, i); |
| | new_aligd = alignment_data(new_data, i); |
| | new_aligd[0] = old_aligd[0]; |
| | new_aligd[1] = old_aligd[1]; |
| | i++; |
| | } while(old_aligd[0] & CONTINUATION_BIT); |
| | if(nalignments <= 4) |
| | pool[nalignments - 2]->free(data_); |
| | else |
| | delete[] data_; |
| | } |
| |
|
| | data_ = new_data; |
| | } |
| |
|
| | |
| |
|
| | Phrase PhraseInfoList::index_phrase(const String &s_phr) |
| | { |
| | IDMapType_::const_iterator it = idmap_.find(s_phr); |
| | if(it != idmap_.end()) |
| | return it->second; |
| |
|
| | PhraseInfo *pi = phrase_info_pool_.construct(data_size_, s_phr); |
| |
|
| | list_.push_back(pi); |
| | idmap_[s_phr] = list_.size() - 1; |
| | return idmap_[s_phr]; |
| | } |
| |
|
| | DataIndex PhraseInfoList::register_data(Count size) |
| | { |
| | DataIndex start = data_size_; |
| | data_size_ += size; |
| | return start; |
| | } |
| |
|
| | void PhraseInfoList::attach_statistic(PhraseStatistic &s) |
| | { |
| | statistics_.push_back(&s); |
| | s.attach(*this); |
| | } |
| |
|
| | void PhraseInfoList::compute_statistics() |
| | { |
| | while(!statistics_.empty()) { |
| | statistics_.front()->compute_statistic(); |
| | statistics_.pop_front(); |
| | } |
| | } |
| |
|
| | |
| |
|
| | void MemoryPhraseTable::load_data(std::istream &instream) |
| | { |
| | Count total_count = 0; |
| |
|
| | Timestamp t_load; |
| | Count nlines = 1; |
| | String line; |
| | while(getline(instream, line)) { |
| | size_t sep1 = line.find(" ||| "); |
| | if(sep1 == line.npos) { |
| | std::cerr << "Phrase separator not found in: " << line << std::endl; |
| | abort(); |
| | } |
| | size_t sep2 = line.find(" ||| ", sep1 + 1); |
| | String s_src(line, 0, sep1); |
| | String s_tgt(line, sep1 + 5, sep2 - sep1 - 5); |
| | String s_alignment(line, sep2 + 5); |
| |
|
| | Phrase src = src_info_.index_phrase(s_src); |
| | Phrase tgt = tgt_info_.index_phrase(s_tgt); |
| | Count alignment = PhraseAlignment::index_alignment(src_info_[src].get_phrase().size(), tgt_info_[tgt].get_phrase().size(), s_alignment); |
| |
|
| | src_info_[src].inc_count(); |
| | tgt_info_[tgt].inc_count(); |
| | total_count++; |
| |
|
| | PhrasePair stpair(src, tgt); |
| | PhrasePairCounts::iterator it = joint_counts_.find(stpair); |
| |
|
| | if(it == joint_counts_.end()) { |
| | src_info_[src].inc_distinct(); |
| | tgt_info_[tgt].inc_distinct(); |
| | joint_counts_.insert(std::make_pair(stpair, PhrasePairInfo(src, tgt, alignment, 1).get_phrase_pair_data())); |
| | } else { |
| | PhrasePairInfo pi(src, tgt, it->second); |
| | pi.inc_count(); |
| | pi.add_alignment(alignment); |
| | it->second = pi.get_phrase_pair_data(); |
| | } |
| | if(nlines % 50000 == 0) |
| | std:: cerr << "Read " << nlines << " lines in " << (t_load.elapsed_time() / 1000) << " ms." << std::endl; |
| | nlines++; |
| | } |
| | } |
| |
|
| | void MemoryPhraseTable::attach_src_statistic(PhraseStatistic &s) |
| | { |
| | src_info_.attach_statistic(s); |
| | } |
| |
|
| | void MemoryPhraseTable::attach_tgt_statistic(PhraseStatistic &s) |
| | { |
| | tgt_info_.attach_statistic(s); |
| | } |
| |
|
| | void MemoryPhraseTable::compute_phrase_statistics() |
| | { |
| | src_info_.compute_statistics(); |
| | tgt_info_.compute_statistics(); |
| | } |
| |
|