|
#include "../../util/file_stream.hh" |
|
#include "../../util/file_piece.hh" |
|
#include "../../util/murmur_hash.hh" |
|
#include "../../util/pool.hh" |
|
#include "../../util/string_piece.hh" |
|
#include "../../util/string_piece_hash.hh" |
|
#include "../../util/tokenize_piece.hh" |
|
|
|
#include <boost/unordered_map.hpp> |
|
#include <boost/unordered_set.hpp> |
|
|
|
#include <cstddef> |
|
#include <vector> |
|
|
|
namespace { |
|
|
|
struct MutablePiece { |
|
mutable StringPiece behind; |
|
bool operator==(const MutablePiece &other) const { |
|
return behind == other.behind; |
|
} |
|
}; |
|
|
|
std::size_t hash_value(const MutablePiece &m) { |
|
return hash_value(m.behind); |
|
} |
|
|
|
class InternString { |
|
public: |
|
const char *Add(StringPiece str) { |
|
MutablePiece mut; |
|
mut.behind = str; |
|
std::pair<boost::unordered_set<MutablePiece>::iterator, bool> res(strs_.insert(mut)); |
|
if (res.second) { |
|
void *mem = backing_.Allocate(str.size() + 1); |
|
memcpy(mem, str.data(), str.size()); |
|
static_cast<char*>(mem)[str.size()] = 0; |
|
res.first->behind = StringPiece(static_cast<char*>(mem), str.size()); |
|
} |
|
return res.first->behind.data(); |
|
} |
|
|
|
private: |
|
util::Pool backing_; |
|
boost::unordered_set<MutablePiece> strs_; |
|
}; |
|
|
|
class TargetWords { |
|
public: |
|
void Introduce(StringPiece source) { |
|
vocab_.resize(vocab_.size() + 1); |
|
std::vector<unsigned int> temp(1, vocab_.size() - 1); |
|
Add(temp, source); |
|
} |
|
|
|
void Add(const std::vector<unsigned int> &sentences, StringPiece target) { |
|
if (sentences.empty()) return; |
|
interns_.clear(); |
|
for (util::TokenIter<util::SingleCharacter, true> i(target, ' '); i; ++i) { |
|
interns_.push_back(intern_.Add(*i)); |
|
} |
|
for (std::vector<unsigned int>::const_iterator i(sentences.begin()); i != sentences.end(); ++i) { |
|
boost::unordered_set<const char *> &vocab = vocab_[*i]; |
|
for (std::vector<const char *>::const_iterator j = interns_.begin(); j != interns_.end(); ++j) { |
|
vocab.insert(*j); |
|
} |
|
} |
|
} |
|
|
|
void Print() const { |
|
util::FileStream out(1); |
|
for (std::vector<boost::unordered_set<const char *> >::const_iterator i = vocab_.begin(); i != vocab_.end(); ++i) { |
|
for (boost::unordered_set<const char *>::const_iterator j = i->begin(); j != i->end(); ++j) { |
|
out << *j << ' '; |
|
} |
|
out << '\n'; |
|
} |
|
} |
|
|
|
private: |
|
InternString intern_; |
|
|
|
std::vector<boost::unordered_set<const char *> > vocab_; |
|
|
|
|
|
std::vector<const char *> interns_; |
|
}; |
|
|
|
class Input { |
|
public: |
|
explicit Input(std::size_t max_length) |
|
: max_length_(max_length), sentence_id_(0), empty_() {} |
|
|
|
void AddSentence(StringPiece sentence, TargetWords &targets) { |
|
canonical_.clear(); |
|
starts_.clear(); |
|
starts_.push_back(0); |
|
for (util::TokenIter<util::AnyCharacter, true> i(sentence, StringPiece("\0 \t", 3)); i; ++i) { |
|
canonical_.append(i->data(), i->size()); |
|
canonical_ += ' '; |
|
starts_.push_back(canonical_.size()); |
|
} |
|
targets.Introduce(canonical_); |
|
for (std::size_t i = 0; i < starts_.size() - 1; ++i) { |
|
std::size_t subtract = starts_[i]; |
|
const char *start = &canonical_[subtract]; |
|
for (std::size_t j = i + 1; j < std::min(starts_.size(), i + max_length_ + 1); ++j) { |
|
map_[util::MurmurHash64A(start, &canonical_[starts_[j]] - start - 1)].push_back(sentence_id_); |
|
} |
|
} |
|
++sentence_id_; |
|
} |
|
|
|
|
|
const std::vector<unsigned int> &Matches(StringPiece phrase) const { |
|
Map::const_iterator i = map_.find(util::MurmurHash64A(phrase.data(), phrase.size())); |
|
return i == map_.end() ? empty_ : i->second; |
|
} |
|
|
|
private: |
|
const std::size_t max_length_; |
|
|
|
|
|
typedef boost::unordered_map<uint64_t, std::vector<unsigned int> > Map; |
|
Map map_; |
|
|
|
std::size_t sentence_id_; |
|
|
|
|
|
std::string canonical_; |
|
std::vector<std::size_t> starts_; |
|
|
|
const std::vector<unsigned int> empty_; |
|
}; |
|
|
|
} |
|
|
|
int main(int argc, char *argv[]) { |
|
if (argc != 2) { |
|
std::cerr << "Expected source text on the command line" << std::endl; |
|
return 1; |
|
} |
|
Input input(7); |
|
TargetWords targets; |
|
try { |
|
util::FilePiece inputs(argv[1], &std::cerr); |
|
while (true) |
|
input.AddSentence(inputs.ReadLine(), targets); |
|
} catch (const util::EndOfFileException &e) {} |
|
|
|
util::FilePiece table(0, NULL, &std::cerr); |
|
StringPiece line; |
|
const StringPiece pipes("|||"); |
|
while (true) { |
|
try { |
|
line = table.ReadLine(); |
|
} catch (const util::EndOfFileException &e) { break; } |
|
util::TokenIter<util::MultiCharacter> it(line, pipes); |
|
StringPiece source(*it); |
|
if (!source.empty() && source[source.size() - 1] == ' ') |
|
source.remove_suffix(1); |
|
targets.Add(input.Matches(source), *++it); |
|
} |
|
targets.Print(); |
|
} |
|
|