|
#ifndef LM_FILTER_PHRASE_H |
|
#define LM_FILTER_PHRASE_H |
|
|
|
#include "../../util/murmur_hash.hh" |
|
#include "../../util/string_piece.hh" |
|
#include "../../util/tokenize_piece.hh" |
|
|
|
#include <boost/unordered_map.hpp> |
|
|
|
#include <iosfwd> |
|
#include <vector> |
|
|
|
// Generates a const lookup method named Find<caps> for a class that has a
// `Table` typedef, a `Hash` key type and a `table_` member.
//   caps  - suffix of the generated method name (Find##caps).
//   lower - name of the member of the mapped value to expose.
// The generated method returns false and leaves `out` untouched when `key` is
// not in table_; otherwise it points `out` at the entry's `lower` member and
// returns true.
#define LM_FILTER_PHRASE_METHOD(caps, lower) \
  bool Find##caps(Hash key, const std::vector<unsigned int> *&out) const { \
    const Table::const_iterator entry = table_.find(key); \
    if (entry != table_.end()) { \
      out = &entry->second.lower; \
      return true; \
    } \
    return false; \
  }
|
|
|
namespace lm { |
|
namespace phrase { |
|
|
|
typedef uint64_t Hash; |
|
|
|
class Substrings { |
|
private: |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
struct SentenceRelation { |
|
std::vector<unsigned int> substring, left, right, phrase; |
|
}; |
|
|
|
|
|
|
|
|
|
|
|
|
|
typedef boost::unordered_map<Hash, SentenceRelation> Table; |
|
|
|
public: |
|
Substrings() {} |
|
|
|
|
|
|
|
|
|
|
|
|
|
LM_FILTER_PHRASE_METHOD(Substring, substring) |
|
LM_FILTER_PHRASE_METHOD(Left, left) |
|
LM_FILTER_PHRASE_METHOD(Right, right) |
|
LM_FILTER_PHRASE_METHOD(Phrase, phrase) |
|
|
|
#pragma GCC diagnostic ignored "-Wuninitialized" |
|
|
|
template <class Iterator> void AddPhrase(unsigned int sentence_id, const Iterator &begin, const Iterator &end) { |
|
|
|
for (Iterator start = begin; start != end; ++start) { |
|
Hash hash = 0; |
|
SentenceRelation *relation; |
|
for (Iterator finish = start; finish != end; ++finish) { |
|
hash = util::MurmurHashNative(&hash, sizeof(uint64_t), *finish); |
|
|
|
relation = &table_[hash]; |
|
AppendSentence(relation->substring, sentence_id); |
|
if (start == begin) AppendSentence(relation->left, sentence_id); |
|
} |
|
AppendSentence(relation->right, sentence_id); |
|
if (start == begin) AppendSentence(relation->phrase, sentence_id); |
|
} |
|
} |
|
|
|
private: |
|
void AppendSentence(std::vector<unsigned int> &vec, unsigned int sentence_id) { |
|
if (vec.empty() || vec.back() != sentence_id) vec.push_back(sentence_id); |
|
} |
|
|
|
Table table_; |
|
}; |
|
|
|
|
|
|
|
unsigned int ReadMultiple(std::istream &in, Substrings &out); |
|
|
|
namespace detail { |
|
extern const StringPiece kEndSentence; |
|
|
|
template <class Iterator> void MakeHashes(Iterator i, const Iterator &end, std::vector<Hash> &hashes) { |
|
hashes.clear(); |
|
if (i == end) return; |
|
|
|
if ((i->data()[0] == '<') && (i->data()[i->size() - 1] == '>')) { |
|
++i; |
|
} |
|
for (; i != end && (*i != kEndSentence); ++i) { |
|
hashes.push_back(util::MurmurHashNative(i->data(), i->size())); |
|
} |
|
} |
|
|
|
class Vertex; |
|
class Arc; |
|
|
|
class ConditionCommon { |
|
protected: |
|
ConditionCommon(const Substrings &substrings); |
|
ConditionCommon(const ConditionCommon &from); |
|
|
|
~ConditionCommon(); |
|
|
|
detail::Vertex &MakeGraph(); |
|
|
|
|
|
std::vector<Hash> hashes_; |
|
|
|
private: |
|
std::vector<detail::Vertex> vertices_; |
|
std::vector<detail::Arc> arcs_; |
|
|
|
const Substrings &substrings_; |
|
}; |
|
|
|
} |
|
|
|
class Union : public detail::ConditionCommon { |
|
public: |
|
explicit Union(const Substrings &substrings) : detail::ConditionCommon(substrings) {} |
|
|
|
template <class Iterator> bool PassNGram(const Iterator &begin, const Iterator &end) { |
|
detail::MakeHashes(begin, end, hashes_); |
|
return hashes_.empty() || Evaluate(); |
|
} |
|
|
|
private: |
|
bool Evaluate(); |
|
}; |
|
|
|
class Multiple : public detail::ConditionCommon { |
|
public: |
|
explicit Multiple(const Substrings &substrings) : detail::ConditionCommon(substrings) {} |
|
|
|
template <class Iterator, class Output> void AddNGram(const Iterator &begin, const Iterator &end, const StringPiece &line, Output &output) { |
|
detail::MakeHashes(begin, end, hashes_); |
|
if (hashes_.empty()) { |
|
output.AddNGram(line); |
|
} else { |
|
Evaluate(line, output); |
|
} |
|
} |
|
|
|
template <class Output> void AddNGram(const StringPiece &ngram, const StringPiece &line, Output &output) { |
|
AddNGram(util::TokenIter<util::SingleCharacter, true>(ngram, ' '), util::TokenIter<util::SingleCharacter, true>::end(), line, output); |
|
} |
|
|
|
void Flush() const {} |
|
|
|
private: |
|
template <class Output> void Evaluate(const StringPiece &line, Output &output); |
|
}; |
|
|
|
} |
|
} |
|
#endif |
|
|