|
#ifndef LM_FILTER_VOCAB_H |
|
#define LM_FILTER_VOCAB_H |
|
|
|
|
|
|
|
#include "../../util/multi_intersection.hh" |
|
#include "../../util/string_piece.hh" |
|
#include "../../util/string_piece_hash.hh" |
|
#include "../../util/tokenize_piece.hh" |
|
|
|
#include <boost/noncopyable.hpp> |
|
#include <boost/range/iterator_range.hpp> |
|
#include <boost/unordered/unordered_map.hpp> |
|
#include <boost/unordered/unordered_set.hpp> |
|
|
|
#include <string> |
|
#include <vector> |
|
|
|
namespace lm { |
|
namespace vocab { |
|
|
|
// Declaration only; the definition is not part of this header.
// NOTE(review): presumably reads vocabulary words from `in` and inserts each
// one into `out` -- the input format is not visible here, confirm against the
// implementation.
void ReadSingle(std::istream &in, boost::unordered_set<std::string> &out); |
|
|
|
|
|
// Declaration only; the definition is not part of this header.
// Fills `out`, a map from word to a vector of unsigned indices, and returns an
// unsigned count.
// NOTE(review): presumably each vector lists the vocabularies (e.g. input
// lines) that contain the word, in ascending order, and the return value is the
// number of vocabularies read -- confirm against the implementation.
unsigned int ReadMultiple(std::istream &in, boost::unordered_map<std::string, std::vector<unsigned int> > &out); |
|
|
|
|
|
|
|
|
|
|
|
inline bool IsTag(const StringPiece &value) { |
|
|
|
assert(!value.empty()); |
|
return (value.data()[0] == '<' && value.data()[value.size() - 1] == '>'); |
|
} |
|
|
|
class Single { |
|
public: |
|
typedef boost::unordered_set<std::string> Words; |
|
|
|
explicit Single(const Words &vocab) : vocab_(vocab) {} |
|
|
|
template <class Iterator> bool PassNGram(const Iterator &begin, const Iterator &end) { |
|
for (Iterator i = begin; i != end; ++i) { |
|
if (IsTag(*i)) continue; |
|
if (FindStringPiece(vocab_, *i) == vocab_.end()) return false; |
|
} |
|
return true; |
|
} |
|
|
|
private: |
|
const Words &vocab_; |
|
}; |
|
|
|
class Union { |
|
public: |
|
typedef boost::unordered_map<std::string, std::vector<unsigned int> > Words; |
|
|
|
explicit Union(const Words &vocabs) : vocabs_(vocabs) {} |
|
|
|
template <class Iterator> bool PassNGram(const Iterator &begin, const Iterator &end) { |
|
sets_.clear(); |
|
|
|
for (Iterator i(begin); i != end; ++i) { |
|
if (IsTag(*i)) continue; |
|
Words::const_iterator found(FindStringPiece(vocabs_, *i)); |
|
if (vocabs_.end() == found) return false; |
|
sets_.push_back(boost::iterator_range<const unsigned int*>(&*found->second.begin(), &*found->second.end())); |
|
} |
|
return (sets_.empty() || util::FirstIntersection(sets_)); |
|
} |
|
|
|
private: |
|
const Words &vocabs_; |
|
|
|
std::vector<boost::iterator_range<const unsigned int*> > sets_; |
|
}; |
|
|
|
class Multiple { |
|
public: |
|
typedef boost::unordered_map<std::string, std::vector<unsigned int> > Words; |
|
|
|
Multiple(const Words &vocabs) : vocabs_(vocabs) {} |
|
|
|
private: |
|
|
|
template <class Output> class Callback { |
|
public: |
|
Callback(Output &out, const StringPiece &line) : out_(out), line_(line) {} |
|
|
|
void operator()(unsigned int index) { |
|
out_.SingleAddNGram(index, line_); |
|
} |
|
|
|
private: |
|
Output &out_; |
|
const StringPiece &line_; |
|
}; |
|
|
|
public: |
|
template <class Iterator, class Output> void AddNGram(const Iterator &begin, const Iterator &end, const StringPiece &line, Output &output) { |
|
sets_.clear(); |
|
for (Iterator i(begin); i != end; ++i) { |
|
if (IsTag(*i)) continue; |
|
Words::const_iterator found(FindStringPiece(vocabs_, *i)); |
|
if (vocabs_.end() == found) return; |
|
sets_.push_back(boost::iterator_range<const unsigned int*>(&*found->second.begin(), &*found->second.end())); |
|
} |
|
if (sets_.empty()) { |
|
output.AddNGram(line); |
|
return; |
|
} |
|
|
|
Callback<Output> cb(output, line); |
|
util::AllIntersection(sets_, cb); |
|
} |
|
|
|
template <class Output> void AddNGram(const StringPiece &ngram, const StringPiece &line, Output &output) { |
|
AddNGram(util::TokenIter<util::SingleCharacter, true>(ngram, ' '), util::TokenIter<util::SingleCharacter, true>::end(), line, output); |
|
} |
|
|
|
void Flush() const {} |
|
|
|
private: |
|
const Words &vocabs_; |
|
|
|
std::vector<boost::iterator_range<const unsigned int*> > sets_; |
|
}; |
|
|
|
} |
|
} |
|
|
|
#endif |
|
|