#pragma once #include #include #include #include #include #include #include #include #include //mkdir #include "hash.h" //Includes line_splitter #include "probing_hash_utils.h" #include "vocabid.h" #include "util/file_piece.hh" #include "util/file.hh" namespace probingpt { typedef std::vector SourcePhrase; class Node { typedef boost::unordered_map Children; Children m_children; public: uint64_t key; bool done; Node() :done(false) {} void Add(Table &table, const SourcePhrase &sourcePhrase, size_t pos = 0); void Write(Table &table); }; void createProbingPT(const std::string &phrasetable_path, const std::string &basepath, int num_scores, int num_lex_scores, bool log_prob, int max_cache_size, bool scfg); uint64_t getKey(const std::vector &source_phrase); std::vector CreatePrefix(const std::vector &vocabid_source, size_t endPos); template std::string Debug(const std::vector &vec) { std::stringstream strm; for (size_t i = 0; i < vec.size(); ++i) { strm << vec[i] << " "; } return strm.str(); } size_t countUniqueSource(const std::string &path); class CacheItem { public: std::string source; uint64_t sourceKey; float count; CacheItem(const std::string &vSource, uint64_t vSourceKey, float vCount) :source(vSource) ,sourceKey(vSourceKey) ,count(vCount) { } bool operator<(const CacheItem &other) const { return count > other.count; } }; class CacheItemOrderer { public: bool operator()(const CacheItem* a, const CacheItem* b) const { return (*a) < (*b); } }; void serialize_cache( std::priority_queue, CacheItemOrderer> &cache, const std::string &path, float totalSourceCount); }