|
#pragma once |
|
|
|
#include <boost/unordered_set.hpp> |
|
#include <boost/unordered_map.hpp> |
|
#include <cstdio> |
|
#include <sstream> |
|
#include <fstream> |
|
#include <iostream> |
|
#include <string> |
|
#include <queue> |
|
#include <sys/stat.h> |
|
|
|
#include "hash.h" |
|
#include "probing_hash_utils.h" |
|
#include "vocabid.h" |
|
|
|
#include "util/file_piece.hh" |
|
#include "util/file.hh" |
|
|
|
namespace probingpt |
|
{ |
|
// A source phrase stored as a sequence of 64-bit unsigned ids
// (presumably vocabulary ids, cf. vocabid.h -- confirm against callers).
typedef std::vector<uint64_t> SourcePhrase;
|
|
|
|
|
class Node |
|
{ |
|
typedef boost::unordered_map<uint64_t, Node> Children; |
|
Children m_children; |
|
|
|
public: |
|
uint64_t key; |
|
bool done; |
|
|
|
Node() |
|
:done(false) |
|
{} |
|
|
|
void Add(Table &table, const SourcePhrase &sourcePhrase, size_t pos = 0); |
|
void Write(Table &table); |
|
}; |
|
|
|
|
|
// Entry point for building a probing phrase table. Only declared here.
// NOTE(review): the parameter descriptions below are inferred from the names;
// confirm against the definition.
//   phrasetable_path  path of the input phrase table
//   basepath          location where the output is written
//   num_scores        number of scores per entry
//   num_lex_scores    number of lexicalised scores per entry
//   log_prob          whether scores are stored as log-probabilities
//   max_cache_size    upper bound on the number of cached items
//   scfg              whether the table is a synchronous-CFG table
void createProbingPT(const std::string &phrasetable_path,
                     const std::string &basepath,
                     int num_scores,
                     int num_lex_scores,
                     bool log_prob,
                     int max_cache_size,
                     bool scfg);

// Returns a single 64-bit key for a whole source phrase (definition elsewhere).
uint64_t getKey(const std::vector<uint64_t> &source_phrase);

// Returns a new id vector built from vocabid_source and endPos.
// NOTE(review): presumably the leading elements up to endPos; whether endPos
// is inclusive cannot be told from this header -- confirm.
std::vector<uint64_t> CreatePrefix(const std::vector<uint64_t> &vocabid_source,
                                   size_t endPos);
|
|
|
/**
 * Renders a vector as a single line of text for diagnostics.
 *
 * Every element is streamed with operator<< and followed by exactly one
 * space, so a non-empty result ends with a trailing space and an empty
 * vector yields an empty string.
 */
template<typename T>
std::string Debug(const std::vector<T> &vec)
{
  std::ostringstream out;
  for (typename std::vector<T>::const_iterator it = vec.begin(); it != vec.end(); ++it) {
    out << *it << " ";
  }
  return out.str();
}
|
|
|
// Returns a count computed from the file at path (definition elsewhere).
// NOTE(review): by its name, presumably the number of distinct source
// phrases in that file -- confirm against the definition.
size_t countUniqueSource(const std::string &path);
|
|
|
/**
 * A cache candidate: a source string, its 64-bit key and a count.
 *
 * All three fields are public and set once by the constructor.
 */
class CacheItem
{
public:
  std::string source;   // source text of the entry
  uint64_t sourceKey;   // 64-bit key paired with source
  float count;          // value the ordering below is based on

  CacheItem(const std::string &vSource, uint64_t vSourceKey, float vCount)
    : source(vSource)
    , sourceKey(vSourceKey)
    , count(vCount)
  {}

  // Deliberately inverted: an item compares "less" when its count is
  // GREATER than the other's. Items with equal counts are equivalent.
  bool operator<(const CacheItem &other) const {
    return other.count < count;
  }
};
|
|
|
class CacheItemOrderer |
|
{ |
|
public: |
|
bool operator()(const CacheItem* a, const CacheItem* b) const { |
|
return (*a) < (*b); |
|
} |
|
}; |
|
|
|
void serialize_cache( |
|
std::priority_queue<CacheItem*, std::vector<CacheItem*>, CacheItemOrderer> &cache, |
|
const std::string &path, float totalSourceCount); |
|
|
|
} |
|
|
|
|