File size: 1,952 Bytes
158b61b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
#pragma once

#include <boost/unordered_set.hpp>
#include <boost/unordered_map.hpp>
#include <cstdio>
#include <sstream>
#include <fstream>
#include <iostream>
#include <string>
#include <queue>
#include <sys/stat.h> //mkdir

#include "hash.h" //Includes line_splitter
#include "probing_hash_utils.h"
#include "vocabid.h"

#include "util/file_piece.hh"
#include "util/file.hh"

namespace probingpt
{
// A source phrase stored as a vector of unsigned 64-bit ids.
// NOTE(review): presumably these are vocabulary ids (see vocabid.h) — confirm.
typedef std::vector<uint64_t> SourcePhrase;


// A node in a tree whose children are indexed by 64-bit ids: each Node owns
// a hash map from uint64_t to child Node.
//
// Add() and Write() are only declared here; their definitions live elsewhere.
// NOTE(review): judging by the signatures, Add() inserts sourcePhrase into the
// tree starting at element `pos`, and Write() emits the tree into `table` —
// confirm against the definitions.
class Node
{
  typedef boost::unordered_map<uint64_t, Node> Children;
  Children m_children;

public:
  uint64_t key;  // 64-bit key associated with this node; 0 until assigned
  bool done;     // starts out false; set by code outside this header

  // Initialise every data member. Previously `key` was left indeterminate,
  // so reading it before an explicit assignment was undefined behaviour.
  Node()
    :key(0)
    ,done(false)
  {}

  void Add(Table &table, const SourcePhrase &sourcePhrase, size_t pos = 0);
  void Write(Table &table);
};


// Declaration only; the definition is not in this header.
// NOTE(review): judging by the name and parameters, this reads the phrase
// table at `phrasetable_path` and produces a probing phrase table under
// `basepath`; `num_scores` / `num_lex_scores` look like per-entry score
// counts, `log_prob` a flag for log-space scores, `max_cache_size` a cap on
// the number of cached source phrases, and `scfg` a flag selecting SCFG
// handling — confirm all of these against the definition.
void createProbingPT(const std::string &phrasetable_path,
                     const std::string &basepath, int num_scores, int num_lex_scores,
                     bool log_prob, int max_cache_size, bool scfg);
// Returns a single 64-bit key for the given sequence of 64-bit ids.
// Declaration only; how the key is derived is not visible in this header.
uint64_t getKey(const std::vector<uint64_t> &source_phrase);

// Returns a new vector of ids built from `vocabid_source` and `endPos`.
// Declaration only.
// NOTE(review): presumably the result is the leading part of `vocabid_source`
// ending at `endPos`; whether `endPos` is inclusive is not visible here —
// confirm against the definition.
std::vector<uint64_t> CreatePrefix(const std::vector<uint64_t> &vocabid_source, size_t endPos);

// Renders a vector as text for debugging: every element is streamed with
// operator<< and followed by a single space (so a non-empty result ends
// with a trailing space, and an empty vector yields an empty string).
template<typename T>
std::string Debug(const std::vector<T> &vec)
{
  std::stringstream out;
  typedef typename std::vector<T>::const_iterator Iter;
  for (Iter it = vec.begin(); it != vec.end(); ++it) {
    out << *it;
    out << " ";
  }
  return out.str();
}

// Returns a count computed from the file at `path`. Declaration only.
// NOTE(review): by its name this counts the distinct source phrases in a
// phrase-table file — confirm against the definition.
size_t countUniqueSource(const std::string &path);

// Plain record pairing a source string and its 64-bit key with a count.
class CacheItem
{
public:
  std::string source;   // source text
  uint64_t sourceKey;   // 64-bit key for `source`
  float count;          // value the ordering below is based on

  // Copies the three arguments into the corresponding members.
  CacheItem(const std::string &vSource, uint64_t vSourceKey, float vCount)
    : source(vSource), sourceKey(vSourceKey), count(vCount)
  {}

  // Deliberately reversed ordering: *this compares "less" than `other`
  // when *this has the LARGER count.
  bool operator<(const CacheItem &other) const {
    return other.count < count;
  }
};

class CacheItemOrderer
{
public:
  bool operator()(const CacheItem* a, const CacheItem* b) const {
    return (*a) < (*b);
  }
};

// Declaration only; the definition is not in this header.
// Takes the queue of CacheItem pointers (ordered by CacheItemOrderer) by
// non-const reference, so the callee is able to modify it.
// NOTE(review): by its name this writes the cached items to the file at
// `path`, with `totalSourceCount` likely used to normalise the counts; it
// is not visible here whether the queue is drained or who frees the
// CacheItem objects — confirm against the definition.
void serialize_cache(
  std::priority_queue<CacheItem*, std::vector<CacheItem*>, CacheItemOrderer> &cache,
  const std::string &path, float totalSourceCount);

}