File size: 3,009 Bytes
158b61b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
/*
 * HypPackCollection.h
 * kbmira - k-best Batch MIRA
 *
 * Abstracts away the mess of iterating through multiple
 * collections of k-best lists, as well as deduping
 */

#ifndef MERT_HYP_PACK_COLLECTION_H
#define MERT_HYP_PACK_COLLECTION_H

#include <string>
#include <vector>
#include <utility>
#include <cstddef>

#include "FeatureDataIterator.h"
#include "ScoreDataIterator.h"
#include "MiraFeatureVector.h"

namespace MosesTuning
{


// Abstract interface implemented by the enumerators declared below

// Interface for stepping through a sequence of hypothesis packs and
// reading the feature vectors and scores of the pack currently selected.
// Every member function other than the destructor is pure virtual, so the
// class cannot be instantiated directly.
// NOTE(review): the per-method descriptions below are inferred from names
// and signatures only; the implementations are not in this header --
// confirm against the corresponding .cpp.
class HypPackEnumerator
{
public:
  // Virtual so that a derived enumerator can be destroyed through a
  // HypPackEnumerator pointer.
  virtual ~HypPackEnumerator() {}

  // Iteration control. Presumably reset() restarts the enumeration,
  // finished() reports whether it is exhausted and next() advances to the
  // following pack -- TODO confirm.
  // None of these is const, so implementations are allowed to mutate state.
  virtual void reset() = 0;
  virtual bool finished() = 0;
  virtual void next() = 0;

  // Presumably the identifier of the current pack -- TODO confirm.
  virtual std::size_t cur_id() = 0;
  // Presumably the number of hypotheses in the current pack, i.e. the
  // exclusive upper bound for the index i taken by featuresAt() and
  // scoresAt() -- TODO confirm.
  virtual std::size_t cur_size() = 0;
  // Presumably the number of dense features -- TODO confirm. This is the
  // only const member function of the interface.
  virtual std::size_t num_dense() const = 0;
  // Accessors for entry i. Both return const references, so the referenced
  // objects are owned elsewhere; how long they remain valid (e.g. across
  // next() or reset()) is not visible here -- verify before caching them.
  virtual const MiraFeatureVector& featuresAt(std::size_t i) = 0;
  virtual const ScoreDataItem& scoresAt(std::size_t i) = 0;
};

// Instantiation that streams from disk
// Low-memory, low-speed, sequential access
// HypPackEnumerator implementation built from two lists of file names
// (feature files and score files). All member functions are defined out of
// line; nothing about their behaviour is visible in this header.
class StreamingHypPackEnumerator : public HypPackEnumerator
{
public:
  // Takes the feature-file and score-file names by const reference.
  // Presumably the two lists are parallel (entry k of one pairs with entry
  // k of the other) and are copied into m_featureFiles / m_scoreFiles --
  // TODO confirm in the constructor definition.
  StreamingHypPackEnumerator(std::vector<std::string> const& featureFiles,
                             std::vector<std::string> const& scoreFiles);

  // Overrides HypPackEnumerator::num_dense().
  virtual std::size_t num_dense() const;

  // Overrides of the iteration-control functions of HypPackEnumerator.
  virtual void reset();
  virtual bool finished();
  virtual void next();

  // Overrides of the current-pack accessors of HypPackEnumerator.
  virtual std::size_t cur_id();
  virtual std::size_t cur_size();
  virtual const MiraFeatureVector& featuresAt(std::size_t i);
  virtual const ScoreDataItem& scoresAt(std::size_t i);

private:
  // Private helper; presumably loads the data for the current position and
  // is tracked by m_primed -- TODO confirm.
  void prime();
  // Presumably the number of input lists (feature/score file pairs) --
  // TODO confirm.
  std::size_t m_num_lists;
  // Presumably the id that cur_id() reports -- TODO confirm.
  std::size_t m_sentenceId;
  // File names held by value.
  std::vector<std::string> m_featureFiles;
  std::vector<std::string> m_scoreFiles;

  // Presumably records whether prime() has run for the current position --
  // TODO confirm.
  bool m_primed;
  // NOTE(review): declared as int although num_dense() returns std::size_t;
  // the signed type may be deliberate (e.g. a negative "not yet known"
  // marker) -- check the .cpp before changing it.
  int m_iNumDense;
  // Iterator objects; presumably one per entry of the matching file list --
  // TODO confirm.
  std::vector<FeatureDataIterator>  m_featureDataIters;
  std::vector<ScoreDataIterator>    m_scoreDataIters;
  // Pairs of indexes describing the current pack. What each half of the
  // pair addresses is not visible in this header -- TODO confirm.
  std::vector<std::pair<std::size_t,std::size_t> > m_current_indexes;
  // Feature vectors held by value; presumably the storage that featuresAt()
  // returns references into -- TODO confirm.
  std::vector<MiraFeatureVector>    m_current_featureVectors;
};

// Instantiation that reads into memory
// High-memory, high-speed, random access
// (Actually randomizes with each call to reset)
// HypPackEnumerator implementation that keeps feature vectors and scores
// in member containers (m_features, m_scores). All member functions are
// defined out of line; nothing about their behaviour is visible in this
// header.
class RandomAccessHypPackEnumerator : public HypPackEnumerator
{
public:
  // Takes the feature-file and score-file names by const reference, plus a
  // no_shuffle flag.
  // NOTE(review): presumably no_shuffle == true disables the re-ordering
  // performed on reset() -- confirm in the .cpp.
  RandomAccessHypPackEnumerator(std::vector<std::string> const& featureFiles,
                                std::vector<std::string> const& scoreFiles,
                                bool no_shuffle);

  // Overrides HypPackEnumerator::num_dense().
  virtual std::size_t num_dense() const;

  // Overrides of the iteration-control functions of HypPackEnumerator.
  virtual void reset();
  virtual bool finished();
  virtual void next();

  // Overrides of the current-pack accessors of HypPackEnumerator.
  virtual std::size_t cur_id();
  virtual std::size_t cur_size();
  virtual const MiraFeatureVector& featuresAt(std::size_t i);
  virtual const ScoreDataItem& scoresAt(std::size_t i);

private:
  // Presumably a stored copy of the constructor's no_shuffle argument --
  // TODO confirm.
  bool m_no_shuffle;
  // Presumably the current position within m_indexes -- TODO confirm.
  std::size_t m_cur_index;
  // Presumably the value num_dense() returns -- TODO confirm.
  std::size_t m_num_dense;
  // Presumably the visiting order of the packs, i.e. indexes into
  // m_features / m_scores -- TODO confirm.
  std::vector<std::size_t> m_indexes;
  // Nested containers: an outer vector of inner vectors of feature vectors
  // and of score items. Presumably the outer index selects a pack and the
  // inner index a hypothesis within it -- TODO confirm.
  std::vector<std::vector<MiraFeatureVector> > m_features;
  std::vector<std::vector<ScoreDataItem> > m_scores;
};

}

#endif // MERT_HYP_PACK_COLLECTION_H

// --Emacs trickery--
// Local Variables:
// mode:c++
// c-basic-offset:2
// End: