File size: 1,209 Bytes
1ce325b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
#include "vocab.hh"

#include <istream>
#include <iostream>

#include <cctype>

namespace lm {
namespace vocab {

// Collect every whitespace-delimited token from `in` into the set `out`.
// The stream's exception mask is set to badbit first, so a badbit condition
// is reported by the stream as an exception rather than silently ending the
// loop.  Reading stops at the first failed extraction (normally end of input).
void ReadSingle(std::istream &in, boost::unordered_set<std::string> &out) {
  in.exceptions(std::istream::badbit);
  for (std::string token; in >> token; ) {
    out.insert(token);
  }
}

namespace {
// Consume whitespace that follows a word and report whether the current line
// is finished.
//
// Returns true when a '\n' is consumed or when the stream stops yielding
// characters (e.g. end of input).  Returns false when a non-whitespace
// character is encountered first; that character is pushed back so the next
// extraction sees it.
bool IsLineEnd(std::istream &in) {
  for (;;) {
    const int ch = in.get();
    // A failed read or an explicit newline both terminate the line.
    if (!in || ch == '\n') return true;
    // First character of the next word on the same line: stop scanning.
    if (!std::isspace(ch)) break;
  }
  in.unget();
  return false;
}
}  // namespace

// Build a posting list for each word: out[word] receives the ids of the lines
// on which the word appears, in increasing order and without repeats for the
// same line.  Words are whitespace separated and lines are newline separated.
// Input is consumed word by word rather than line by line because individual
// lines can be very long.
//
// Line ids start at 0 and advance each time IsLineEnd reports the end of a
// line after a word.  Extraction with >> skips leading whitespace (with the
// default skipws setting), so lines containing no words do not consume an id.
//
// Returns the number of line ids used, counting a final line that ended
// without a trailing newline.
unsigned int ReadMultiple(std::istream &in, boost::unordered_map<std::string, std::vector<unsigned int> > &out) {
  in.exceptions(std::istream::badbit);
  unsigned int line_id = 0;
  // True while the current line id has received a word but the end of that
  // line has not been seen yet.
  bool line_open = false;
  for (std::string token; in >> token; ) {
    std::vector<unsigned int> &lines = out[token];
    const bool already_listed = !lines.empty() && lines.back() == line_id;
    if (!already_listed) {
      lines.push_back(line_id);
    }
    line_open = !IsLineEnd(in);
    if (!line_open) {
      ++line_id;
    }
  }
  return line_open ? line_id + 1 : line_id;
}

} // namespace vocab
} // namespace lm