|
|
|
|
|
#ifndef LM_TRIE_SORT_H |
|
#define LM_TRIE_SORT_H |
|
|
|
#include "max_order.hh" |
|
#include "word_index.hh" |
|
|
|
#include "../util/file.hh" |
|
#include "../util/scoped.hh" |
|
|
|
#include <cstddef> |
|
#include <functional> |
|
#include <string> |
|
#include <vector> |
|
|
|
#include <stdint.h> |
|
|
|
namespace util { |
|
class FilePiece; |
|
} |
|
|
|
namespace lm { |
|
class PositiveProbWarn; |
|
namespace ngram { |
|
class SortedVocabulary; |
|
struct Config; |
|
|
|
namespace trie { |
|
|
|
class EntryCompare : public std::binary_function<const void*, const void*, bool> { |
|
public: |
|
explicit EntryCompare(unsigned char order) : order_(order) {} |
|
|
|
bool operator()(const void *first_void, const void *second_void) const { |
|
const WordIndex *first = static_cast<const WordIndex*>(first_void); |
|
const WordIndex *second = static_cast<const WordIndex*>(second_void); |
|
const WordIndex *end = first + order_; |
|
for (; first != end; ++first, ++second) { |
|
if (*first < *second) return true; |
|
if (*first > *second) return false; |
|
} |
|
return false; |
|
} |
|
private: |
|
unsigned char order_; |
|
}; |
|
|
|
class RecordReader { |
|
public: |
|
RecordReader() : remains_(true) {} |
|
|
|
void Init(FILE *file, std::size_t entry_size); |
|
|
|
void *Data() { return data_.get(); } |
|
const void *Data() const { return data_.get(); } |
|
|
|
RecordReader &operator++() { |
|
std::size_t ret = fread(data_.get(), entry_size_, 1, file_); |
|
if (!ret) { |
|
UTIL_THROW_IF(!feof(file_), util::ErrnoException, "Error reading temporary file"); |
|
remains_ = false; |
|
} |
|
return *this; |
|
} |
|
|
|
operator bool() const { return remains_; } |
|
|
|
void Rewind(); |
|
|
|
std::size_t EntrySize() const { return entry_size_; } |
|
|
|
void Overwrite(const void *start, std::size_t amount); |
|
|
|
private: |
|
FILE *file_; |
|
|
|
util::scoped_malloc data_; |
|
|
|
bool remains_; |
|
|
|
std::size_t entry_size_; |
|
}; |
|
|
|
class SortedFiles { |
|
public: |
|
|
|
SortedFiles(const Config &config, util::FilePiece &f, std::vector<uint64_t> &counts, std::size_t buffer, const std::string &file_prefix, SortedVocabulary &vocab); |
|
|
|
int StealUnigram() { |
|
return unigram_.release(); |
|
} |
|
|
|
FILE *Full(unsigned char order) { |
|
return full_[order - 2].get(); |
|
} |
|
|
|
FILE *Context(unsigned char of_order) { |
|
return context_[of_order - 2].get(); |
|
} |
|
|
|
private: |
|
void ConvertToSorted(util::FilePiece &f, const SortedVocabulary &vocab, const std::vector<uint64_t> &counts, const std::string &prefix, unsigned char order, PositiveProbWarn &warn, void *mem, std::size_t mem_size); |
|
|
|
util::scoped_fd unigram_; |
|
|
|
util::scoped_FILE full_[KENLM_MAX_ORDER - 1], context_[KENLM_MAX_ORDER - 1]; |
|
}; |
|
|
|
} |
|
} |
|
} |
|
|
|
#endif |
|
|