File size: 6,515 Bytes
1ce325b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 |
#ifndef UTIL_FILE_PIECE_H
#define UTIL_FILE_PIECE_H
#include "ersatz_progress.hh"
#include "exception.hh"
#include "file.hh"
#include "mmap.hh"
#include "read_compressed.hh"
#include "spaces.hh"
#include "string_piece.hh"
#include <cstddef>
#include <iosfwd>
#include <string>
#include <cassert>
#include <stdint.h>
namespace util {
// Exception type derived from util's Exception.
// NOTE(review): the name suggests it reports text that could not be parsed as
// a number (see FilePiece::ReadFloat and friends) -- confirm against the
// out-of-line constructor.
class ParseNumberException : public Exception {
  public:
    // value is the text associated with the failure.  The constructor is
    // only declared here; its body lives outside this header.
    explicit ParseNumberException(StringPiece value) throw();

    // Nothing to release; declared non-throwing.
    ~ParseNumberException() throw() { }
};
class FilePiece;

// Input iterator over the lines of a FilePiece.  This allows
//   for (StringPiece l : FilePiece("file"))
// in C++11.
// NB: not multipass.
class LineIterator {
  public:
    // Sentinel iterator with no backing file.  delim_ is given a definite
    // value so that copying a default-constructed iterator (FilePiece::end()
    // returns one by value) never reads an uninitialized member.
    LineIterator() : backing_(NULL), delim_('\n') {}

    // Iterate over f using delim as the line separator.  The constructor
    // advances once so the iterator is positioned before first use.
    explicit LineIterator(FilePiece &f, char delim = '\n') : backing_(&f), delim_(delim) {
      ++*this;
    }

    // Advance to the next line.  Defined outside this header.
    LineIterator &operator++();

    // Iterators compare equal when they refer to the same backing FilePiece
    // (or both have none); the current line is not part of the comparison.
    bool operator==(const LineIterator &other) const {
      return backing_ == other.backing_;
    }
    bool operator!=(const LineIterator &other) const {
      return backing_ != other.backing_;
    }

    // True while the iterator still has a backing FilePiece.
    operator bool() const { return backing_ != NULL; }

    // Access the current line.  The StringPiece is returned by value from
    // operator* and by pointer to the stored member from operator->.
    StringPiece operator*() const { return line_; }
    const StringPiece *operator->() const { return &line_; }

  private:
    // NULL for the sentinel iterator.
    FilePiece *backing_;
    // Most recently read line.
    StringPiece line_;
    // Line separator supplied at construction.
    char delim_;
};
// Incremental parser over a file, file descriptor or istream.  The read
// methods return StringPiece views starting at position_.
// Memory backing the returned StringPiece may vanish on the next call.
class FilePiece {
  public:
    // Open the named file.  min_buffer defaults to 1 MB.
    explicit FilePiece(const char *file, std::ostream *show_progress = NULL, std::size_t min_buffer = 1048576);
    // Takes ownership of fd.  name is used for messages.
    explicit FilePiece(int fd, const char *name = NULL, std::ostream *show_progress = NULL, std::size_t min_buffer = 1048576);

    /* Read from an istream.  Don't use this if you can avoid it.  Raw fd IO is
     * much faster.  But sometimes you just have an istream like Boost's HTTP
     * server and want to parse it the same way.
     * name is just used for messages and FileName().
     */
    explicit FilePiece(std::istream &stream, const char *name = NULL, std::size_t min_buffer = 1048576);

    // Iterator pair for range-based for.  end() is a default-constructed
    // LineIterator.
    LineIterator begin() {
      return LineIterator(*this);
    }
    LineIterator end() {
      return LineIterator();
    }

    // Return the next character without consuming it.
    // Throws EndOfFileException when no character is available.
    char peek() {
      if (position_ == position_end_) {
        // Shift() may itself throw EndOfFileException (ReadWordSameLine
        // catches exactly that).
        Shift();
        // Decide on the bytes actually available, the same test SkipSpaces
        // and ReadWordSameLine apply after Shift().  Testing a flag instead
        // could dereference position_end_ or reject readable data.
        if (position_ == position_end_) throw EndOfFileException();
      }
      return *position_;
    }

    // Return the next character and advance past it.
    // Throws EndOfFileException under the same conditions as peek().
    char get() {
      char ret = peek();
      ++position_;
      return ret;
    }

    // Skip leading delimiters, then return the text up to the position found
    // by FindDelimiterOrEOF.  Leaves the delimiter, if any, to be returned by
    // get().  Delimiters are the characters flagged in delim (kSpaces by
    // default).
    StringPiece ReadDelimited(const bool *delim = kSpaces) {
      SkipSpaces(delim);
      return Consume(FindDelimiterOrEOF(delim));
    }

    /// Read word until the line or file ends.
    ///
    /// Returns true and stores the word in `to` when a non-delimiter
    /// character is found before the next '\n'.  Returns false, leaving `to`
    /// untouched, when a '\n' or the end of the file comes first; the '\n'
    /// itself is not consumed.  delim must flag '\n' as a delimiter.
    bool ReadWordSameLine(StringPiece &to, const bool *delim = kSpaces) {
      assert(delim[static_cast<unsigned char>('\n')]);
      // Skip non-enter spaces.
      for (; ; ++position_) {
        if (position_ == position_end_) {
          try {
            Shift();
          } catch (const util::EndOfFileException &) { return false; }
          // And break out at end of file.
          if (position_ == position_end_) return false;
        }
        if (!delim[static_cast<unsigned char>(*position_)]) break;
        if (*position_ == '\n') return false;
      }
      // We can't be at the end of file because there's at least one character open.
      to = Consume(FindDelimiterOrEOF(delim));
      return true;
    }

    /** Read a line of text from the file.
     *
     * Unlike ReadDelimited, this includes leading spaces and consumes the
     * delimiter.  It is similar to getline in that way.
     *
     * If strip_cr is true, any trailing carriage return (as would be found on
     * a file written on Windows) will be left out of the returned line.
     *
     * Throws EndOfFileException if the end of the file is encountered.  If the
     * file does not end in a newline, this could mean that the last line is
     * never read.
     */
    StringPiece ReadLine(char delim = '\n', bool strip_cr = true);

    /** Read a line of text from the file, or return false on EOF.
     *
     * This is like ReadLine, except it returns false where ReadLine throws
     * EndOfFileException.  Like ReadLine it may not read the last line in the
     * file if the file does not end in a newline.
     *
     * If strip_cr is true, any trailing carriage return (as would be found on
     * a file written on Windows) will be left out of the returned line.
     */
    bool ReadLineOrEOF(StringPiece &to, char delim = '\n', bool strip_cr = true);

    // Numeric readers, defined outside this header.
    float ReadFloat();
    double ReadDouble();
    long int ReadLong();
    unsigned long int ReadULong();

    // Advance past characters flagged in delim (kSpaces by default).  Stops at
    // the first unflagged character, or returns quietly when Shift() yields
    // no further data.  An exception thrown by Shift() is not caught here.
    void SkipSpaces(const bool *delim = kSpaces) {
      assert(position_ <= position_end_);
      for (; ; ++position_) {
        if (position_ == position_end_) {
          Shift();
          // And break out at end of file.
          if (position_ == position_end_) return;
        }
        assert(position_ < position_end_);
        if (!delim[static_cast<unsigned char>(*position_)]) return;
      }
    }

    // Current read position: the distance of position_ from data_.begin()
    // plus mapped_offset_.
    uint64_t Offset() const {
      return position_ - data_.begin() + mapped_offset_;
    }

    const std::string &FileName() const { return file_name_; }

    // Force a progress update.
    void UpdateProgress();

  private:
    void InitializeNoRead(const char *name, std::size_t min_buffer);
    // Calls InitializeNoRead, so don't call both.
    void Initialize(const char *name, std::ostream *show_progress, std::size_t min_buffer);

    template <class T> T ReadNumber();

    // Return the text in [position_, to) and move position_ to `to`.
    // `to` must not lie before position_.
    StringPiece Consume(const char *to) {
      assert(to >= position_);
      StringPiece ret(position_, to - position_);
      position_ = to;
      return ret;
    }

    const char *FindDelimiterOrEOF(const bool *delim = kSpaces);

    // Make more data available at position_.  Can throw EndOfFileException.
    void Shift();
    // Backends to Shift().
    void MMapShift(uint64_t desired_begin);
    void TransitionToRead();
    void ReadShift();

    // position_ is the next unread character; position_end_ is one past the
    // last available character.
    const char *position_, *last_space_, *position_end_;

    scoped_fd file_;
    const uint64_t total_size_;

    std::size_t default_map_size_;
    uint64_t mapped_offset_;

    // Order matters: file_ should always be destroyed after this.
    scoped_memory data_;

    bool at_end_;
    bool fallback_to_read_;

    ErsatzProgress progress_;

    std::string file_name_;

    ReadCompressed fell_back_;
};
} // namespace util
#endif // UTIL_FILE_PIECE_H
|