/* Memory mapping wrappers.
 * ARM and MinGW ports contributed by Hideo Okuma and Tomoyuki Yoshimura at
 * NICT.
 */
#include "mmap.hh"

#include "exception.hh"
#include "file.hh"
#include "scoped.hh"

#include <algorithm>
#include <iostream>

#include <assert.h>
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>

#if defined(_WIN32) || defined(_WIN64)
#include <windows.h>
#include <io.h>
#else
#include <sys/mman.h>
#include <unistd.h>
#endif

namespace util {

std::size_t SizePage() {
#if defined(_WIN32) || defined(_WIN64)
  SYSTEM_INFO si;
  GetSystemInfo(&si);
  return si.dwAllocationGranularity;
#else
  return sysconf(_SC_PAGE_SIZE);
#endif
}

scoped_mmap::~scoped_mmap() {
  if (data_ != (void*)-1) {
    try {
      // Thanks Denis Filimonov for pointing out NFS likes msync first.
      SyncOrThrow(data_, size_);
      UnmapOrThrow(data_, size_);
    } catch (const util::ErrnoException &e) {
      std::cerr << e.what();
      abort();
    }
  }
}

namespace {
// Round value up to the next multiple of mult, where mult is a power of two.
template <class T> T RoundUpPow2(T value, T mult) {
  return ((value - 1) & ~(mult - 1)) + mult;
}

std::size_t RoundUpSize(const scoped_memory &mem) {
  switch (mem.source()) {
    case scoped_memory::MMAP_ROUND_1G_ALLOCATED:
      return RoundUpPow2<std::size_t>(mem.size(), 1ULL << 30);
    case scoped_memory::MMAP_ROUND_2M_ALLOCATED:
      return RoundUpPow2<std::size_t>(mem.size(), 1ULL << 21);
    case scoped_memory::MMAP_ROUND_PAGE_ALLOCATED:
      return RoundUpPow2(mem.size(), static_cast<std::size_t>(SizePage()));
    default:
      return mem.size();
  }
}
} // namespace

scoped_memory::scoped_memory(std::size_t size, bool zeroed) : data_(NULL), size_(0), source_(NONE_ALLOCATED) {
  HugeMalloc(size, zeroed, *this);
}

void scoped_memory::reset(void *data, std::size_t size, Alloc source) {
  switch (source_) {
    case MMAP_ROUND_1G_ALLOCATED:
    case MMAP_ROUND_2M_ALLOCATED:
    case MMAP_ROUND_PAGE_ALLOCATED:
    case MMAP_ALLOCATED:
      scoped_mmap(data_, RoundUpSize(*this));
      break;
    case MALLOC_ALLOCATED:
      free(data_);
      break;
    case NONE_ALLOCATED:
      break;
  }
  data_ = data;
  size_ = size;
  source_ = source;
}

const int kFileFlags =
#if defined(_WIN32) || defined(_WIN64)
  0 // MapOrThrow ignores flags on windows
#elif defined(MAP_FILE)
  MAP_FILE | MAP_SHARED
#else
  MAP_SHARED
#endif
  ;

void *MapOrThrow(std::size_t size, bool for_write, int flags, bool prefault, int fd, uint64_t offset) {
#ifdef MAP_POPULATE // Linux specific
  if (prefault) {
    flags |= MAP_POPULATE;
  }
#endif
#if defined(_WIN32) || defined(_WIN64)
  int protectC = for_write ? PAGE_READWRITE : PAGE_READONLY;
  int protectM = for_write ? FILE_MAP_WRITE : FILE_MAP_READ;
  uint64_t total_size = size + offset;
  HANDLE hMapping = CreateFileMapping((HANDLE)_get_osfhandle(fd), NULL, protectC, total_size >> 32, static_cast<DWORD>(total_size), NULL);
  UTIL_THROW_IF(!hMapping, ErrnoException, "CreateFileMapping failed");
  LPVOID ret = MapViewOfFile(hMapping, protectM, offset >> 32, offset, size);
  CloseHandle(hMapping);
  UTIL_THROW_IF(!ret, ErrnoException, "MapViewOfFile failed");
#else
  int protect = for_write ? (PROT_READ | PROT_WRITE) : PROT_READ;
  void *ret;
  UTIL_THROW_IF((ret = mmap(NULL, size, protect, flags, fd, offset)) == MAP_FAILED, ErrnoException, "mmap failed for size " << size << " at offset " << offset);
#  ifdef MADV_HUGEPAGE
  /* We like huge pages but it's fine if we can't have them.  Note that huge
   * pages are not supported for file-backed mmap on linux.
   */
  madvise(ret, size, MADV_HUGEPAGE);
#  endif
#endif
  return ret;
}
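/* Illustrative usage sketch (not part of this file's API): mapping an entire
 * file read-only with MapOrThrow above.  OpenReadOrThrow, SizeOrThrow, and
 * scoped_fd are the helpers from file.hh; the file name is made up.
 *
 *   util::scoped_fd fd(util::OpenReadOrThrow("model.bin"));
 *   uint64_t size = util::SizeOrThrow(fd.get());
 *   void *base = util::MapOrThrow(size, false, util::kFileFlags, false, fd.get(), 0);
 *   util::scoped_mmap guard(base, size); // msyncs and munmaps on destruction
 */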
void SyncOrThrow(void *start, size_t length) {
#if defined(_WIN32) || defined(_WIN64)
  UTIL_THROW_IF(!::FlushViewOfFile(start, length), ErrnoException, "Failed to sync mmap");
#else
  UTIL_THROW_IF(length && msync(start, length, MS_SYNC), ErrnoException, "Failed to sync mmap");
#endif
}

void UnmapOrThrow(void *start, size_t length) {
#if defined(_WIN32) || defined(_WIN64)
  UTIL_THROW_IF(!::UnmapViewOfFile(start), ErrnoException, "Failed to unmap a file");
#else
  UTIL_THROW_IF(munmap(start, length), ErrnoException, "munmap failed with " << start << " for length " << length);
#endif
}

// Linux huge pages.
#ifdef __linux__

namespace {

bool TryHuge(std::size_t size, bool populate, uint8_t alignment_bits, scoped_memory::Alloc huge_scheme, scoped_memory &to) {
  // Don't bother with these cases.
  if (size < (1ULL << alignment_bits) || (1ULL << alignment_bits) < SizePage()) return false;

  // First try: Linux >= 3.8 with manually configured hugetlb pages available.
  int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | (alignment_bits << 26 /* This is MAP_HUGE_SHIFT but some headers are too old. */);
  if (populate) flags |= MAP_POPULATE;
  void *ret = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0);
  if (ret != MAP_FAILED) {
    to.reset(ret, size, huge_scheme);
    return true;
  }

  // There weren't pages in a sysadmin-created pool.  Let's get aligned memory
  // and hope transparent huge pages kicks in.  Align to a multiple of the huge
  // page size by overallocating.  I feel bad about doing this, but it's also
  // how posix_memalign is implemented.  And the memory is virtual.

  // Round the requested size up to a multiple of the page size.  This allows
  // the excess pages at the end to be munmapped.
  std::size_t size_up = RoundUpPow2(size, SizePage());
  std::size_t ask = size_up + (1 << alignment_bits) - SizePage();
  // Don't populate because this is asking for more than we will use.
  scoped_mmap larger(mmap(NULL, ask, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0), ask);
  if (larger.get() == MAP_FAILED) return false;

  // Throw out pages before the alignment point.
  uintptr_t base = reinterpret_cast<uintptr_t>(larger.get());
  // Round up to the next multiple of the alignment.
  uintptr_t rounded_up = RoundUpPow2(base, static_cast<uintptr_t>(1) << alignment_bits);
  if (base != rounded_up) {
    // If this throws an exception (which it shouldn't) then we want to unmap
    // the whole thing by keeping it in larger.
    UnmapOrThrow(larger.get(), rounded_up - base);
    larger.steal();
    larger.reset(reinterpret_cast<void*>(rounded_up), ask - (rounded_up - base));
  }

  // Throw out pages after the requested size.
  assert(larger.size() >= size_up);
  if (larger.size() > size_up) {
    // This is where we assume size_up is a multiple of the page size.
    UnmapOrThrow(static_cast<uint8_t*>(larger.get()) + size_up, larger.size() - size_up);
    larger.reset(larger.steal(), size_up);
  }
#ifdef MADV_HUGEPAGE
  madvise(larger.get(), size_up, MADV_HUGEPAGE);
#endif
  to.reset(larger.steal(), size, scoped_memory::MMAP_ROUND_PAGE_ALLOCATED);
  return true;
}

} // namespace
#endif
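/* Worked example of the fallback alignment arithmetic in TryHuge above,
 * assuming 4 KB pages and alignment_bits = 21 (2 MB):
 *   size    = 5 MB  ->  size_up = 5 MB (already a multiple of the page size)
 *   ask     = 5 MB + 2 MB - 4 KB
 * Any page-aligned base that mmap returns contains a 2 MB-aligned address
 * within its first 2 MB - 4 KB bytes, so trimming [base, rounded_up) at the
 * front and everything past rounded_up + size_up at the back always leaves
 * exactly 5 MB of 2 MB-aligned virtual memory.
 */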
void HugeMalloc(std::size_t size, bool zeroed, scoped_memory &to) {
  to.reset();
#ifdef __linux__
  // TODO: architectures/page sizes other than 2^21 and 2^30.
  // Attempt 1 GB pages.
  // If the user asked for zeroed memory, assume they want it populated.
  if (size >= (1ULL << 30) && TryHuge(size, zeroed, 30, scoped_memory::MMAP_ROUND_1G_ALLOCATED, to))
    return;
  // Attempt 2 MB pages.
  if (size >= (1ULL << 21) && TryHuge(size, zeroed, 21, scoped_memory::MMAP_ROUND_2M_ALLOCATED, to))
    return;
#endif // __linux__
  // Non-linux will always do this, as will small allocations on Linux.
  to.reset(zeroed ? calloc(1, size) : malloc(size), size, scoped_memory::MALLOC_ALLOCATED);
  UTIL_THROW_IF(!to.get(), ErrnoException, "Failed to allocate " << size << " bytes");
}

namespace {

#ifdef __linux__
const std::size_t kTransitionHuge = std::max<std::size_t>(1ULL << 21, SizePage());
#endif // __linux__

void ReplaceAndCopy(std::size_t to, bool zero_new, scoped_memory &mem) {
  scoped_memory replacement;
  HugeMalloc(to, zero_new, replacement);
  memcpy(replacement.get(), mem.get(), mem.size());
  // This can't throw.
  mem.reset(replacement.get(), replacement.size(), replacement.source());
  replacement.steal();
}

} // namespace
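/* Illustrative usage sketch (not part of this file): growing a scratch buffer
 * with HugeMalloc and HugeRealloc (below).  The sizes are arbitrary.
 *
 *   util::scoped_memory mem;
 *   util::HugeMalloc(1ULL << 22, true, mem);    // 4 MB, zeroed
 *   // ... fill mem.get() ...
 *   util::HugeRealloc(1ULL << 24, true, mem);   // 16 MB, new tail zeroed
 *
 * On Linux, mmap-backed memory tries mremap() first and falls back to
 * allocate-and-copy; malloc-backed memory is promoted to huge pages once the
 * request reaches kTransitionHuge.
 */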
void HugeRealloc(std::size_t to, bool zero_new, scoped_memory &mem) {
  if (!to) {
    mem.reset();
    return;
  }
  switch (mem.source()) {
    case scoped_memory::NONE_ALLOCATED:
      HugeMalloc(to, zero_new, mem);
      return;
#ifdef __linux__
    // TODO: really need to collapse these cases with a number.
    case scoped_memory::MMAP_ROUND_1G_ALLOCATED:
    case scoped_memory::MMAP_ROUND_2M_ALLOCATED:
    case scoped_memory::MMAP_ROUND_PAGE_ALLOCATED:
    case scoped_memory::MMAP_ALLOCATED:
      // Downsizing below the page-size barrier?
      if (to <= SizePage()) {
        // Fall back to malloc.
        scoped_malloc replacement(malloc(to));
        memcpy(replacement.get(), mem.get(), std::min(to, mem.size()));
        if (zero_new && to > mem.size())
          memset(static_cast<uint8_t*>(replacement.get()) + mem.size(), 0, to - mem.size());
        mem.reset(replacement.release(), to, scoped_memory::MALLOC_ALLOCATED);
      } else {
        // Main path: try to mremap.
        void *new_addr = mremap(mem.get(), RoundUpSize(mem), to, MREMAP_MAYMOVE);
        if (new_addr != MAP_FAILED) {
          scoped_memory::Alloc source(mem.source()); // steal resets mem.source()
          mem.steal(); // let go otherwise reset() will free it first
          mem.reset(new_addr, to, source);
        } else {
          // Reallocating huge pages can fail with EINVAL.
          // https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/mm/mremap.c?id=refs/tags/v3.19#n346
          ReplaceAndCopy(to, zero_new, mem);
        }
      }
      return;
#endif // __linux__
    case scoped_memory::MALLOC_ALLOCATED:
#ifdef __linux__
      // Transition larger allocations to huge pages, but don't keep trying if
      // we're still malloc allocated.
      if (to >= kTransitionHuge && mem.size() < kTransitionHuge) {
        ReplaceAndCopy(to, zero_new, mem);
        return;
      }
#endif // __linux__
      {
        void *new_addr = std::realloc(mem.get(), to);
        UTIL_THROW_IF(!new_addr, ErrnoException, "realloc to " << to << " bytes failed.");
        if (zero_new && to > mem.size())
          memset(static_cast<uint8_t*>(new_addr) + mem.size(), 0, to - mem.size());
        mem.steal();
        mem.reset(new_addr, to, scoped_memory::MALLOC_ALLOCATED);
      }
      return;
    default:
      UTIL_THROW(Exception, "HugeRealloc called with type " << mem.source());
  }
}

void MapRead(LoadMethod method, int fd, uint64_t offset, std::size_t size, scoped_memory &out) {
  switch (method) {
    case LAZY:
      out.reset(MapOrThrow(size, false, kFileFlags, false, fd, offset), size, scoped_memory::MMAP_ALLOCATED);
      break;
    case POPULATE_OR_LAZY:
#ifdef MAP_POPULATE
    case POPULATE_OR_READ:
#endif
      out.reset(MapOrThrow(size, false, kFileFlags, true, fd, offset), size, scoped_memory::MMAP_ALLOCATED);
      break;
#ifndef MAP_POPULATE
    case POPULATE_OR_READ:
#endif
    case READ:
      HugeMalloc(size, false, out);
      SeekOrThrow(fd, offset);
      ReadOrThrow(fd, out.get(), size);
      break;
    case PARALLEL_READ:
      UTIL_THROW(Exception, "Parallel read was removed from this repo.");
      break;
  }
}

void *MapZeroedWrite(int fd, std::size_t size) {
  ResizeOrThrow(fd, 0);
  ResizeOrThrow(fd, size);
  return MapOrThrow(size, true, kFileFlags, false, fd, 0);
}

void *MapZeroedWrite(const char *name, std::size_t size, scoped_fd &file) {
  file.reset(CreateOrThrow(name));
  try {
    return MapZeroedWrite(file.get(), size);
  } catch (ErrnoException &e) {
    e << " in file " << name;
    throw;
  }
}

Rolling::Rolling(const Rolling &copy_from, uint64_t increase) {
  *this = copy_from;
  IncreaseBase(increase);
}

Rolling &Rolling::operator=(const Rolling &copy_from) {
  fd_ = copy_from.fd_;
  file_begin_ = copy_from.file_begin_;
  file_end_ = copy_from.file_end_;
  for_write_ = copy_from.for_write_;
  block_ = copy_from.block_;
  read_bound_ = copy_from.read_bound_;

  current_begin_ = 0;
  if (copy_from.IsPassthrough()) {
    current_end_ = copy_from.current_end_;
    ptr_ = copy_from.ptr_;
  } else {
    // Force call on next mmap.
    current_end_ = 0;
    ptr_ = NULL;
  }
  return *this;
}

Rolling::Rolling(int fd, bool for_write, std::size_t block, std::size_t read_bound, uint64_t offset, uint64_t amount) {
  current_begin_ = 0;
  current_end_ = 0;
  fd_ = fd;
  file_begin_ = offset;
  file_end_ = offset + amount;
  for_write_ = for_write;
  block_ = block;
  read_bound_ = read_bound;
}

void *Rolling::ExtractNonRolling(scoped_memory &out, uint64_t index, std::size_t size) {
  out.reset();
  if (IsPassthrough()) return static_cast<uint8_t*>(get()) + index;
  uint64_t offset = index + file_begin_;
  // Round down to a multiple of the page size.
  uint64_t cruft = offset % static_cast<uint64_t>(SizePage());
  std::size_t map_size = static_cast<std::size_t>(size + cruft);
  out.reset(MapOrThrow(map_size, for_write_, kFileFlags, true, fd_, offset - cruft), map_size, scoped_memory::MMAP_ALLOCATED);
  return static_cast<uint8_t*>(out.get()) + static_cast<std::size_t>(cruft);
}

void Rolling::Roll(uint64_t index) {
  assert(!IsPassthrough());
  std::size_t amount;
  if (file_end_ - (index + file_begin_) > static_cast<uint64_t>(block_)) {
    amount = block_;
    current_end_ = index + amount - read_bound_;
  } else {
    amount = file_end_ - (index + file_begin_);
    current_end_ = index + amount;
  }
  ptr_ = static_cast<uint8_t*>(ExtractNonRolling(mem_, index, amount)) - index;
  current_begin_ = index;
}

} // namespace util
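/* Illustrative usage sketch (not part of this file): reading a file region
 * with MapRead, and a Rolling window over a large file.  OpenReadOrThrow,
 * SizeOrThrow, and scoped_fd are the helpers from file.hh; file names and
 * sizes are made up.
 *
 *   util::scoped_fd fd(util::OpenReadOrThrow("big.dat"));
 *   uint64_t total = util::SizeOrThrow(fd.get());
 *
 *   // Map the whole file, or read it where MAP_POPULATE is unavailable.
 *   util::scoped_memory mem;
 *   util::MapRead(util::POPULATE_OR_READ, fd.get(), 0, total, mem);
 *
 *   // Walk the file in 64 MB blocks; reads may extend up to read_bound = 1024
 *   // bytes past the point at which the window rolls forward.
 *   util::Rolling roll(fd.get(), false, 1ULL << 26, 1024, 0, total);
 */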