I have a circular buffer which is backed with file mapped memory (the buffer is in the size range of 8GB-512GB).
I am writing to (8 instances of) this memory in a sequential manner from the beginning to the end at which point it loops around back to the beginning.
It works fine until it reaches the end where it needs to perform two file mappings and loop around the memory, at which point IO performance is totally trashed and doesn't recover (even after several minutes). I can't quite figure it out.
using namespace boost::interprocess;
class mapping
{
public:
mapping()
{
}
mapping(file_mapping& file, mode_t mode, std::size_t file_size, std::size_t offset, std::size_t size)
: offset_(offset)
, mode_(mode)
{
const auto aligned_size = page_ceil(size + page_size());
const auto aligned_file_size = page_floor(file_size);
const auto aligned_file_offset = page_floor(offset % aligned_file_size);
const auto region1_size = std::min(aligned_size, aligned_file_size - aligned_file_offset);
const auto region2_size = aligned_size - region1_size;
if (region2_size)
{
const auto region1_address = mapped_region(file, read_only, 0, (region1_size + region2_size) * 2).get_address();
const auto region2_address = reinterpret_cast<char*>(region1_address) + region1_size;
region1_ = mapped_region(file, mode, aligned_file_offset, region1_size, region1_address);
region2_ = mapped_region(file, mode, 0, region2_size, region2_address);
}
else
{
region1_ = mapped_region(file, mode, aligned_file_offset, region1_size);
region2_ = mapped_region();
}
size_ = region1_.get_size() + region2_.get_size();
offset_ = aligned_file_offset;
}
auto offset() const -> std::size_t { return offset_; }
auto size() const -> std::size_t { return size_; }
auto data() const -> const void* { return region1_.get_address(); }
auto data() -> void* { return region1_.get_address(); }
auto flush(bool async = true) -> void
{
region1_.flush(async);
region2_.flush(async);
}
auto mode() const -> mode_t { return mode_; }
private:
std::size_t offset_ = 0;
std::size_t size_ = 0;
mode_t mode_;
mapped_region region1_;
mapped_region region2_;
};
struct loop_mapping::impl final
{
std::tr2::sys::path file_path_;
file_mapping file_mapping_;
std::size_t file_size_;
std::size_t map_size_ = page_floor(256000000ULL);
std::shared_ptr<mapping> mapping_ = std::shared_ptr<mapping>(new mapping());
std::shared_ptr<mapping> prev_mapping_;
bool write_;
public:
impl(std::tr2::sys::path path, bool write)
: file_path_(std::move(path))
, file_mapping_(file_path_.string().c_str(), write ? read_write : read_only)
, file_size_(page_floor(std::tr2::sys::file_size(file_path_)))
, write_(write)
{
REQUIRE(file_size_ >= map_size_ * 3);
}
~impl()
{
prev_mapping_.reset();
mapping_.reset();
}
auto data(std::size_t offset, std::size_t size, boost::optional<bool> write_opt) -> void*
{
offset = offset % page_floor(file_size_);
REQUIRE(size < file_size_ - map_size_ * 3);
const auto write = write_opt.get_value_or(write_);
REQUIRE(!write || write_);
if ((write && mapping_->mode() == read_only) || offset < mapping_->offset() || offset + size >= mapping_->offset() + mapping_->size())
{
auto new_mapping = std::make_shared<loop::mapping>(file_mapping_, write ? read_write : read_only, file_size_, page_floor(offset), std::max(size + page_size(), map_size_));
if (mapping_)
mapping_->flush((new_mapping->offset() % file_size_) < (mapping_->offset() % file_size_));
if (prev_mapping_)
prev_mapping_->flush(false);
prev_mapping_ = std::move(mapping_);
mapping_ = std::move(new_mapping);
}
return reinterpret_cast<char*>(mapping_->data()) + offset - mapping_->offset();
}
}
-
// 8 processes to 8 different files 128GB each.
loop_mapping loop(...);
for (auto n = 0; true; ++n)
{
auto src = get_new_data(5000000/8);
auto dst = loop.data(n * 5000000/8, 5000000/8, true);
std::memcpy(dst, src, 5000000/8); // This becomes very slow after loop around.
std::this_thread::sleep_for(std::chrono::seconds(1));
}
Any ideas?
Target System:
- 1x 3TB Seagate Constellation ES.3
- 2x Xeon E5-2400 (6 core, 2.6Ghz)
- 6x 8GB DDR3 1600Mhz ECC
- Windows Server 2012