Diffstat (limited to 'src/os/bluestore/Writer.cc')
 -rw-r--r--   src/os/bluestore/Writer.cc | 1425
1 files changed, 1425 insertions, 0 deletions
diff --git a/src/os/bluestore/Writer.cc b/src/os/bluestore/Writer.cc
new file mode 100644
index 00000000000..41bae8d9cef
--- /dev/null
+++ b/src/os/bluestore/Writer.cc
@@ -0,0 +1,1425 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2023 IBM
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "Writer.h"
+#include "include/intarith.h"
+#include "os/bluestore/bluestore_types.h"
+
+std::ostream& operator<<(std::ostream& out, const BlueStore::Writer::blob_data_printer& printer)
+{
+  out << std::hex;
+  uint32_t lof = printer.base_position;
+  for (const auto& q: printer.blobs) {
+    out << " " << lof << "~" << q.real_length;
+    if (q.is_compressed()) {
+      out << "(" << q.compressed_length << ")";
+    }
+    lof += q.real_length;
+  }
+  out << std::dec;
+  return out;
+}
+
+/// Empties range [offset~length] of object o that is in collection c.
+/// Collects unused elements:
+/// released       - sequence of allocation units that are no longer used
+/// pruned_blobs   - set of blobs that are no longer used
+/// shared_changed - set of shared blobs that are modified,
+///                  including the case of shared blob being empty
+/// statfs_delta   - delta of stats
+/// returns: iterator to ExtentMap following last element removed
+BlueStore::extent_map_t::iterator BlueStore::_punch_hole_2(
+  Collection* c,
+  OnodeRef& o,
+  uint32_t offset,
+  uint32_t length,
+  PExtentVector& released,
+  std::vector<BlobRef>& pruned_blobs,      //completely emptied out blobs
+  std::set<SharedBlobRef>& shared_changed, //shared blobs that have changed
+  volatile_statfs& statfs_delta)
+{
+  ExtentMap& emap = o->extent_map;
+  uint32_t end = offset + length;
+  auto p = emap.maybe_split_at(offset);
+  while (p != emap.extent_map.end() && p->logical_offset < end) {
+    // here split tail extent, if needed
+    if (end < p->logical_end()) {
+      p = emap.split_at(p, end);
+      --p;
+    }
+    // here always whole lextent to drop
+    auto& bblob = p->blob->dirty_blob();
+    uint32_t released_size = 0;
+    if (!bblob.is_shared()) {
+      released_size =
+        p->blob->put_ref_accumulate(c, p->blob_offset, p->length, &released);
+    } else {
+      // make sure shared blob is loaded
+      c->load_shared_blob(p->blob->get_shared_blob());
+      // more complicated shared blob release
+      PExtentVector local_released;  //no longer used by local blob
+      PExtentVector shared_released; //no longer used by shared blob too
+      p->blob->put_ref_accumulate(c, p->blob_offset, p->length, &local_released);
+      // filter local release disk regions
+      // through SharedBlob's multi-ref ref_map disk regions
+      bool unshare = false; //is there a chance that shared blob can be unshared?
+ // TODO - make put_ref return released_size directly + for (const auto& de: local_released) { + p->blob->get_shared_blob()->put_ref(de.offset, de.length, &shared_released, &unshare); + } + for (const auto& de : shared_released) { + released_size += de.length; + } + released.insert(released.end(), shared_released.begin(), shared_released.end()); + shared_changed.insert(p->blob->get_shared_blob()); + } + statfs_delta.allocated() -= released_size; + statfs_delta.stored() -= p->length; + if (bblob.is_compressed()) { + statfs_delta.compressed_allocated() -= released_size; + statfs_delta.compressed_original() -= p->length; + if (!bblob.has_disk()) { + statfs_delta.compressed() -= bblob.get_compressed_payload_length(); + } + } + if (!bblob.has_disk()) { + pruned_blobs.push_back(p->blob); + if (p->blob->is_spanning()) { + emap.spanning_blob_map.erase(p->blob->id); + p->blob->id = -1; + } + } + Extent* e = &(*p); + p = emap.extent_map.erase(p); + delete e; + } + return p; +} + + +/// Signals that a range [offset~length] is no longer used. +/// Collects allocation units that became unused into *released_disk. +/// Returns: +/// disk space size to release +uint32_t BlueStore::Blob::put_ref_accumulate( + Collection *coll, + uint32_t offset, + uint32_t length, + PExtentVector *released_disk) +{ + ceph_assert(length > 0); + uint32_t res = 0; + auto [in_blob_offset, in_blob_length] = used_in_blob.put_simple(offset, length); + if (in_blob_length != 0) { + bluestore_blob_t& b = dirty_blob(); + res = b.release_extents(in_blob_offset, in_blob_length, released_disk); + return res; + } + return res; +} + +inline void BlueStore::Blob::add_tail( + uint32_t new_blob_size, + uint32_t min_release_size) +{ + ceph_assert(p2phase(new_blob_size, min_release_size) == 0); + dirty_blob().add_tail(new_blob_size); + used_in_blob.add_tail(new_blob_size, min_release_size); +} + +inline void bluestore_blob_use_tracker_t::init_and_ref( + uint32_t full_length, + uint32_t tracked_chunk) +{ + ceph_assert(p2phase(full_length, tracked_chunk) == 0); + uint32_t _num_au = full_length / tracked_chunk; + au_size = tracked_chunk; + if ( _num_au > 1) { + allocate(_num_au); + for (uint32_t i = 0; i < num_au; i++) { + bytes_per_au[i] = tracked_chunk; + } + } else { + total_bytes = full_length; + } +} + +inline void bluestore_blob_t::allocated_full( + uint32_t length, + PExtentVector&& allocs) +{ + ceph_assert(extents.size() == 0); + extents.swap(allocs); + logical_length = length; +} + +// split data +inline bufferlist split_left(bufferlist& data, uint32_t split_pos) +{ + bufferlist left; + left.substr_of(data, 0, split_pos); + data.splice(0, split_pos); + return left; +} +inline bufferlist split_right(bufferlist& data, uint32_t split_pos) +{ + bufferlist right; + data.splice(split_pos, data.length() - split_pos, &right); + return right; +} + +// should _maybe_expand_blob go to Blob ? 
+inline void BlueStore::Writer::_maybe_expand_blob( + Blob* blob, + uint32_t new_blob_size) +{ + ceph_assert(blob->get_blob().get_logical_length() > 0); + if (blob->get_blob().get_logical_length() < new_blob_size) { + uint32_t min_release_size = blob->get_blob_use_tracker().au_size; + blob->add_tail(new_blob_size, min_release_size); + } +} + +#define dout_context bstore->cct +#define dout_subsys ceph_subsys_bluestore + +//general levels: +// 10 init, fundamental state changes (not present here) +// 15 key functions, important params +// 20 most functions, most params +// 25 all functions, key variables +// 30 prints passing data (not used here) +// modifiers of extent, blob, onode printout: +// +0 nick + sdisk + suse +// +1 nick + sdisk + suse + sbuf +// +2 nick + sdisk + suse + sbuf + schk + attrs +// +3 ptr + disk + use + buf +// +4 ptr + disk + use + chk + buf + attrs +using exmp_it = BlueStore::extent_map_t::iterator; + +uint16_t BlueStore::Writer::debug_level_to_pp_mode(CephContext* cct) { + static constexpr uint16_t modes[5] = { + P::NICK + P::SDISK + P::SUSE, + P::NICK + P::SDISK + P::SUSE + P::SBUF, + P::NICK + P::SDISK + P::SUSE + P::SBUF + P::SCHK + P::ATTRS, + P::PTR + P::DISK + P::USE + P::BUF, + P::PTR + P::DISK + P::USE + P::BUF + P::CHK + P::ATTRS + }; + int level = cct->_conf->subsys.get_gather_level(dout_subsys); + if (level >= 30) return modes[4]; + if (level <= 15) return modes[0]; + return modes[level % 5]; +} + + +inline BlueStore::extent_map_t::iterator BlueStore::Writer::_find_mutable_blob_left( + BlueStore::extent_map_t::iterator it, + uint32_t search_begin, // only interested in blobs that are + uint32_t search_end, // within range [begin - end) + uint32_t mapmust_begin,// for 'unused' case: the area + uint32_t mapmust_end) // [begin - end) must be mapped +{ + extent_map_t& map = onode->extent_map.extent_map; + if (it == map.begin()) { + return map.end(); + } + do { + --it; + if (it->logical_offset < search_begin) break; + if (search_begin > it->blob_start()) continue; + if (it->blob_end() > search_end) continue; + if (it->blob_start() > mapmust_begin) continue; + auto bblob = it->blob->get_blob(); + if (!bblob.is_mutable()) continue; + if (bblob.has_csum()) { + uint32_t mask = (mapmust_begin - it->blob_start()) | + (mapmust_end - it->blob_start()); + if (p2phase(mask, bblob.get_csum_chunk_size()) != 0) continue; + } + if (bblob.has_unused()) { + // very difficult to expand blob with unused (our unused logic is ekhm) + if (it->blob_end() <= mapmust_end) continue; + } + return it; + } while (it != map.begin()); + return map.end(); +} + +inline BlueStore::extent_map_t::iterator BlueStore::Writer::_find_mutable_blob_right( + BlueStore::extent_map_t::iterator it, + uint32_t search_begin, // only interested in blobs that are + uint32_t search_end, // within range [begin - end) + uint32_t mapmust_begin, // for 'unused' case: the area + uint32_t mapmust_end) // [begin - end) must be mapped +{ + extent_map_t& map = onode->extent_map.extent_map; + for (;it != map.end();++it) { + if (it->logical_offset >= search_end) break; + if (search_begin > it->blob_start()) continue; + if (it->blob_end() > search_end) continue; + if (it->blob_start() > mapmust_begin) continue; + auto bblob = it->blob->get_blob(); + if (!bblob.is_mutable()) continue; + if (bblob.has_csum()) { + uint32_t mask = (mapmust_begin - it->blob_start()) | + (mapmust_end - it->blob_start()); + if (p2phase(mask, bblob.get_csum_chunk_size()) != 0) continue; + } + if (bblob.has_unused()) { + // very difficult to 
expand blob with unused (our unused logic is ekhm) + if (it->blob_end() <= mapmust_end) continue; + } + return it; + break; + }; + return map.end(); +} + +void BlueStore::Writer::_get_disk_space( + uint32_t length, + PExtentVector& dst) +{ + while (length > 0) { + ceph_assert(disk_allocs.it->length > 0); + uint32_t s = std::min(length, disk_allocs.it->length - disk_allocs.pos); + length -= s; + dst.emplace_back(disk_allocs.it->offset + disk_allocs.pos, s); + disk_allocs.pos += s; + if (disk_allocs.it->length == disk_allocs.pos) { + ++disk_allocs.it; + disk_allocs.pos = 0; + } + } +} + +inline void BlueStore::Writer::_crop_allocs_to_io( + PExtentVector& disk_extents, + uint32_t crop_front, + uint32_t crop_back) +{ + if (crop_front > 0) { + ceph_assert(disk_extents.front().length > crop_front); + disk_extents.front().offset += crop_front; + disk_extents.front().length -= crop_front; + } + if (crop_back > 0) { + ceph_assert(disk_extents.back().length > crop_back); + disk_extents.back().length -= crop_back; + } +} + +/* +1. _blob_put_data (tool) + Modifies existing blob to contain specific data, does not care + for allocations. Does not check anything. + +2. _blob_put_data_subau + Modifies existing blob on range that is allocated, but 'unused'. + Data is block aligned. No ref++; + +3. _blob_put_data_allocate + Modifies existing blob on unallocated range, puts allocations. + Data is au aligned. No ref++; + +4. _blob_put_data_combined + No reason to combine 2 + 3. + +5. _blob_create_with_data + Create new blob with wctx specs. + Gives blob allocation units. Puts data to blob. Sets unused. + No ref++. + +6. _blob_create_full + Create new blob with wctx specs. + Gives blob allocation units. Puts data to blob. No unused. + Full ref++ done. +*/ + +inline void BlueStore::Writer::_blob_put_data( + Blob* blob, + uint32_t in_blob_offset, + bufferlist disk_data) +{ + auto& bblob = blob->dirty_blob(); + uint32_t in_blob_end = in_blob_offset + disk_data.length(); + // update csum, used_in_blob and unused + if (bblob.has_csum()) { + // calc_csum has fallback for csum == NONE, but is not inlined + bblob.calc_csum(in_blob_offset, disk_data); + } + bblob.mark_used(in_blob_offset, in_blob_end - in_blob_offset); + // do not update ref, we do not know how much of the data is actually used +} + +/// Modifies blob to accomodate new data. +/// For partial AU overwrites only. +/// Requirements: +/// - target range is block aligned +/// - has unused +/// - target range is 'unused' +/// By extension: +/// - csum & tracker are large enough +/// No ref++. +/// Similar to _blob_put_data_allocate, but does not put new allocations +inline void BlueStore::Writer::_blob_put_data_subau( + Blob* blob, + uint32_t in_blob_offset, + bufferlist disk_data) +{ + auto& bblob = blob->dirty_blob(); + uint32_t in_blob_end = in_blob_offset + disk_data.length(); + ceph_assert(bblob.is_mutable()); + //TODO WHY? - ceph_assert(bblob.has_unused()); + //TODO WHY? - ceph_assert(bblob.is_unused(in_blob_offset, in_blob_end - in_blob_offset)); + uint32_t chunk_size = bblob.get_chunk_size(bstore->block_size); + ceph_assert(p2phase(in_blob_offset, chunk_size) == 0); + ceph_assert(p2phase(in_blob_end, chunk_size) == 0); + ceph_assert(bblob.get_logical_length() >= in_blob_end); + _blob_put_data(blob, in_blob_offset, disk_data); +} + + +/// Modifies blob to accomodate new data. +/// For AU aligned operations only. 
+/// Requirements: +/// - blob is mutable +/// - target range is AU aligned +/// - csum and tracker are large enough +/// Calculates csum, clears unused. +/// Moves disk space from disk_allocs to blob. +/// No ref++. +inline void BlueStore::Writer::_blob_put_data_allocate( + Blob* blob, + uint32_t in_blob_offset, + bufferlist disk_data) +{ + dout(25) << __func__ << "@" << std::hex << in_blob_offset + << "~" << disk_data.length() << std::dec << " -> " << blob->print(pp_mode) << dendl; + auto& bblob = blob->dirty_blob(); + uint32_t in_blob_end = in_blob_offset + disk_data.length(); + ceph_assert(bblob.is_mutable()); + ceph_assert(p2phase(in_blob_offset, (uint32_t)bstore->min_alloc_size) == 0); + ceph_assert(p2phase(in_blob_end, (uint32_t)bstore->min_alloc_size) == 0); + ceph_assert(bblob.get_logical_length() >= in_blob_end); + _blob_put_data(blob, in_blob_offset, disk_data); + PExtentVector blob_allocs; + _get_disk_space(in_blob_end - in_blob_offset, blob_allocs); + bblob.allocated(in_blob_offset, in_blob_end - in_blob_offset, blob_allocs); + _schedule_io(blob_allocs, disk_data); + + dout(25) << __func__ << " 0x" << std::hex << disk_data.length() + << "@" << in_blob_offset << std::dec << " -> " + << blob->print(pp_mode) << " no ref yet" << dendl; +} + +/// Modifies blob to accomodate new data. +/// Only operates on new AUs. Takes those AUs from 'disk_allocs'. +/// Requirements: +/// - blob is mutable +/// - target range is csum and tracker aligned +/// - csum and tracker are large enough +/// No AU alignment requirement. +/// Calculates csum, clears unused. +/// No ref++. +/// Very similiar to _blob_put_data_allocate, but also allows for partial AU writes. +/// to newly allocated AUs +inline void BlueStore::Writer::_blob_put_data_subau_allocate( + Blob* blob, + uint32_t in_blob_offset, + bufferlist disk_data) +{ + dout(25) << __func__ << "@" << std::hex << in_blob_offset + << "~" << disk_data.length() << std::dec << " -> " << blob->print(pp_mode) << dendl; + auto& bblob = blob->dirty_blob(); + uint32_t au_size = bstore->min_alloc_size; + uint32_t in_blob_end = in_blob_offset + disk_data.length(); + uint32_t chunk_size = bblob.get_chunk_size(bstore->block_size); + ceph_assert(bblob.is_mutable()); + ceph_assert(p2phase(in_blob_offset, chunk_size) == 0); + ceph_assert(p2phase(in_blob_end, chunk_size) == 0); + ceph_assert(bblob.get_logical_length() >= in_blob_end); + uint32_t in_blob_alloc_offset = p2align(in_blob_offset, au_size); + uint32_t in_blob_alloc_end = p2roundup(in_blob_end, au_size); + _blob_put_data(blob, in_blob_offset, disk_data); + PExtentVector blob_allocs; + _get_disk_space(in_blob_alloc_end - in_blob_alloc_offset, blob_allocs); + bblob.allocated(in_blob_alloc_offset, in_blob_alloc_end - in_blob_alloc_offset, blob_allocs); + PExtentVector& disk_extents = blob_allocs; + _crop_allocs_to_io(disk_extents, in_blob_offset - in_blob_alloc_offset, + in_blob_alloc_end - in_blob_offset - disk_data.length()); + _schedule_io(disk_extents, disk_data); + dout(25) << __func__ << " 0x" << std::hex << disk_data.length() + << "@" << in_blob_offset << std::dec << " -> " + << blob->print(pp_mode) << " no ref yet" << dendl; +} + + +/// Create new blob with wctx specs. +/// Allowed for block and AU alignments. +/// Requirements: +/// - target range is block aligned +/// Calculates csum, sets unused. +/// Moves disk space from disk_allocs to blob. +/// No ref++. 
+BlueStore::BlobRef BlueStore::Writer::_blob_create_with_data( + uint32_t in_blob_offset, + bufferlist& disk_data) +{ + uint32_t block_size = bstore->block_size; + uint32_t min_alloc_size = bstore->min_alloc_size; + ceph_assert(p2phase(in_blob_offset, block_size) == 0); + ceph_assert(p2phase(disk_data.length(), block_size) == 0); + BlobRef blob = onode->c->new_blob(); + bluestore_blob_t &bblob = blob->dirty_blob(); + uint32_t data_length = disk_data.length(); + uint32_t alloc_offset = p2align(in_blob_offset, min_alloc_size); + uint32_t blob_length = p2roundup(in_blob_offset + data_length, min_alloc_size); + uint32_t tracked_unit = min_alloc_size; + uint32_t csum_length_mask = in_blob_offset | data_length; //to find 2^n common denominator + uint32_t csum_order = // conv 8 -> 32 so "<<" does not overflow + std::min<uint32_t>(wctx->csum_order, std::countr_zero(csum_length_mask)); + if (wctx->csum_type != Checksummer::CSUM_NONE) { + bblob.init_csum(wctx->csum_type, csum_order, blob_length); + bblob.calc_csum(in_blob_offset, disk_data); + tracked_unit = std::max(1u << csum_order, min_alloc_size); + } + blob->dirty_blob_use_tracker().init(blob_length, tracked_unit); + PExtentVector blob_allocs; + _get_disk_space(blob_length - alloc_offset, blob_allocs); + bblob.allocated(alloc_offset, blob_length - alloc_offset, blob_allocs); + //^sets also logical_length = blob_length + dout(25) << __func__ << " @0x" << std::hex << in_blob_offset + << "~" << disk_data.length() + << " alloc_offset=" << alloc_offset + << " -> " << blob->print(pp_mode) << dendl; + PExtentVector& disk_extents = blob_allocs; + _crop_allocs_to_io(disk_extents, in_blob_offset - alloc_offset, + blob_length - in_blob_offset - disk_data.length()); + _schedule_io(disk_extents, disk_data); + return blob; +} + +/// Create new blob with wctx specs, fill with data. +/// Requirements: +/// - data is AU aligned +/// Calculates csum, sets unused. +/// Moves disk space from disk_allocs to blob. +/// Full ref done. +BlueStore::BlobRef BlueStore::Writer::_blob_create_full( + bufferlist& disk_data) +{ + uint32_t min_alloc_size = bstore->min_alloc_size; + uint32_t blob_length = disk_data.length(); + ceph_assert(p2phase<uint32_t>(blob_length, bstore->min_alloc_size) == 0); + BlobRef blob = onode->c->new_blob(); + + //uint32_t in_blob_end = disk_data.length(); + bluestore_blob_t &bblob = blob->dirty_blob(); + uint32_t tracked_unit = min_alloc_size; + uint32_t csum_order = // conv 8 -> 32 so "<<" does not overflow + std::min<uint32_t>(wctx->csum_order, std::countr_zero(blob_length)); + if (wctx->csum_type != Checksummer::CSUM_NONE) { + bblob.init_csum(wctx->csum_type, csum_order, blob_length); + bblob.calc_csum(0, disk_data); + tracked_unit = std::max(1u << csum_order, min_alloc_size); + } + //std::cout << "blob_length=" << blob_length << std::endl; + blob->dirty_blob_use_tracker().init_and_ref(blob_length, tracked_unit); + PExtentVector blob_allocs; + _get_disk_space(blob_length, blob_allocs); + _schedule_io(blob_allocs, disk_data); //have to do before move() + bblob.allocated_full(blob_length, std::move(blob_allocs)); + bblob.mark_used(0, blob_length); //todo - optimize; this obviously clears it + return blob; +} + +/** + * Note from developer + * This module tries to keep naming convention: + * 1) Data location in object is named "position/location/begin", not "offset". + * 2) Data location within blob is named "offset". + * 3) Disk location is named "position/location", not "offset". 
+ */
+
+/*
+  note for myself
+  I decided not to mix sub-au writes and normal writes.
+  When there is a sub-au write to a blob there are 2 cases:
+  a) entire write region is "unused"
+     In this case we get a speed up: a direct write
+  b) some part is "used"
+     1) read block-wise and do deferred
+     2) read au-wise and have a choice: deferred / direct
+  end note for myself
+
+  Let's treat the case of 'unused' as special.
+  If such a thing happens, move its execution outside
+  the optimization logic.
+  So, before going to main processing we do 'unused'.
+  Then we crop the data and continue with the rest.
+  It is only the first and last blob that can be unused.
+
+  The real use case of unused is when AU is 64k, block is 4k.
+  There is a difference in expected size of deferred on appends:
+  without it the avg is ~32k, with it only ~2k.
+  The unused feature would be useful if ZERO_OP could reset used->unused,
+  but this one is not easy.
+  This is why we do not bother with considering unused
+  in non-head / non-tail blobs.
+  With the change of default AU 64k->4k, its importance dwindles.
+
+  note for myself
+  Does the presence of blobs with unused impact alignment restrictions?
+  It seems reasonable that expand-read should be to block size.
+  Even if we allocate a larger AU, there is no need to write to the empty part.
+  Overwrite must be deferred or go to unused.
+  Can I just make a determination that unused is an excuse not to do deferred?
+  Or is writing to unused just a signal that reallocation is not an option?
+  Clearly if something is unused, then it does exist.
+  So the write-selection function could determine what to do.
+  But having limitations complicates the optimization algorithm.
+  If I sacrifice optimization of deferred, will I be done?
+
+*/
+
+/**
+ * Transfer to disk modulated by unused() bits
+ *
+ * A blob can have unused() bits; they encode which disk blocks are allocated
+ * but have never been used. Those bits determine whether we can write
+ * directly or a deferred write is required.
+ * The function has a \ref Writer::test_write_divertor bypass for testing purposes.
+ *
+ * disk_position - Location; must be disk block aligned.
+ * data          - Data to write.
+ * mask          - Set of unused() bits, starting from bit 0.
+ * chunk_size    - Size covered by one "mask" bit.
+ */ +inline void BlueStore::Writer::_schedule_io_masked( + uint64_t disk_position, + bufferlist data, + bluestore_blob_t::unused_t mask, + uint32_t chunk_size) +{ + if (test_write_divertor == nullptr) { + int32_t data_left = data.length(); + while (data_left > 0) { + bool chunk_is_unused = (mask & 1) != 0; + bufferlist ddata; + data.splice(0, chunk_size, &ddata); + if (chunk_is_unused) { + bstore->bdev->aio_write(disk_position, ddata, &txc->ioc, false); + bstore->logger->inc(l_bluestore_write_small_unused); + } else { + bluestore_deferred_op_t *op = bstore->_get_deferred_op(txc, ddata.length()); + op->op = bluestore_deferred_op_t::OP_WRITE; + op->extents.emplace_back(bluestore_pextent_t(disk_position, chunk_size)); + op->data = ddata; + bstore->logger->inc(l_bluestore_issued_deferred_writes); + bstore->logger->inc(l_bluestore_issued_deferred_write_bytes, ddata.length()); + } + disk_position += chunk_size; + data_left -= chunk_size; + mask >>= 1; + } + ceph_assert(data_left == 0); + } else { + int32_t data_left = data.length(); + while (data_left > 0) { + bool chunk_is_unused = (mask & 1) != 0; + bufferlist ddata; + data.splice(0, chunk_size, &ddata); + test_write_divertor->write(disk_position, ddata, !chunk_is_unused); + disk_position += chunk_size; + data_left -= chunk_size; + mask >>= 1; + } + ceph_assert(data_left == 0); + } +} + +/** + * Transfer to disk + * + * Initiates transfer of data to disk. + * Depends on \ref Writer::do_deferred to select direct or deferred action. + * If \ref Writer::test_write_divertor bypass is set it overrides default path. + * + * disk_extents - Target disk blocks + * data - Data. + */ +inline void BlueStore::Writer::_schedule_io( + const PExtentVector& disk_extents, + bufferlist data) +{ + if (test_write_divertor == nullptr) { + if (do_deferred) { + bluestore_deferred_op_t *op = bstore->_get_deferred_op(txc, data.length()); + op->op = bluestore_deferred_op_t::OP_WRITE; + op->extents = disk_extents; + op->data = data; + bstore->logger->inc(l_bluestore_issued_deferred_writes); + bstore->logger->inc(l_bluestore_issued_deferred_write_bytes, data.length()); + } else { + for (const auto& loc : disk_extents) { + bufferlist data_chunk; + data.splice(0, loc.length, &data_chunk); + bstore->bdev->aio_write(loc.offset, data_chunk, &txc->ioc, false); + } + ceph_assert(data.length() == 0); + } + } else { + for (const auto& loc: disk_extents) { + bufferlist data_chunk; + data.splice(0, loc.length, &data_chunk); + test_write_divertor->write(loc.offset, data_chunk, do_deferred); + } + ceph_assert(data.length() == 0); + } +} + +/** + * Read part of own data + * + * Rados protocol allows for byte aligned writes. Disk blocks are larger and + * we need to read data that is around to form whole block. + * + * If \ref Writer::test_read_divertor is set it overrides default. 
+ */ +inline bufferlist BlueStore::Writer::_read_self( + uint32_t position, + uint32_t length) +{ + if (test_read_divertor == nullptr) { + bufferlist result; + int r; + r = bstore->_do_read(onode->c, onode, position, length, result); + ceph_assert(r >= 0 && r <= (int)length); + size_t zlen = length - r; + if (zlen) { + result.append_zero(zlen); + bstore->logger->inc(l_bluestore_write_pad_bytes, zlen); + } + bstore->logger->inc(l_bluestore_write_small_pre_read); + return result; + } else { + return test_read_divertor->read(position, length); + } +} + +// used to put data to blobs that does not require allocation +// crops data from bufferlist, +// returns disk pos and length and mask +// or updates wctx does deferred/direct +void BlueStore::Writer::_try_reuse_allocated_l( + exmp_it after_punch_it, // hint, we could have found it ourselves + uint32_t& logical_offset, // will fix value if something consumed + uint32_t ref_end_offset, // limit to ref, if data was padded + blob_data_t& bd) // modified when consumed +{ + uint32_t search_stop = p2align(logical_offset, (uint32_t)wctx->target_blob_size); + uint32_t au_size = bstore->min_alloc_size; + uint32_t block_size = bstore->block_size; + ceph_assert(!bd.is_compressed()); + ceph_assert(p2phase<uint32_t>(logical_offset, au_size) != 0); + BlueStore::ExtentMap& emap = onode->extent_map; + auto it = after_punch_it; + while (it != emap.extent_map.begin()) { + --it; + // first of all, check it we can even use the blob here + if (it->blob_end() < search_stop) break; + if (it->blob_end() <= logical_offset) continue; // need at least something + Blob* b = it->blob.get(); + dout(25) << __func__ << " trying " << b->print(pp_mode) << dendl; + bluestore_blob_t bb = b->dirty_blob(); + if (!bb.is_mutable()) continue; + // all offsets must be aligned to blob chunk_size, + // which is larger of csum and device block granularity + bufferlist& data = bd.disk_data; + uint32_t chunk_size = it->blob->get_blob().get_chunk_size(block_size); + if (p2phase(logical_offset, chunk_size) != 0) continue; + // this blob can handle required granularity + // the blob might, or might not be allocated where we need it + // note we operate on 1 AU max + uint32_t blob_offset = it->blob_start(); + uint32_t want_subau_begin = logical_offset; //it is chunk_size aligned + uint32_t want_subau_end = p2roundup(logical_offset, au_size); + if (logical_offset + data.length() < want_subau_end) { + // we do not have enough data to cut at AU, try chunk + want_subau_end = logical_offset + data.length(); + if (p2phase(want_subau_end, chunk_size) !=0) continue; + } + if (want_subau_begin < it->blob_start()) continue; + if (want_subau_begin >= it->blob_end()) continue; + uint32_t in_blob_offset = want_subau_begin - blob_offset; + uint64_t subau_disk_offset = bb.get_allocation_at(in_blob_offset); + if (subau_disk_offset == bluestore_blob_t::NO_ALLOCATION) continue; + dout(25) << __func__ << " 0x" << std::hex << want_subau_begin << "-" + << want_subau_end << std::dec << " -> " << b->print(pp_mode) << dendl; + uint32_t data_size = want_subau_end - want_subau_begin; + bufferlist data_at_left = split_left(data, data_size); + bd.real_length -= data_size; + uint32_t mask = bb.get_unused_mask(in_blob_offset, data_size, chunk_size); + _blob_put_data_subau(b, in_blob_offset, data_at_left); + // transfer do disk + _schedule_io_masked(subau_disk_offset, data_at_left, mask, chunk_size); + + uint32_t ref_end = std::min(ref_end_offset, want_subau_end); + //fixme/improve - need something without stupid extras - 
that is without coll + b->get_ref(onode->c, in_blob_offset, ref_end - want_subau_begin); + Extent *le = new Extent( + want_subau_begin, in_blob_offset, ref_end - want_subau_begin, it->blob); + dout(20) << __func__ << " new extent " << le->print(pp_mode) << dendl; + emap.extent_map.insert(*le); + + logical_offset += data_size; + break; + } +} + +// used to put data to blobs that does not require allocation +// crops data from bufferlist, +// returns disk pos and length and mask +// or updates wctx does deferred/direct +// AU | AU | AU +// |bl|bl|bl|bl|bl|bl|bl|bl| +// |csum |csum |csum |csum | +// datadatadatadatada case A - input rejected +// tadatadat case B - input rejected +void BlueStore::Writer::_try_reuse_allocated_r( + exmp_it after_punch_it, // hint, we could have found it ourselves + uint32_t& end_offset, // will fix value if something consumed + uint32_t ref_end_offset, // limit to ref, if data was padded + blob_data_t& bd) // modified when consumed +{ + // this function should be called only when its applicable + // that is, data is not compressed and is not AU aligned + uint32_t au_size = bstore->min_alloc_size; + uint32_t block_size = bstore->block_size; + uint32_t blob_size = wctx->target_blob_size; + uint32_t search_end = p2roundup(end_offset, blob_size); + ceph_assert(!bd.is_compressed()); + ceph_assert(p2phase<uint32_t>(end_offset, au_size) != 0); + BlueStore::ExtentMap& emap = onode->extent_map; + for (auto& it = after_punch_it; it != emap.extent_map.end(); ++it) { + // first of all, check it we can even use the blob here + if (it->logical_offset >= search_end) break; + Blob* b = it->blob.get(); + dout(25) << __func__ << " trying " << b->print(pp_mode) << dendl; + bluestore_blob_t bb = b->dirty_blob(); + if (!bb.is_mutable()) continue; + + // all offsets must be aligned to blob chunk_size, + // which is larger of csum and device block granularity + bufferlist& data = bd.disk_data; + uint32_t chunk_size = it->blob->get_blob().get_chunk_size(block_size); + if (p2phase(end_offset, chunk_size) != 0) continue; //case A + uint32_t blob_offset = it->blob_start(); + uint32_t want_subau_begin = p2align(end_offset, au_size); //we operate on 1 AU max + uint32_t want_subau_end = end_offset; //it is chunk_size aligned + if (data.length() < end_offset - want_subau_begin) { + // we do not have enough data to cut at AU, fallback to chunk + want_subau_begin = end_offset - data.length(); + if (p2phase(want_subau_begin, chunk_size) != 0) continue; //case B + } + if (want_subau_begin < it->blob_start()) continue; + if (want_subau_begin >= it->blob_end()) continue; + uint32_t in_blob_offset = want_subau_begin - blob_offset; + uint64_t subau_disk_offset = bb.get_allocation_at(in_blob_offset); + if (subau_disk_offset == bluestore_blob_t::NO_ALLOCATION) continue; + dout(25) << __func__ << " 0x" << std::hex << want_subau_begin << "-" + << want_subau_end << std::dec << " -> " << b->print(pp_mode) << dendl; + uint32_t data_size = want_subau_end - want_subau_begin; + bufferlist data_at_right = split_right(data, data.length() - data_size); + bd.real_length -= data_size; + uint32_t mask = bb.get_unused_mask(in_blob_offset, data_size, chunk_size); + _blob_put_data_subau(b, in_blob_offset, data_at_right); + //transfer to disk + _schedule_io_masked(subau_disk_offset, data_at_right, mask, chunk_size); + + uint32_t ref_end = std::min(ref_end_offset, want_subau_end); + //fixme/improve - need something without stupid extras - that is without coll + b->get_ref(onode->c, in_blob_offset, ref_end - 
want_subau_begin); + Extent *le = new Extent( + want_subau_begin, in_blob_offset, ref_end - want_subau_begin, it->blob); + dout(20) << __func__ << " new extent " << le->print(pp_mode) << dendl; + emap.extent_map.insert(*le); + + end_offset -= data_size; + break; + } +} + +/** + * Export some data to neighboring blobs. + * + * Sometimes punch_hole_2 will clear only part of AU. + * Example: AU = 64K, DiskBlock = 4K, CSUM = 16K. + * Punch_hole_2 will always align to max(DiskBlock, CSUM) and get rid of whole AUs, + * but the boundary ones might need to leave some data intact, leaving some + * space unused. This function tries to use that space. + * + * If possible function cuts portions of data from first and last + * element of blob_data_t sequence. Params logical_offset, end_offset and + * ref_end_offset are updated to reflect data truncation. + * Only uncompressed input data is eligiable for being moved to other blobs. + * + * logical_offset - In-object offset of first byte in bd. + * end_offset - Offset of last byte in bd. + * ref_end_offset - Last byte that should be part of object; ref_end_offset <= end_offset. + * bd - Continous sequence of data blocks to be put to object. + * after_punch_it - Hint from punch_hole_2. + * Blobs to modify will be either left of it (for left search), + * or right of it (for right side search). + */ +void BlueStore::Writer::_try_put_data_on_allocated( + uint32_t& logical_offset, + uint32_t& end_offset, + uint32_t& ref_end_offset, + blob_vec& bd, + exmp_it after_punch_it) +{ + const char* func_name = __func__; + auto print = [&](const char* caption) { + dout(25) << func_name << caption << std::hex << logical_offset << ".." + << end_offset << " ref_end=" << ref_end_offset << " bd="; + uint32_t lof = logical_offset; + for (const auto& q: bd) { + *_dout << " " << lof << "~" << q.disk_data.length(); + lof += q.disk_data.length(); + } + *_dout << std::dec << dendl; + }; + print(" IN "); + ceph_assert(bstore->min_alloc_size != bstore->block_size); + ceph_assert(bd.size() >= 1); + if (!bd[0].is_compressed() && + p2phase<uint32_t>(logical_offset, bstore->min_alloc_size) != 0) { + // check if we have already allocated space to fill + _try_reuse_allocated_l(after_punch_it, logical_offset, ref_end_offset, bd[0]); + } + if (bd[0].real_length == 0) { + bd.erase(bd.begin()); + } + if (logical_offset == end_offset) { + // it is possible that we already consumed all + goto out; + } + print(" MID "); + { + ceph_assert(bd.size() >= 1); + auto &bd_back = bd.back(); + if (!bd_back.is_compressed() && + p2phase<uint32_t>(end_offset, bstore->min_alloc_size) != 0) { + // check if we have some allocated space to fill + _try_reuse_allocated_r(after_punch_it, end_offset, ref_end_offset, bd_back); + } + if (bd_back.real_length == 0) { + bd.erase(bd.end() - 1); + } + } + out: + print(" OUT "); +} + +/** + * Puts data to onode by creating new blobs/extents. + * + * Does not check if data can be merged into other blobs are done. + * Requires that the target region is already emptied (\ref punch_hole_2). + * + * Input data is a continous sequence of blob_data_t segments + * that starts at logical_offset. + * This is the final step in processing write op. + * + * logical_offset - Offset of first blob_data_t element. + * ref_end_offset - Actual data end, it might be earlier then last blob_data_t. + * It happens because we pad data to disk block alignment, + * while we preserve logical range of put data. + * bd_it..bd_end - Sequence of blob_data_t to put. 
+ */ +void BlueStore::Writer::_do_put_new_blobs( + uint32_t logical_offset, + uint32_t ref_end_offset, + blob_vec::iterator& bd_it, + blob_vec::iterator bd_end) +{ + extent_map_t& emap = onode->extent_map.extent_map; + uint32_t blob_size = wctx->target_blob_size; + while (bd_it != bd_end) { + Extent* le; + if (!bd_it->is_compressed()) { + // only 1st blob to write can have blob_location != logical_offset + uint32_t blob_location = p2align(logical_offset, blob_size); + BlobRef new_blob; + uint32_t in_blob_offset = logical_offset - blob_location; + uint32_t ref_end = std::min(ref_end_offset, logical_offset + bd_it->disk_data.length()); + if (blob_location == logical_offset && + bd_it->disk_data.length() >= blob_size && + ref_end_offset - blob_location >= blob_size) { + new_blob = _blob_create_full(bd_it->disk_data); + // all already ref'ed + } else { + new_blob = _blob_create_with_data(in_blob_offset, bd_it->disk_data); + new_blob->get_ref(onode->c, in_blob_offset, ref_end - blob_location - in_blob_offset); + } + le = new Extent( + logical_offset, in_blob_offset, ref_end - logical_offset, new_blob); + dout(20) << __func__ << " new extent+blob " << le->print(pp_mode) << dendl; + emap.insert(*le); + logical_offset = ref_end; + } else { + // compressed + ceph_assert(false); + } + bstore->logger->inc(l_bluestore_write_big); + bstore->logger->inc(l_bluestore_write_big_bytes, le->length); + ++bd_it; + } +} + +void BlueStore::Writer::_do_put_blobs( + uint32_t logical_offset, + uint32_t data_end_offset, + uint32_t ref_end_offset, + blob_vec& bd, + exmp_it after_punch_it) +{ + Collection* coll = onode->c; + extent_map_t& emap = onode->extent_map.extent_map; + uint32_t au_size = bstore->min_alloc_size; + uint32_t blob_size = wctx->target_blob_size; + auto bd_it = bd.begin(); + exmp_it to_it; + uint32_t left_bound = p2align(logical_offset, blob_size); + uint32_t right_bound = p2roundup(logical_offset, blob_size); + // Try to put first data pack to already existing blob + if (!bd_it->is_compressed()) { + // it is thinkable to put the data to some blob + exmp_it left_b = _find_mutable_blob_left( + after_punch_it, left_bound, right_bound, + logical_offset, logical_offset + bd_it->disk_data.length()); + if (left_b != emap.end()) { + uint32_t in_blob_offset = logical_offset - left_b->blob_start(); + uint32_t in_blob_end = in_blob_offset + bd_it->disk_data.length(); + uint32_t data_end_offset = logical_offset + bd_it->disk_data.length(); + _maybe_expand_blob(left_b->blob.get(), p2roundup(in_blob_end, au_size)); + _blob_put_data_subau_allocate( + left_b->blob.get(), in_blob_offset, bd_it->disk_data); + uint32_t ref_end = std::min(ref_end_offset, data_end_offset); + //fixme/improve - need something without stupid extras - that is without coll + left_b->blob->get_ref(coll, in_blob_offset, ref_end - logical_offset); + Extent *le = new Extent( + logical_offset, in_blob_offset, ref_end - logical_offset, left_b->blob); + dout(20) << __func__ << " new extent " << le->print(pp_mode) << dendl; + emap.insert(*le); + logical_offset = ref_end; + ++bd_it; + bstore->logger->inc(l_bluestore_write_small); + bstore->logger->inc(l_bluestore_write_small_bytes, le->length); + } else { + // it is still possible to use first bd and put it into + // blob after punch_hole + // can blob before punch_hole be different then blob after punch_hole ? 
+ } + } + if (bd_it != bd.end()) { + // still something to process + auto back_it = bd.end() - 1; + if (!back_it->is_compressed()) { + // it is thinkable to put the data to some after + uint32_t left_bound = p2align(data_end_offset, blob_size); + uint32_t right_bound = p2roundup(data_end_offset, blob_size); + exmp_it right_b = _find_mutable_blob_right( + after_punch_it, left_bound, right_bound, + data_end_offset - back_it->disk_data.length(), data_end_offset); + if (right_b != emap.end()) { + // Before putting last blob, put all previous; + // it is nicer to have AUs in order. + if (bd_it != back_it) { + // Last blob will be merged, we put blobs without the last. + _do_put_new_blobs(logical_offset, ref_end_offset, bd_it, back_it); + } + uint32_t data_begin_offset = data_end_offset - back_it->disk_data.length(); + uint32_t in_blob_offset = data_begin_offset - right_b->blob_start(); + _maybe_expand_blob(right_b->blob.get(), in_blob_offset + bd_it->disk_data.length()); + _blob_put_data_subau_allocate( + right_b->blob.get(), in_blob_offset, back_it->disk_data); + uint32_t ref_end = std::min(ref_end_offset, data_begin_offset + back_it->disk_data.length()); + //fixme - need something without stupid extras + right_b->blob->get_ref(coll, in_blob_offset, ref_end - data_begin_offset); + Extent *le = new Extent( + data_begin_offset, in_blob_offset, ref_end - data_begin_offset, right_b->blob); + dout(20) << __func__ << " new extent " << le->print(pp_mode) << dendl; + emap.insert(*le); + bd.erase(back_it); //TODO - or other way of limiting end + bstore->logger->inc(l_bluestore_write_small); + bstore->logger->inc(l_bluestore_write_small_bytes, le->length); + } + } + } + + // that's it about blob reuse, now is the time to full blobs + if (bd_it != bd.end()) { + _do_put_new_blobs(logical_offset, ref_end_offset, bd_it, bd.end()); + } +} + +/** + * The idea is to give us a chance to reuse blob. + * To do so, we must have enough to for block/csum/au. + * The decision is to either read or to pad with zeros. + * We return pair: + * first: true = pad with 0s, false = read the region + * second: new logical offset for data + * NOTE: Unlike _write_expand_r expanded punch_hole region + * is always equal to ref'ed region. + * NOTE2: This function can be called without split_at(logical_offset) + * NOTE3: If logical_offset is AU aligned, some blobs have larger csum. + * We ignore them, in result not even wanting to expand. + */ +std::pair<bool, uint32_t> BlueStore::Writer::_write_expand_l( + uint32_t logical_offset) +{ + uint32_t block_size = bstore->block_size; + uint32_t off_stop = p2align<uint32_t>(logical_offset, bstore->min_alloc_size); + // no need to go earlier then one AU + ceph_assert(off_stop != logical_offset); // to prevent superfluous invocation + uint32_t min_off = p2align(logical_offset, block_size); + uint32_t new_data_off = min_off; + bool new_data_pad = true; // unless otherwise stated, we pad + exmp_it it = onode->extent_map.seek_lextent(logical_offset); + // it can be extent in which we are interested in + if (it == onode->extent_map.extent_map.end() || + it->logical_offset >= logical_offset) { + if (it == onode->extent_map.extent_map.begin()) { + goto done; + } + --it; //step back to the first extent to consider + } + do { + if (it->logical_end() < off_stop) { + // Nothing before this point will be interesting. + // Not found blob to adapt to. 
+ break; + } + if (!it->blob->get_blob().is_mutable()) { + new_data_pad = false; // we have to read data here + if (it == onode->extent_map.extent_map.begin()) break; + --it; + continue; + } + // we take first blob that we can + uint32_t can_off = p2align<uint32_t>(logical_offset, it->blob->get_blob().get_chunk_size(block_size)); + // ^smallest stop point that blob can accomodate + off_stop = can_off; + new_data_off = can_off; + // the blob is mapped, so it has space for at least up to begin of AU@logical_offset + if (it->logical_offset < logical_offset && logical_offset < it->logical_end()) { + // ^ this only works for the first extent we check + new_data_pad = false; + } else { + if (it->logical_end() <= can_off) { + // we have a fortunate area in blob that was mapped but not used + // the new_data_pad here depends on whether we have visited immutable blobs + } else { + // interested in using this blob, but there is data, must read + new_data_pad = false; + //^ read means we must expand punch_hole / ref, but not outside object size + } + } + } while ((it != onode->extent_map.extent_map.begin()) && (--it, true)); + done: + dout(25) << __func__ << std::hex << " logical_offset=0x" << logical_offset + << " -> 0x" << new_data_off << (new_data_pad ? " pad" : " read") << dendl; + return std::make_pair(new_data_pad, new_data_off); +} + +/** + * The idea is to give us a chance to reuse blob. + * To do so, we must have enough to for block/csum/au. + * The decision is to either read or to pad with zeros. + * We return pair: + * first: true = pad with 0s, false = read the region + * second: new end offset for data + * NOTE: When we pad with 0s, we do not expand ref range. + * When we read, we expand ref range. + * Ref range cannot to outside object size. + * NOTE2: This function can be called without split_at(end_offset) + * NOTE3: If logical_offset is AU aligned, some blobs have larger csum. + * We ignore them, in result not even wanting to expand. + */ +std::pair<bool, uint32_t> BlueStore::Writer::_write_expand_r( + uint32_t end_offset) +{ + uint32_t block_size = bstore->block_size; + uint32_t end_stop = p2roundup<uint32_t>(end_offset, bstore->min_alloc_size); + // no need to go further then one AU, since new blob it if happens can allocate one AU + ceph_assert(end_stop != end_offset); // to prevent superfluous invocation + uint32_t min_end = p2roundup(end_offset, block_size); + uint32_t new_data_end = min_end; + bool new_data_pad = true; // unless otherwise stated, we pad + exmp_it it = onode->extent_map.seek_lextent(end_offset); + for (; it != onode->extent_map.extent_map.end(); ++it) { + if (it->logical_offset >= end_stop) { + // nothing beyond this point is interesting + // no blob should have an free AU outside its logical mapping + // This is failure in reuse search. + break; + } + if (!it->blob->get_blob().is_mutable()) { + new_data_pad = false; //must read... 
+ continue; + } + // if at end_offset is something then this blob certainly qualifies + // we take first blob that we can + uint32_t can_end = p2roundup<uint32_t>(end_offset, it->blob->get_blob().get_chunk_size(block_size)); + // ^smallest stop point that blob can accomodate + end_stop = can_end; + new_data_end = can_end; + // the blob is mapped, so it has space for at least up to end of AU@end_offset + if (it->logical_offset <= end_offset && end_offset < it->logical_end()) { + // ^ this only works for the first extent we check + new_data_pad = false; + //^ read means we must expand punch_hole / ref, but not outside object size + } else { + if (can_end <= it->logical_offset) { + // we have a fortunate area in blob that was mapped but not used + // the new_data_pad here depends on whether we have visited immutable blobs + } else { + // interested in using this blob, but there is data, must read + new_data_pad = false; + //^ read means we must expand punch_hole / ref, but not outside object size + } + } + } + dout(25) << __func__ << std::hex << " end_offset=0x" << end_offset + << " -> 0x" << new_data_end << (new_data_pad ? " pad" : " read") << dendl; + return std::make_pair(new_data_pad, new_data_end); +} + + + +// This function is a centralized place to make a decision on +// whether to use deferred or direct writes. +// The assumption behind it is that having parts of write executed as +// deferred and other parts as direct is suboptimal in any case. +void BlueStore::Writer::_defer_or_allocate(uint32_t need_size) +{ + // make a deferred decision + uint32_t released_size = 0; + for (const auto& r : released) { + released_size += r.length; + } + uint32_t au_size = bstore->min_alloc_size; + do_deferred = need_size <= released_size && released_size < bstore->prefer_deferred_size; + dout(15) << __func__ << " released=0x" << std::hex << released_size + << " need=0x" << need_size << std::dec + << (do_deferred ? 
" deferred" : " direct") << dendl; + + if (do_deferred) { + disk_allocs.it = released.begin(); + statfs_delta.allocated() += need_size; + disk_allocs.pos = 0; + } else { + int64_t new_alloc_size = bstore->alloc->allocate(need_size, au_size, 0, 0, &allocated); + ceph_assert(need_size == new_alloc_size); + statfs_delta.allocated() += new_alloc_size; + disk_allocs.it = allocated.begin(); + disk_allocs.pos = 0; + } +} + +// data (input) is split into chunks bd (output) +// data is emptied as a result +void BlueStore::Writer::_split_data( + uint32_t location, + bufferlist& data, + blob_vec& bd) +{ + ceph_assert(bd.empty()); + bd.reserve(data.length() / wctx->target_blob_size + 2); + auto lof = location; + uint32_t end_offset = location + data.length(); + while (lof < end_offset) { + uint32_t p = p2remain<uint32_t>(lof, wctx->target_blob_size); + if (p > end_offset - lof) p = end_offset - lof; + bufferlist tmp; + data.splice(0, p, &tmp); + bd.emplace_back(blob_data_t{p, 0, tmp, tmp}); + lof += p; + } +} + +void BlueStore::Writer::_align_to_disk_block( + uint32_t& location, + uint32_t& data_end, + blob_vec& blobs) +{ + ceph_assert(!blobs.empty()); + uint32_t au_size = bstore->min_alloc_size; + bool left_do_pad; + bool right_do_pad; + uint32_t left_location; + uint32_t right_location; + if (p2phase(location, au_size) != 0) { + blob_data_t& first_blob = blobs.front(); + if (!first_blob.is_compressed()) { + // try to make at least disk block aligned + std::tie(left_do_pad, left_location) = _write_expand_l(location); + if (left_location < location) { + bufferlist tmp; + if (left_do_pad) { + tmp.append_zero(location - left_location); + bstore->logger->inc(l_bluestore_write_pad_bytes, location - left_location); + } else { + tmp = _read_self(left_location, location - left_location); + } + tmp.claim_append(first_blob.disk_data); + first_blob.disk_data.swap(tmp); + first_blob.real_length += location - left_location; + location = left_location; + } + } + } + if (p2phase(data_end, au_size) != 0) { + blob_data_t& last_blob = blobs.back(); + if (!last_blob.is_compressed()) { + // try to make at least disk block aligned + std::tie(right_do_pad, right_location) = _write_expand_r(data_end); + if (data_end < right_location) { + // TODO - when we right-expand because of some blob csum restriction, it is possible + // we will be left-blob-csum-unaligned. It is wasted space. + // Think if we want to fix it. + if (right_do_pad) { + last_blob.disk_data.append_zero(right_location - data_end); + bstore->logger->inc(l_bluestore_write_pad_bytes, right_location - data_end); + } else { + bufferlist tmp; + tmp = _read_self(data_end, right_location - data_end); + last_blob.disk_data.append(tmp); + } + last_blob.real_length += right_location - data_end; + } + data_end = right_location; + } + } +} + +// Writes uncompressed data. 
+void BlueStore::Writer::do_write( + uint32_t location, + bufferlist& data) +{ + do_deferred = false; + disk_allocs.it = allocated.end(); + disk_allocs.pos = 0; + dout(20) << __func__ << " 0x" << std::hex << location << "~" << data.length() << dendl; + dout(25) << "on: " << onode->print(pp_mode) << dendl; + blob_vec bd; + uint32_t ref_end = location + data.length(); + uint32_t data_end = location + data.length(); + _split_data(location, data, bd); + _align_to_disk_block(location, data_end, bd); + if (ref_end < onode->onode.size) { + ref_end = std::min<uint32_t>(data_end, onode->onode.size); + } + dout(20) << "blobs to put:" << blob_data_printer(bd, location) << dendl; + statfs_delta.stored() += ref_end - location; + exmp_it after_punch_it = + bstore->_punch_hole_2(onode->c, onode, location, data_end - location, + released, pruned_blobs, txc->shared_blobs, statfs_delta); + dout(25) << "after punch_hole_2: " << std::endl << onode->print(pp_mode) << dendl; + + // todo: if we align to disk block before splitting, we could do it in one go + uint32_t pos = location; + for (auto& b : bd) { + bstore->_buffer_cache_write(this->txc, onode, pos, b.disk_data, + wctx->buffered ? 0 : Buffer::FLAG_NOCACHE); + pos += b.disk_data.length(); + } + ceph_assert(pos == data_end); + + uint32_t au_size = bstore->min_alloc_size; + if (au_size != bstore->block_size) { + _try_put_data_on_allocated(location, data_end, ref_end, bd, after_punch_it); + } + if (location != data_end) { + // make a deferred decision + uint32_t need_size = 0; + uint32_t location_tmp = location; + for (auto& i : bd) { + uint32_t location_end = location_tmp + i.real_length; + need_size += p2roundup(location_end, au_size) - p2align(location_tmp, au_size); + location_tmp = location_end; + } + + _defer_or_allocate(need_size); + _do_put_blobs(location, data_end, ref_end, bd, after_punch_it); + } else { + // Unlikely, but we just put everything. + ceph_assert(bd.size() == 0); + } + if (onode->onode.size < ref_end) + onode->onode.size = ref_end; + _collect_released_allocated(); + // update statfs + txc->statfs_delta += statfs_delta; + dout(25) << "result: " << std::endl << onode->print(pp_mode) << dendl; +} + +/** + * Move allocated and released regions to txc. + * NOTE: Consider in future to directly use variables in txc. + */ +void BlueStore::Writer::_collect_released_allocated() +{ + if (!do_deferred) { + // When we do direct all released is really released. + for (const auto& e : released) { + txc->released.insert(e.offset, e.length); + } + // We do not accept allocating more than really using later. + ceph_assert(disk_allocs.it == allocated.end()); + } else { + // When when we do deferred it is possible to not use all. + // Release the unused rest. + uint32_t pos = disk_allocs.pos; + while (disk_allocs.it != released.end()) { + auto& e = *disk_allocs.it; + dout(15) << "Deferred, some left unused location=0x" + << std::hex << e.offset + pos << "~" << e.length - pos << std::dec << dendl; + txc->released.insert(e.offset + pos, e.length - pos); + pos = 0; + ++disk_allocs.it; + } + } + for (auto e : allocated) { + txc->allocated.insert(e.offset, e.length); + } + released.clear(); + allocated.clear(); +} + +/** + * Debug function that extracts data from BufferSpace buffers. + * Typically it is useless - it is not guaranteed that buffers will not be evicted. 
+ */
+void BlueStore::Writer::debug_iterate_buffers(
+  std::function<void(uint32_t offset, const bufferlist& data)> data_callback)
+{
+  for (const auto& b : onode->bc.buffer_map) {
+    data_callback(b.offset, b.data);
+  }
+}
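
The allocation-sizing logic in do_write() above is driven by the power-of-two helpers pulled in from include/intarith.h (p2align, p2roundup, p2phase). Below is a minimal standalone sketch of that arithmetic; the helper bodies are local stand-ins written under the assumed power-of-two semantics, and the au_size/location/real_length values are made-up examples, not values taken from BlueStore.

// Standalone sketch, not part of Writer.cc: local stand-ins for the
// power-of-two helpers from include/intarith.h (assumed semantics),
// used only to illustrate how do_write() sizes its allocation request.
#include <cstdint>
#include <iostream>

static inline uint32_t p2align(uint32_t x, uint32_t align)   { return x & ~(align - 1); }               // round down
static inline uint32_t p2roundup(uint32_t x, uint32_t align) { return (x + align - 1) & ~(align - 1); } // round up
static inline uint32_t p2phase(uint32_t x, uint32_t align)   { return x & (align - 1); }                // offset within unit

int main() {
  const uint32_t au_size     = 0x10000; // 64K allocation unit (example value)
  const uint32_t location    = 0x12345; // unaligned write position (example value)
  const uint32_t real_length = 0x7000;  // length of one blob_data_t segment (example value)
  // Same shape as the need_size loop in do_write(): round the segment out to AU bounds.
  uint32_t location_end = location + real_length;
  uint32_t need_size = p2roundup(location_end, au_size) - p2align(location, au_size);
  std::cout << std::hex
            << "au span 0x" << p2align(location, au_size)
            << "..0x" << p2roundup(location_end, au_size)
            << " need 0x" << need_size
            << " sub-au phase 0x" << p2phase(location, au_size)
            << std::endl;
  return 0;
}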