diff options
author | Sage Weil <sage@newdream.net> | 2017-02-15 23:20:18 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2017-02-15 23:20:18 +0100 |
commit | eb491a13dc2faee315bf894fc2043aacfb94d624 (patch) | |
tree | 35a331693017ca804002be0c64a9a4d88faf5c90 /src/osdc | |
parent | Merge pull request #13445 from liewegas/wip-rgw-thrash (diff) | |
parent | osd: fix backoff vs reset race (diff) | |
download | ceph-eb491a13dc2faee315bf894fc2043aacfb94d624.tar.xz ceph-eb491a13dc2faee315bf894fc2043aacfb94d624.zip |
Merge pull request #13235 from liewegas/wip-pg-split-interval
osd: have clients resend ops on pg split
Reviewed-by: Greg Farnum <gfarnum@redhat.com>
Reviewed-by: Josh Durgin <jdurgin@redhat.com>
Reviewed-by: Samuel Just <sjust@redhat.com>
Diffstat (limited to 'src/osdc')
-rw-r--r-- | src/osdc/Objecter.cc | 189 | ||||
-rw-r--r-- | src/osdc/Objecter.h | 15 |
2 files changed, 119 insertions, 85 deletions
diff --git a/src/osdc/Objecter.cc b/src/osdc/Objecter.cc index ad578a81d1b..d83bcf17b50 100644 --- a/src/osdc/Objecter.cc +++ b/src/osdc/Objecter.cc @@ -857,7 +857,7 @@ void Objecter::_linger_submit(LingerOp *info, shunique_lock& sul) // Populate Op::target OSDSession *s = NULL; - _calc_target(&info->target); + _calc_target(&info->target, nullptr); // Create LingerOp<->OSDSession relation int r = _get_session(info->target.osd, &s, sul); @@ -1069,7 +1069,8 @@ void Objecter::_scan_requests(OSDSession *s, if (pool_full_map) force_resend_writes = force_resend_writes || (*pool_full_map)[op->target.base_oloc.pool]; - int r = _calc_target(&op->target); + int r = _calc_target(&op->target, + op->session ? op->session->con.get() : nullptr); switch (r) { case RECALC_OP_TARGET_NO_ACTION: if (!force_resend && @@ -1297,7 +1298,7 @@ void Objecter::handle_osd_map(MOSDMap *m) p != need_resend_linger.end(); ++p) { LingerOp *op = *p; if (!op->session) { - _calc_target(&op->target); + _calc_target(&op->target, nullptr); OSDSession *s = NULL; int const r = _get_session(op->target.osd, &s, sul); assert(r == 0); @@ -2292,7 +2293,7 @@ void Objecter::_op_submit(Op *op, shunique_lock& sul, ceph_tid_t *ptid) assert(op->session == NULL); OSDSession *s = NULL; - bool check_for_latest_map = _calc_target(&op->target) + bool check_for_latest_map = _calc_target(&op->target, nullptr) == RECALC_OP_TARGET_POOL_DNE; // Try to get a session, including a retry if we need to take write lock @@ -2309,7 +2310,7 @@ void Objecter::_op_submit(Op *op, shunique_lock& sul, ceph_tid_t *ptid) // map changed; recalculate mapping ldout(cct, 10) << __func__ << " relock raced with osdmap, recalc target" << dendl; - check_for_latest_map = _calc_target(&op->target) + check_for_latest_map = _calc_target(&op->target, nullptr) == RECALC_OP_TARGET_POOL_DNE; if (s) { put_session(s); @@ -2655,21 +2656,27 @@ int64_t Objecter::get_object_pg_hash_position(int64_t pool, const string& key, return p->raw_hash_to_pg(p->hash_key(key, ns)); } -int Objecter::_calc_target(op_target_t *t, bool any_change) +int Objecter::_calc_target(op_target_t *t, Connection *con, bool any_change) { // rwlock is locked - bool is_read = t->flags & CEPH_OSD_FLAG_READ; bool is_write = t->flags & CEPH_OSD_FLAG_WRITE; + ldout(cct,20) << __func__ << " base " << t->base_oid << " " << t->base_oloc + << " precalc_pgid " << (int)t->precalc_pgid + << " pgid " << t->base_pgid + << (is_read ? " is_read" : "") + << (is_write ? " is_write" : "") + << dendl; const pg_pool_t *pi = osdmap->get_pg_pool(t->base_oloc.pool); if (!pi) { t->osd = -1; return RECALC_OP_TARGET_POOL_DNE; } + ldout(cct,30) << __func__ << " base pi " << pi + << " pg_num " << pi->get_pg_num() << dendl; bool force_resend = false; - bool need_check_tiering = false; if (osdmap->get_epoch() == pi->last_force_op_resend) { if (t->last_force_resend < pi->last_force_op_resend) { t->last_force_resend = pi->last_force_op_resend; @@ -2678,41 +2685,28 @@ int Objecter::_calc_target(op_target_t *t, bool any_change) force_resend = true; } } - if (t->target_oid.name.empty() || force_resend) { - t->target_oid = t->base_oid; - need_check_tiering = true; - } - if (t->target_oloc.empty() || force_resend) { - t->target_oloc = t->base_oloc; - need_check_tiering = true; - } - if (need_check_tiering && - (t->flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) { + // apply tiering + t->target_oid = t->base_oid; + t->target_oloc = t->base_oloc; + if ((t->flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) { if (is_read && pi->has_read_tier()) t->target_oloc.pool = pi->read_tier; if (is_write && pi->has_write_tier()) t->target_oloc.pool = pi->write_tier; + pi = osdmap->get_pg_pool(t->target_oloc.pool); + if (!pi) { + t->osd = -1; + return RECALC_OP_TARGET_POOL_DNE; + } } pg_t pgid; if (t->precalc_pgid) { - assert(t->base_oid.name.empty()); // make sure this is a listing op - ldout(cct, 10) << __func__ << " have " << t->base_pgid << " pool " - << osdmap->have_pg_pool(t->base_pgid.pool()) << dendl; - if (!osdmap->have_pg_pool(t->base_pgid.pool())) { - t->osd = -1; - return RECALC_OP_TARGET_POOL_DNE; - } - if (osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE)) { - // if the SORTBITWISE flag is set, we know all OSDs are running - // jewel+. - pgid = t->base_pgid; - } else { - // legacy behavior. pre-jewel OSDs will fail if we send a - // full-hash pgid value. - pgid = osdmap->raw_pg_to_pg(t->base_pgid); - } + assert(t->flags & CEPH_OSD_FLAG_IGNORE_OVERLAY); + assert(t->base_oid.name.empty()); // make sure this is a pg op + assert(t->base_oloc.pool == (int64_t)t->base_pgid.pool()); + pgid = t->base_pgid; } else { int ret = osdmap->object_locator_to_pg(t->target_oid, t->target_oloc, pgid); @@ -2721,6 +2715,10 @@ int Objecter::_calc_target(op_target_t *t, bool any_change) return RECALC_OP_TARGET_POOL_DNE; } } + ldout(cct,20) << __func__ << " target " << t->target_oid << " " + << t->target_oloc << " -> pgid " << pgid << dendl; + ldout(cct,30) << __func__ << " target pi " << pi + << " pg_num " << pi->get_pg_num() << dendl; int size = pi->size; int min_size = pi->min_size; @@ -2731,6 +2729,7 @@ int Objecter::_calc_target(op_target_t *t, bool any_change) &acting, &acting_primary); bool sort_bitwise = osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE); unsigned prev_seed = ceph_stable_mod(pgid.ps(), t->pg_num, t->pg_num_mask); + pg_t prev_pgid(prev_seed, pgid.pool()); if (any_change && pg_interval_t::is_new_interval( t->acting_primary, acting_primary, @@ -2748,7 +2747,7 @@ int Objecter::_calc_target(op_target_t *t, bool any_change) pg_num, t->sort_bitwise, sort_bitwise, - pg_t(prev_seed, pgid.pool(), pgid.preferred()))) { + prev_pgid)) { force_resend = true; } @@ -2759,11 +2758,12 @@ int Objecter::_calc_target(op_target_t *t, bool any_change) t->paused = false; need_resend = true; } - if (t->pgid != pgid || is_pg_changed( t->acting_primary, t->acting, acting_primary, acting, t->used_replica || any_change) || + (con && con->has_features(CEPH_FEATUREMASK_RESEND_ON_SPLIT) && + prev_pgid.is_split(t->pg_num, pg_num, nullptr)) || force_resend) { t->pgid = pgid; t->acting = acting; @@ -2774,9 +2774,14 @@ int Objecter::_calc_target(op_target_t *t, bool any_change) t->min_size = min_size; t->pg_num = pg_num; t->pg_num_mask = pi->get_pg_num_mask(); + osdmap->get_primary_shard( + pg_t(ceph_stable_mod(pgid.ps(), t->pg_num, t->pg_num_mask), pgid.pool()), + &t->actual_pgid); t->sort_bitwise = sort_bitwise; ldout(cct, 10) << __func__ << " " - << " pgid " << pgid << " acting " << acting << dendl; + << " raw pgid " << pgid << " -> actual " << t->actual_pgid + << " acting " << acting + << " primary " << acting_primary << dendl; t->used_replica = false; if (acting_primary == -1) { t->osd = -1; @@ -2830,7 +2835,7 @@ int Objecter::_calc_target(op_target_t *t, bool any_change) int Objecter::_map_session(op_target_t *target, OSDSession **s, shunique_lock& sul) { - _calc_target(target); + _calc_target(target, nullptr); return _get_session(target->osd, s, sul); } @@ -2939,8 +2944,7 @@ int Objecter::_recalc_linger_op_target(LingerOp *linger_op, { // rwlock is locked unique - int r = _calc_target(&linger_op->target, - true); + int r = _calc_target(&linger_op->target, nullptr, true); if (r == RECALC_OP_TARGET_NEED_RESEND) { ldout(cct, 10) << "recalc_linger_op_target tid " << linger_op->linger_id << " pgid " << linger_op->target.pgid @@ -3033,9 +3037,9 @@ MOSDOp *Objecter::_prepare_osd_op(Op *op) op->target.paused = false; op->stamp = ceph::mono_clock::now(); + hobject_t hobj = op->target.get_hobj(); MOSDOp *m = new MOSDOp(client_inc.read(), op->tid, - op->target.target_oid, op->target.target_oloc, - op->target.pgid, + hobj, op->target.actual_pgid, osdmap->get_epoch(), flags, op->features); @@ -3047,9 +3051,6 @@ MOSDOp *Objecter::_prepare_osd_op(Op *op) m->set_mtime(op->mtime); m->set_retry_attempt(op->attempts++); - if (op->replay_version != eversion_t()) - m->set_version(op->replay_version); // we're replaying this op! - if (op->priority) m->set_priority(op->priority); else @@ -3072,21 +3073,25 @@ void Objecter::_send_op(Op *op, MOSDOp *m) // backoff? hobject_t hoid = op->target.get_hobj(); - auto q = op->session->backoffs.lower_bound(hoid); - if (q != op->session->backoffs.begin()) { - --q; - if (hoid >= q->second.end) { - ++q; + auto p = op->session->backoffs.find(op->target.actual_pgid); + if (p != op->session->backoffs.end()) { + auto q = p->second.lower_bound(hoid); + if (q != p->second.begin()) { + --q; + if (hoid >= q->second.end) { + ++q; + } } - } - if (q != op->session->backoffs.end()) { - ldout(cct, 20) << __func__ << " ? " << q->first << " [" << q->second.begin - << "," << q->second.end << ")" << dendl; - int r = cmp(hoid, q->second.begin); - if (r == 0 || (r > 0 && hoid < q->second.end)) { - ldout(cct, 10) << __func__ << " backoff on " << hoid << ", queuing " - << op << " tid " << op->tid << dendl; - return; + if (q != p->second.end()) { + ldout(cct, 20) << __func__ << " ? " << q->first << " [" << q->second.begin + << "," << q->second.end << ")" << dendl; + int r = cmp(hoid, q->second.begin); + if (r == 0 || (r > 0 && hoid < q->second.end)) { + ldout(cct, 10) << __func__ << " backoff " << op->target.actual_pgid + << " id " << q->second.id << " on " << hoid + << ", queuing " << op << " tid " << op->tid << dendl; + return; + } } } @@ -3095,7 +3100,16 @@ void Objecter::_send_op(Op *op, MOSDOp *m) m = _prepare_osd_op(op); } - ldout(cct, 15) << "_send_op " << op->tid << " to osd." << op->session->osd + if (op->target.actual_pgid != m->get_spg()) { + ldout(cct, 10) << __func__ << " " << op->tid << " pgid change from " + << m->get_spg() << " to " << op->target.actual_pgid + << ", updating and reencoding" << dendl; + m->set_spg(op->target.actual_pgid); + m->clear_payload(); // reencode + } + + ldout(cct, 15) << "_send_op " << op->tid << " to " + << op->target.actual_pgid << " on osd." << op->session->osd << dendl; ConnectionRef con = op->session->con; @@ -3223,8 +3237,7 @@ void Objecter::handle_osd_op_reply(MOSDOpReply *m) ldout(cct, 7) << "handle_osd_op_reply " << tid << (m->is_ondisk() ? " ondisk" : (m->is_onnvram() ? " onnvram" : " ack")) - << " v " << m->get_replay_version() << " uv " - << m->get_user_version() + << " uv " << m->get_user_version() << " in " << m->get_pg() << " attempt " << m->get_retry_attempt() << dendl; @@ -3414,16 +3427,19 @@ void Objecter::handle_osd_backoff(MOSDBackoff *m) case CEPH_OSD_BACKOFF_OP_BLOCK: { // register - OSDBackoff& b = s->backoffs[m->begin]; + OSDBackoff& b = s->backoffs[m->pgid][m->begin]; s->backoffs_by_id.insert(make_pair(m->id, &b)); + b.pgid = m->pgid; b.id = m->id; b.begin = m->begin; b.end = m->end; - // ack - Message *r = new MOSDBackoff(CEPH_OSD_BACKOFF_OP_ACK_BLOCK, - m->id, m->begin, m->end, - osdmap->get_epoch()); + // ack with original backoff's epoch so that the osd can discard this if + // there was a pg split. + Message *r = new MOSDBackoff(m->pgid, + m->map_epoch, + CEPH_OSD_BACKOFF_OP_ACK_BLOCK, + m->id, m->begin, m->end); // this priority must match the MOSDOps from _prepare_osd_op r->set_priority(cct->_conf->osd_client_op_priority); con->send_message(r); @@ -3433,31 +3449,43 @@ void Objecter::handle_osd_backoff(MOSDBackoff *m) case CEPH_OSD_BACKOFF_OP_UNBLOCK: { auto p = s->backoffs_by_id.find(m->id); - while (p != s->backoffs_by_id.end() && - p->second->id == m->id) { + if (p != s->backoffs_by_id.end()) { OSDBackoff *b = p->second; if (b->begin != m->begin && b->end != m->end) { - lderr(cct) << __func__ << " got id " << m->id << " unblock on [" + lderr(cct) << __func__ << " got " << m->pgid << " id " << m->id + << " unblock on [" << m->begin << "," << m->end << ") but backoff is [" << b->begin << "," << b->end << ")" << dendl; // hrmpf, unblock it anyway. } - ldout(cct, 10) << __func__ << " unblock backoff " << b->id + ldout(cct, 10) << __func__ << " unblock backoff " << b->pgid + << " id " << b->id << " [" << b->begin << "," << b->end << ")" << dendl; - s->backoffs.erase(b->begin); - p = s->backoffs_by_id.erase(p); + auto spgp = s->backoffs.find(b->pgid); + assert(spgp != s->backoffs.end()); + spgp->second.erase(b->begin); + if (spgp->second.empty()) { + s->backoffs.erase(spgp); + } + s->backoffs_by_id.erase(p); // check for any ops to resend for (auto& q : s->ops) { - int r = q.second->target.contained_by(m->begin, m->end); - ldout(cct, 20) << __func__ << " contained_by " << r << " on " - << q.second->target.get_hobj() << dendl; - if (r) { - _send_op(q.second); + if (q.second->target.actual_pgid == m->pgid) { + int r = q.second->target.contained_by(m->begin, m->end); + ldout(cct, 20) << __func__ << " contained_by " << r << " on " + << q.second->target.get_hobj() << dendl; + if (r) { + _send_op(q.second); + } } } + } else { + lderr(cct) << __func__ << " " << m->pgid << " id " << m->id + << " unblock on [" + << m->begin << "," << m->end << ") but backoff dne" << dendl; } } break; @@ -4629,6 +4657,9 @@ int Objecter::_calc_command_target(CommandOp *c, shunique_lock& sul) c->map_check_error = 0; + // ignore overlays, just like we do with pg ops + c->target.flags |= CEPH_OSD_FLAG_IGNORE_OVERLAY; + if (c->target_osd >= 0) { if (!osdmap->exists(c->target_osd)) { c->map_check_error = -ENOENT; @@ -4642,7 +4673,7 @@ int Objecter::_calc_command_target(CommandOp *c, shunique_lock& sul) } c->target.osd = c->target_osd; } else { - int ret = _calc_target(&(c->target), true); + int ret = _calc_target(&(c->target), nullptr, true); if (ret == RECALC_OP_TARGET_POOL_DNE) { c->map_check_error = -ENOENT; c->map_check_error_str = "pool dne"; diff --git a/src/osdc/Objecter.h b/src/osdc/Objecter.h index 8a0a45c3cd7..fc8dce0304c 100644 --- a/src/osdc/Objecter.h +++ b/src/osdc/Objecter.h @@ -1193,7 +1193,8 @@ public: ///< explcit pg target, if any pg_t base_pgid; - pg_t pgid; ///< last pg we mapped to + pg_t pgid; ///< last (raw) pg we mapped to + spg_t actual_pgid; ///< last (actual) spg_t we mapped to unsigned pg_num = 0; ///< last pg_num we mapped to unsigned pg_num_mask = 0; ///< last pg_num_mask we mapped to vector<int> up; ///< set of up osds for last pg we mapped to @@ -1268,7 +1269,6 @@ public: uint64_t ontimeout; ceph_tid_t tid; - eversion_t replay_version; // for op replay int attempts; version_t *objver; @@ -1708,6 +1708,7 @@ public: // -- osd sessions -- struct OSDBackoff { + spg_t pgid; uint64_t id; hobject_t begin, end; }; @@ -1725,8 +1726,8 @@ public: map<ceph_tid_t,CommandOp*> command_ops; // backoffs - map<hobject_t,OSDBackoff> backoffs; - multimap<uint64_t,OSDBackoff*> backoffs_by_id; + map<spg_t,map<hobject_t,OSDBackoff>> backoffs; + map<uint64_t,OSDBackoff*> backoffs_by_id; int osd; int incarnation; @@ -1810,7 +1811,7 @@ public: bool _osdmap_has_pool_full() const; bool target_should_be_paused(op_target_t *op); - int _calc_target(op_target_t *t, + int _calc_target(op_target_t *t, Connection *con, bool any_change = false); int _map_session(op_target_t *op, OSDSession **s, shunique_lock& lc); @@ -2192,7 +2193,9 @@ public: Context *onack, epoch_t *reply_epoch, int *ctx_budget) { Op *o = new Op(object_t(), oloc, - op.ops, flags | global_op_flags.read() | CEPH_OSD_FLAG_READ, + op.ops, + flags | global_op_flags.read() | CEPH_OSD_FLAG_READ | + CEPH_OSD_FLAG_IGNORE_OVERLAY, onack, NULL); o->target.precalc_pgid = true; o->target.base_pgid = pg_t(hash, oloc.pool); |