diff options
author | Sage Weil <sage@redhat.com> | 2014-12-16 17:49:44 +0100 |
---|---|---|
committer | Sage Weil <sage@redhat.com> | 2014-12-16 17:49:44 +0100 |
commit | eef1705354d7661914dd09adc9627d52e03ac8dc (patch) | |
tree | 72156eafd769bfdfe9dd4e6e34aa2a8e08f7039a | |
parent | Merge pull request #3166 from ceph/wip-librados-fadvise (diff) | |
parent | mon/OSDMonitor: dump inc and full maps we get a crc mismatch (diff) | |
download | ceph-eef1705354d7661914dd09adc9627d52e03ac8dc.tar.xz ceph-eef1705354d7661914dd09adc9627d52e03ac8dc.zip |
Merge pull request #2300 from ceph/wip-osdmap
add osdmap checksums
sage-2014-12-16_06:27:10-rados:singleton-wip-sage-testing2-distro-basic-multi
sage-2014-12-15_13:18:23-rados:thrash-wip-sage-testing2-distro-basic-multi
Reviewed-by: Samuel Just <sjust@redhat.com>
-rw-r--r-- | src/common/config_opts.h | 1 | ||||
-rw-r--r-- | src/include/buffer.h | 2 | ||||
-rw-r--r-- | src/include/ceph_features.h | 20 | ||||
-rw-r--r-- | src/include/ceph_fs.h | 1 | ||||
-rw-r--r-- | src/messages/MMonGetOSDMap.h | 87 | ||||
-rw-r--r-- | src/messages/MOSDMap.h | 7 | ||||
-rw-r--r-- | src/messages/Makefile.am | 1 | ||||
-rw-r--r-- | src/mon/Monitor.cc | 1 | ||||
-rw-r--r-- | src/mon/OSDMonitor.cc | 113 | ||||
-rw-r--r-- | src/mon/OSDMonitor.h | 2 | ||||
-rw-r--r-- | src/msg/Message.cc | 4 | ||||
-rw-r--r-- | src/osd/OSD.cc | 39 | ||||
-rw-r--r-- | src/osd/OSDMap.cc | 130 | ||||
-rw-r--r-- | src/osd/OSDMap.h | 14 | ||||
-rw-r--r-- | src/test/encoding/ceph_dencoder.cc | 6 | ||||
-rw-r--r-- | src/test/encoding/types.h | 4 | ||||
-rw-r--r-- | src/tools/osdmaptool.cc | 2 |
17 files changed, 395 insertions, 39 deletions
diff --git a/src/common/config_opts.h b/src/common/config_opts.h index 35eaf1975f2..102e937a2f0 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -495,6 +495,7 @@ OPTION(osd_map_max_advance, OPT_INT, 200) // make this < cache_size! OPTION(osd_map_cache_size, OPT_INT, 500) OPTION(osd_map_message_max, OPT_INT, 100) // max maps per MOSDMap message OPTION(osd_map_share_max_epochs, OPT_INT, 100) // cap on # of inc maps we send to peers, clients +OPTION(osd_inject_bad_map_crc_probability, OPT_FLOAT, 0) OPTION(osd_op_threads, OPT_INT, 2) // 0 == no threading OPTION(osd_peering_wq_batch_size, OPT_U64, 20) OPTION(osd_op_pq_max_tokens_per_priority, OPT_U64, 4194304) diff --git a/src/include/buffer.h b/src/include/buffer.h index aac2f80a650..e26bfa03f9a 100644 --- a/src/include/buffer.h +++ b/src/include/buffer.h @@ -303,6 +303,8 @@ public: iterator& operator++(); ptr get_current_ptr(); + list& get_bl() { return *bl; } + // copy data out. // note that these all _append_ to dest! void copy(unsigned len, char *dest); diff --git a/src/include/ceph_features.h b/src/include/ceph_features.h index a193d5eb93c..d9594a369e3 100644 --- a/src/include/ceph_features.h +++ b/src/include/ceph_features.h @@ -56,6 +56,10 @@ #define CEPH_FEATURE_OSD_SET_ALLOC_HINT (1ULL<<45) #define CEPH_FEATURE_OSD_FADVISE_FLAGS (1ULL<<46) +#define CEPH_FEATURE_RESERVED2 (1ULL<<61) /* slow down, we are almost out... */ +#define CEPH_FEATURE_RESERVED (1ULL<<62) /* DO NOT USE THIS ... last bit! */ +#define CEPH_FEATURE_RESERVED_BROKEN (1ULL<<63) /* DO NOT USE THIS; see below */ + /* * The introduction of CEPH_FEATURE_OSD_SNAPMAPPER caused the feature * vector to evaluate to 64 bit ~0. To cope, we designate 1ULL << 63 @@ -67,10 +71,10 @@ * and fixed by commit * 4255b5c2fb54ae40c53284b3ab700fdfc7e61748 v0.65-263-g4255b5c */ -#define CEPH_FEATURE_RESERVED (1ULL<<63) +#define CEPH_FEATURE_RESERVED_BROKEN (1ULL<<63) static inline unsigned long long ceph_sanitize_features(unsigned long long f) { - if (f & CEPH_FEATURE_RESERVED) { + if (f & CEPH_FEATURE_RESERVED_BROKEN) { /* everything through OSD_SNAPMAPPER */ return 0x1ffffffffull; } else { @@ -143,4 +147,16 @@ static inline unsigned long long ceph_sanitize_features(unsigned long long f) { CEPH_FEATURE_CRUSH_TUNABLES3 | \ CEPH_FEATURE_CRUSH_V2) +/* + * make sure we don't try to use the reserved features + */ +#define CEPH_STATIC_ASSERT(x) (void)(sizeof(int[((x)==0) ? -1 : 0])) + +static inline void ____build_time_check_for_reserved_bits(void) { + CEPH_STATIC_ASSERT((CEPH_FEATURES_ALL & + (CEPH_FEATURE_RESERVED | + CEPH_FEATURE_RESERVED2 | + CEPH_FEATURE_RESERVED_BROKEN)) == 0); +} + #endif diff --git a/src/include/ceph_fs.h b/src/include/ceph_fs.h index c4776ea6bbd..33a6da141f6 100644 --- a/src/include/ceph_fs.h +++ b/src/include/ceph_fs.h @@ -93,6 +93,7 @@ struct ceph_dir_layout { /* client <-> monitor */ #define CEPH_MSG_MON_MAP 4 #define CEPH_MSG_MON_GET_MAP 5 +#define CEPH_MSG_MON_GET_OSDMAP 6 #define CEPH_MSG_STATFS 13 #define CEPH_MSG_STATFS_REPLY 14 #define CEPH_MSG_MON_SUBSCRIBE 15 diff --git a/src/messages/MMonGetOSDMap.h b/src/messages/MMonGetOSDMap.h new file mode 100644 index 00000000000..859d253f631 --- /dev/null +++ b/src/messages/MMonGetOSDMap.h @@ -0,0 +1,87 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 Red Hat + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_MMONGETOSDMAP_H +#define CEPH_MMONGETOSDMAP_H + +#include "msg/Message.h" + +#include "include/types.h" + +class MMonGetOSDMap : public PaxosServiceMessage { + epoch_t full_first, full_last; + epoch_t inc_first, inc_last; + +public: + MMonGetOSDMap() + : PaxosServiceMessage(CEPH_MSG_MON_GET_OSDMAP, 0), + full_first(0), + full_last(0), + inc_first(0), + inc_last(0) { } +private: + ~MMonGetOSDMap() {} + +public: + void request_full(epoch_t first, epoch_t last) { + assert(last >= first); + full_first = first; + full_last = last; + } + void request_inc(epoch_t first, epoch_t last) { + assert(last >= first); + inc_first = first; + inc_last = last; + } + epoch_t get_full_first() const { + return full_first; + } + epoch_t get_full_last() const { + return full_last; + } + epoch_t get_inc_first() const { + return inc_first; + } + epoch_t get_inc_last() const { + return inc_last; + } + + const char *get_type_name() const { return "mon_get_osdmap"; } + void print(ostream& out) const { + out << "mon_get_osdmap("; + if (full_first && full_last) + out << "full " << full_first << "-" << full_last; + if (inc_first && inc_last) + out << " inc" << inc_first << "-" << inc_last; + out << ")"; + } + + void encode_payload(uint64_t features) { + paxos_encode(); + ::encode(full_first, payload); + ::encode(full_last, payload); + ::encode(inc_first, payload); + ::encode(inc_last, payload); + } + void decode_payload() { + bufferlist::iterator p = payload.begin(); + paxos_decode(p); + ::decode(full_first, p); + ::decode(full_last, p); + ::decode(inc_first, p); + ::decode(inc_last, p); + } +}; + +#endif diff --git a/src/messages/MOSDMap.h b/src/messages/MOSDMap.h index 7ed601c6b36..9c15290b7e9 100644 --- a/src/messages/MOSDMap.h +++ b/src/messages/MOSDMap.h @@ -57,13 +57,10 @@ class MOSDMap : public Message { MOSDMap() : Message(CEPH_MSG_OSD_MAP, HEAD_VERSION) { } - MOSDMap(const uuid_d &f, OSDMap *oc=0) + MOSDMap(const uuid_d &f) : Message(CEPH_MSG_OSD_MAP, HEAD_VERSION), fsid(f), - oldest_map(0), newest_map(0) { - if (oc) - oc->encode(maps[oc->get_epoch()]); - } + oldest_map(0), newest_map(0) { } private: ~MOSDMap() {} diff --git a/src/messages/Makefile.am b/src/messages/Makefile.am index cac40482b00..38593f7e724 100644 --- a/src/messages/Makefile.am +++ b/src/messages/Makefile.am @@ -56,6 +56,7 @@ noinst_HEADERS += \ messages/MMonCommandAck.h \ messages/MMonElection.h \ messages/MMonGetMap.h \ + messages/MMonGetOSDMap.h \ messages/MMonGetVersion.h \ messages/MMonGetVersionReply.h \ messages/MMonGlobalID.h \ diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc index 4c01800057d..52753a62fb4 100644 --- a/src/mon/Monitor.cc +++ b/src/mon/Monitor.cc @@ -3173,6 +3173,7 @@ void Monitor::dispatch(MonSession *s, Message *m, const bool src_is_mon) switch (m->get_type()) { // OSDs + case CEPH_MSG_MON_GET_OSDMAP: case MSG_OSD_MARK_ME_DOWN: case MSG_OSD_FAILURE: case MSG_OSD_BOOT: diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index 67066e6c180..5f41ea875d8 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -31,6 +31,7 @@ #include "messages/MOSDFailure.h" #include "messages/MOSDMarkMeDown.h" #include "messages/MOSDMap.h" +#include "messages/MMonGetOSDMap.h" #include "messages/MOSDBoot.h" #include "messages/MOSDAlive.h" #include "messages/MPoolOp.h" @@ -109,7 +110,9 @@ void OSDMonitor::create_initial() newmap.created = newmap.modified = ceph_clock_now(g_ceph_context); // encode into pending incremental - newmap.encode(pending_inc.fullmap, mon->quorum_features); + newmap.encode(pending_inc.fullmap, mon->quorum_features | CEPH_FEATURE_RESERVED); + pending_inc.full_crc = newmap.get_crc(); + dout(20) << " full crc " << pending_inc.full_crc << dendl; } void OSDMonitor::update_from_paxos(bool *need_bootstrap) @@ -217,9 +220,37 @@ void OSDMonitor::update_from_paxos(bool *need_bootstrap) if (!f) f = -1; bufferlist full_bl; - osdmap.encode(full_bl, f); + osdmap.encode(full_bl, f | CEPH_FEATURE_RESERVED); tx_size += full_bl.length(); + // verify the crc is as expected + if (inc.have_crc && + inc.full_crc != osdmap.crc) { + derr << "inc for epoch " << osdmap.get_epoch() << " has full_crc " + << inc.full_crc << " but actual is " << osdmap.crc + << " features " << f + << dendl; + + derr << "full map dump (crc " << full_bl.crc32c(-1) << "):\n"; + full_bl.hexdump(*_dout); + *_dout << "\ninc map dump (crc " << inc_bl.crc32c(-1) << "):\n"; + inc_bl.hexdump(*_dout); + *_dout << dendl; + bufferlist prev_bl, prev_bl2, again_bl; + get_version_full(osdmap.epoch-1, prev_bl); + OSDMap pristine; + pristine.decode(prev_bl); + pristine.encode(prev_bl2, f | CEPH_FEATURE_RESERVED); + derr << "previous osdmap reload, raw crc " << prev_bl.crc32c(-1) << dendl; + derr << "reencode of that is " << prev_bl2.crc32c(-1) << dendl; + pristine.apply_incremental(inc); + pristine.encode(again_bl, f | CEPH_FEATURE_RESERVED); + derr << "again raw crc is " << again_bl.crc32c(-1) << dendl; + derr << " full_crc " << pristine.crc << dendl; + + assert(0 == "got mismatched crc encoding full map"); + } + put_version_full(t, osdmap.epoch, full_bl); put_version_latest_full(t, osdmap.epoch); @@ -642,9 +673,22 @@ void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t) } } + // determine the new map's crc + OSDMap tmp; + { + tmp.deepish_copy_from(osdmap); + tmp.apply_incremental(pending_inc); + bufferlist t; + ::encode(tmp, t, mon->quorum_features | CEPH_FEATURE_RESERVED); + pending_inc.full_crc = tmp.get_crc(); + } + // encode assert(get_last_committed() + 1 == pending_inc.epoch); - ::encode(pending_inc, bl, mon->quorum_features); + ::encode(pending_inc, bl, mon->quorum_features | CEPH_FEATURE_RESERVED); + + dout(20) << " full_crc " << tmp.get_crc() + << " inc_crc " << pending_inc.inc_crc << dendl; /* put everything in the transaction */ put_version(t, pending_inc.epoch, bl); @@ -746,6 +790,8 @@ bool OSDMonitor::preprocess_query(PaxosServiceMessage *m) // READs case MSG_MON_COMMAND: return preprocess_command(static_cast<MMonCommand*>(m)); + case CEPH_MSG_MON_GET_OSDMAP: + return preprocess_get_osdmap(static_cast<MMonGetOSDMap*>(m)); // damp updates case MSG_OSD_MARK_ME_DOWN: @@ -832,6 +878,32 @@ bool OSDMonitor::should_propose(double& delay) // --------------------------- // READs +bool OSDMonitor::preprocess_get_osdmap(MMonGetOSDMap *m) +{ + dout(10) << __func__ << " " << *m << dendl; + MOSDMap *reply = new MOSDMap(mon->monmap->fsid); + epoch_t first = get_first_committed(); + epoch_t last = osdmap.get_epoch(); + int max = g_conf->osd_map_message_max; + for (epoch_t e = MAX(first, m->get_full_first()); + e < MIN(last, m->get_full_last()) && max > 0; + ++e, --max) { + int r = get_version_full(e, reply->maps[e]); + assert(r >= 0); + } + for (epoch_t e = MAX(first, m->get_inc_first()); + e < MIN(last, m->get_inc_last()) && max > 0; + ++e, --max) { + int r = get_version(e, reply->incremental_maps[e]); + assert(r >= 0); + } + reply->oldest_map = get_first_committed(); + reply->newest_map = osdmap.get_epoch(); + mon->send_reply(m, reply); + m->put(); + return true; +} + // --------------------------- // UPDATEs @@ -1716,7 +1788,8 @@ void OSDMonitor::send_latest(PaxosServiceMessage *m, epoch_t start) MOSDMap *OSDMonitor::build_latest_full() { - MOSDMap *r = new MOSDMap(mon->monmap->fsid, &osdmap); + MOSDMap *r = new MOSDMap(mon->monmap->fsid); + get_version_full(osdmap.get_epoch(), r->maps[osdmap.get_epoch()]); r->oldest_map = get_first_committed(); r->newest_map = osdmap.get_epoch(); return r; @@ -2308,21 +2381,27 @@ bool OSDMonitor::preprocess_command(MMonCommand *m) int64_t epochnum; cmd_getval(g_ceph_context, cmdmap, "epoch", epochnum, (int64_t)0); epoch = epochnum; + if (!epoch) + epoch = osdmap.get_epoch(); - OSDMap *p = &osdmap; - if (epoch) { - bufferlist b; - int err = get_version_full(epoch, b); - if (err == -ENOENT) { - r = -ENOENT; - ss << "there is no map for epoch " << epoch; - goto reply; - } - assert(err == 0); - assert(b.length()); + bufferlist osdmap_bl; + int err = get_version_full(epoch, osdmap_bl); + if (err == -ENOENT) { + r = -ENOENT; + ss << "there is no map for epoch " << epoch; + goto reply; + } + assert(err == 0); + assert(osdmap_bl.length()); + + OSDMap *p; + if (epoch == osdmap.get_epoch()) { + p = &osdmap; + } else { p = new OSDMap; - p->decode(b); + p->decode(osdmap_bl); } + if (prefix == "osd dump") { stringstream ds; if (f) { @@ -2369,7 +2448,7 @@ bool OSDMonitor::preprocess_command(MMonCommand *m) } rdata.append(ds); } else if (prefix == "osd getmap") { - p->encode(rdata, m->get_connection()->get_features()); + rdata.append(osdmap_bl); ss << "got osdmap epoch " << p->get_epoch(); } else if (prefix == "osd getcrushmap") { p->crush->encode(rdata); diff --git a/src/mon/OSDMonitor.h b/src/mon/OSDMonitor.h index 411650dcddc..3f9d578e4ad 100644 --- a/src/mon/OSDMonitor.h +++ b/src/mon/OSDMonitor.h @@ -224,6 +224,8 @@ private: bool check_source(PaxosServiceMessage *m, uuid_d fsid); + bool preprocess_get_osdmap(class MMonGetOSDMap *m); + bool preprocess_mark_me_down(class MOSDMarkMeDown *m); friend class C_AckMarkedDown; diff --git a/src/msg/Message.cc b/src/msg/Message.cc index 7b1e331f086..ecda9a20b66 100644 --- a/src/msg/Message.cc +++ b/src/msg/Message.cc @@ -66,6 +66,7 @@ using namespace std; #include "messages/MOSDSubOp.h" #include "messages/MOSDSubOpReply.h" #include "messages/MOSDMap.h" +#include "messages/MMonGetOSDMap.h" #include "messages/MOSDPGNotify.h" #include "messages/MOSDPGQuery.h" @@ -376,6 +377,9 @@ Message *decode_message(CephContext *cct, ceph_msg_header& header, ceph_msg_foot case CEPH_MSG_MON_GET_MAP: m = new MMonGetMap; break; + case CEPH_MSG_MON_GET_OSDMAP: + m = new MMonGetOSDMap; + break; case CEPH_MSG_MON_GET_VERSION: m = new MMonGetVersion(); break; diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 157661488a4..014e2ab48a0 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -71,6 +71,7 @@ #include "messages/MOSDPGTemp.h" #include "messages/MOSDMap.h" +#include "messages/MMonGetOSDMap.h" #include "messages/MOSDPGNotify.h" #include "messages/MOSDPGQuery.h" #include "messages/MOSDPGLog.h" @@ -6022,9 +6023,6 @@ void OSD::handle_osd_map(MOSDMap *m) return; } - // even if this map isn't from a mon, we may have satisfied our subscription - monc->sub_got("osdmap", last); - // missing some? bool skip_maps = false; if (first > osdmap->get_epoch() + 1) { @@ -6083,8 +6081,7 @@ void OSD::handle_osd_map(MOSDMap *m) OSDMap *o = new OSDMap; if (e > 1) { bufferlist obl; - OSDMapRef prev = get_map(e - 1); - prev->encode(obl); + get_map_bl(e - 1, obl); o->decode(obl); } @@ -6100,7 +6097,27 @@ void OSD::handle_osd_map(MOSDMap *m) last_marked_full = e; bufferlist fbl; - o->encode(fbl); + o->encode(fbl, inc.encode_features | CEPH_FEATURE_RESERVED); + + bool injected_failure = false; + if (g_conf->osd_inject_bad_map_crc_probability > 0 && + (rand() % 10000) < g_conf->osd_inject_bad_map_crc_probability*10000.0) { + derr << __func__ << " injecting map crc failure" << dendl; + injected_failure = true; + } + + if (o->get_crc() != inc.full_crc || injected_failure) { + dout(2) << "got incremental " << e + << " but failed to encode full with correct crc; requesting" + << dendl; + clog->warn() << "failed to encode map e" << e << " with expected crc\n"; + MMonGetOSDMap *req = new MMonGetOSDMap; + req->request_full(e, last); + monc->send_mon_message(req); + last = e - 1; + break; + } + hobject_t fulloid = get_osdmap_pobject_name(e); t.write(META_COLL, fulloid, 0, fbl.length(), fbl); @@ -6112,6 +6129,16 @@ void OSD::handle_osd_map(MOSDMap *m) assert(0 == "MOSDMap lied about what maps it had?"); } + // even if this map isn't from a mon, we may have satisfied our subscription + monc->sub_got("osdmap", last); + + if (last <= osdmap->get_epoch()) { + dout(10) << " no new maps here, dropping" << dendl; + delete _t; + m->put(); + return; + } + if (superblock.oldest_map) { int num = 0; epoch_t min( diff --git a/src/osd/OSDMap.cc b/src/osd/OSDMap.cc index 3af187ba744..864d5542f8a 100644 --- a/src/osd/OSDMap.cc +++ b/src/osd/OSDMap.cc @@ -396,8 +396,19 @@ void OSDMap::Incremental::encode(bufferlist& bl, uint64_t features) const return; } + // only a select set of callers should *ever* be encoding new + // OSDMaps. others should be passing around the canonical encoded + // buffers from on high. select out those callers by passing in an + // "impossible" feature bit. + assert(features & CEPH_FEATURE_RESERVED); + features &= ~CEPH_FEATURE_RESERVED; + + size_t start_offset = bl.length(); + size_t tail_offset; + buffer::list::iterator crc_it; + // meta-encoding: how we include client-used and osd-specific data - ENCODE_START(7, 7, bl); + ENCODE_START(8, 7, bl); { ENCODE_START(3, 1, bl); // client-usable data @@ -441,8 +452,26 @@ void OSDMap::Incremental::encode(bufferlist& bl, uint64_t features) const ENCODE_FINISH(bl); // osd-only data } + ::encode((uint32_t)0, bl); // dummy inc_crc + crc_it = bl.end(); + crc_it.advance(-4); + tail_offset = bl.length(); + + ::encode(full_crc, bl); + ENCODE_FINISH(bl); // meta-encoding wrapper + // fill in crc + bufferlist front; + front.substr_of(bl, start_offset, crc_it.get_off() - start_offset); + inc_crc = front.crc32c(-1); + bufferlist tail; + tail.substr_of(bl, tail_offset, bl.length() - tail_offset); + inc_crc = tail.crc32c(inc_crc); + ceph_le32 crc_le; + crc_le = inc_crc; + crc_it.copy_in(4, (char*)&crc_le); + have_crc = true; } void OSDMap::Incremental::decode_classic(bufferlist::iterator &p) @@ -547,7 +576,11 @@ void OSDMap::Incremental::decode(bufferlist::iterator& bl) * a struct_v < 7, we must rewind to the beginning and use our * classic decoder. */ - DECODE_START_LEGACY_COMPAT_LEN(7, 7, 7, bl); // wrapper + size_t start_offset = bl.get_off(); + size_t tail_offset = 0; + bufferlist crc_front, crc_tail; + + DECODE_START_LEGACY_COMPAT_LEN(8, 7, 7, bl); // wrapper if (struct_v < 7) { int struct_v_size = sizeof(struct_v); bl.advance(-struct_v_size); @@ -608,7 +641,35 @@ void OSDMap::Incremental::decode(bufferlist::iterator& bl) DECODE_FINISH(bl); // osd-only data } + if (struct_v >= 8) { + have_crc = true; + crc_front.substr_of(bl.get_bl(), start_offset, bl.get_off() - start_offset); + ::decode(inc_crc, bl); + tail_offset = bl.get_off(); + ::decode(full_crc, bl); + } else { + have_crc = false; + full_crc = 0; + inc_crc = 0; + } + DECODE_FINISH(bl); // wrapper + + if (have_crc) { + // verify crc + uint32_t actual = crc_front.crc32c(-1); + if (tail_offset < bl.get_off()) { + bufferlist tail; + tail.substr_of(bl.get_bl(), tail_offset, bl.get_off() - tail_offset); + actual = tail.crc32c(actual); + } + if (inc_crc != actual) { + ostringstream ss; + ss << "bad crc, actual " << actual << " != expected " << inc_crc; + string s = ss.str(); + throw buffer::malformed_input(s.c_str()); + } + } } void OSDMap::Incremental::dump(Formatter *f) const @@ -1786,8 +1847,20 @@ void OSDMap::encode(bufferlist& bl, uint64_t features) const encode_classic(bl, features); return; } + + // only a select set of callers should *ever* be encoding new + // OSDMaps. others should be passing around the canonical encoded + // buffers from on high. select out those callers by passing in an + // "impossible" feature bit. + assert(features & CEPH_FEATURE_RESERVED); + features &= ~CEPH_FEATURE_RESERVED; + + size_t start_offset = bl.length(); + size_t tail_offset; + buffer::list::iterator crc_it; + // meta-encoding: how we include client-used and osd-specific data - ENCODE_START(7, 7, bl); + ENCODE_START(8, 7, bl); { ENCODE_START(3, 1, bl); // client-usable data @@ -1847,7 +1920,26 @@ void OSDMap::encode(bufferlist& bl, uint64_t features) const ENCODE_FINISH(bl); // osd-only data } + ::encode((uint32_t)0, bl); // dummy crc + crc_it = bl.end(); + crc_it.advance(-4); + tail_offset = bl.length(); + ENCODE_FINISH(bl); // meta-encoding wrapper + + // fill in crc + bufferlist front; + front.substr_of(bl, start_offset, crc_it.get_off() - start_offset); + crc = front.crc32c(-1); + if (tail_offset < bl.length()) { + bufferlist tail; + tail.substr_of(bl, tail_offset, bl.length() - tail_offset); + crc = tail.crc32c(crc); + } + ceph_le32 crc_le; + crc_le = crc; + crc_it.copy_in(4, (char*)&crc_le); + crc_defined = true; } void OSDMap::decode(bufferlist& bl) @@ -1976,7 +2068,11 @@ void OSDMap::decode(bufferlist::iterator& bl) * a struct_v < 7, we must rewind to the beginning and use our * classic decoder. */ - DECODE_START_LEGACY_COMPAT_LEN(7, 7, 7, bl); // wrapper + size_t start_offset = bl.get_off(); + size_t tail_offset = 0; + bufferlist crc_front, crc_tail; + + DECODE_START_LEGACY_COMPAT_LEN(8, 7, 7, bl); // wrapper if (struct_v < 7) { int struct_v_size = sizeof(struct_v); bl.advance(-struct_v_size); @@ -2043,8 +2139,34 @@ void OSDMap::decode(bufferlist::iterator& bl) DECODE_FINISH(bl); // osd-only data } + if (struct_v >= 8) { + crc_front.substr_of(bl.get_bl(), start_offset, bl.get_off() - start_offset); + ::decode(crc, bl); + tail_offset = bl.get_off(); + crc_defined = true; + } else { + crc_defined = false; + crc = 0; + } + DECODE_FINISH(bl); // wrapper + if (tail_offset) { + // verify crc + uint32_t actual = crc_front.crc32c(-1); + if (tail_offset < bl.get_off()) { + bufferlist tail; + tail.substr_of(bl.get_bl(), tail_offset, bl.get_off() - tail_offset); + actual = tail.crc32c(actual); + } + if (crc != actual) { + ostringstream ss; + ss << "bad crc, actual " << actual << " != expected " << crc; + string s = ss.str(); + throw buffer::malformed_input(s.c_str()); + } + } + post_decode(); } diff --git a/src/osd/OSDMap.h b/src/osd/OSDMap.h index 7d440affd12..1792ea69706 100644 --- a/src/osd/OSDMap.h +++ b/src/osd/OSDMap.h @@ -155,6 +155,10 @@ public: string cluster_snapshot; + mutable bool have_crc; ///< crc values are defined + uint32_t full_crc; ///< crc of the resulting OSDMap + mutable uint32_t inc_crc; ///< crc of this incremental + int get_net_marked_out(const OSDMap *previous) const; int get_net_marked_down(const OSDMap *previous) const; int identify_osd(uuid_d u) const; @@ -169,7 +173,8 @@ public: Incremental(epoch_t e=0) : encode_features(0), - epoch(e), new_pool_max(-1), new_flags(-1), new_max_osd(-1) { + epoch(e), new_pool_max(-1), new_flags(-1), new_max_osd(-1), + have_crc(false), full_crc(0), inc_crc(0) { memset(&fsid, 0, sizeof(fsid)); } Incremental(bufferlist &bl) { @@ -240,7 +245,13 @@ private: string cluster_snapshot; bool new_blacklist_entries; + mutable bool crc_defined; + mutable uint32_t crc; + public: + bool have_crc() const { return crc_defined; } + uint32_t get_crc() const { return crc; } + ceph::shared_ptr<CrushWrapper> crush; // hierarchical map friend class OSDMonitor; @@ -258,6 +269,7 @@ private: osd_uuid(new vector<uuid_d>), cluster_snapshot_epoch(0), new_blacklist_entries(false), + crc_defined(false), crc(0), crush(new CrushWrapper) { memset(&fsid, 0, sizeof(fsid)); } diff --git a/src/test/encoding/ceph_dencoder.cc b/src/test/encoding/ceph_dencoder.cc index 8391f33cb2f..d5068a1a4d4 100644 --- a/src/test/encoding/ceph_dencoder.cc +++ b/src/test/encoding/ceph_dencoder.cc @@ -12,12 +12,14 @@ #define TYPE(t) #define TYPEWITHSTRAYDATA(t) #define TYPE_FEATUREFUL(t) +#define TYPE_FEATUREFUL_STRAYDATA(t) #define TYPE_NOCOPY(t) #define MESSAGE(t) #include "types.h" #undef TYPE #undef TYPEWITHSTRAYDATA #undef TYPE_FEATUREFUL +#undef TYPE_FEATUREFUL_STRAYDATA #undef TYPE_NOCOPY #undef MESSAGE @@ -239,12 +241,14 @@ int main(int argc, const char **argv) #define TYPE(t) dencoders[T_STRINGIFY(t)] = new DencoderImplNoFeature<t>(false); #define TYPEWITHSTRAYDATA(t) dencoders[T_STRINGIFY(t)] = new DencoderImplNoFeature<t>(true); #define TYPE_FEATUREFUL(t) dencoders[T_STRINGIFY(t)] = new DencoderImplFeatureful<t>(false); +#define TYPE_FEATUREFUL_STRAYDATA(t) dencoders[T_STRINGIFY(t)] = new DencoderImplFeatureful<t>(true); #define TYPE_NOCOPY(t) dencoders[T_STRINGIFY(t)] = new DencoderImplNoFeatureNoCopy<t>(false); #define MESSAGE(t) dencoders[T_STRINGIFY(t)] = new MessageDencoderImpl<t>; #include "types.h" #undef TYPE #undef TYPEWITHSTRAYDATA #undef TYPE_FEATUREFUL +#undef TYPE_FEATUREFUL_STRAYDATA #undef T_STR #undef T_STRINGIFY @@ -312,7 +316,7 @@ int main(int argc, const char **argv) usage(cerr); exit(1); } - den->encode(encbl, features); + den->encode(encbl, features | CEPH_FEATURE_RESERVED); // hack for OSDMap } else if (*i == string("decode")) { if (!den) { cerr << "must first select type with 'type <name>'" << std::endl; diff --git a/src/test/encoding/types.h b/src/test/encoding/types.h index 59a576e4521..d6934e95c7b 100644 --- a/src/test/encoding/types.h +++ b/src/test/encoding/types.h @@ -30,8 +30,8 @@ TYPE(entity_addr_t) #include "osd/OSDMap.h" TYPE(osd_info_t) TYPE(osd_xinfo_t) -TYPEWITHSTRAYDATA(OSDMap) -TYPEWITHSTRAYDATA(OSDMap::Incremental) +TYPE_FEATUREFUL_STRAYDATA(OSDMap) +TYPE_FEATUREFUL_STRAYDATA(OSDMap::Incremental) #include "crush/CrushWrapper.h" TYPE_NOCOPY(CrushWrapper) diff --git a/src/tools/osdmaptool.cc b/src/tools/osdmaptool.cc index d84aeb8661c..a52eb6cae97 100644 --- a/src/tools/osdmaptool.cc +++ b/src/tools/osdmaptool.cc @@ -463,7 +463,7 @@ int main(int argc, const char **argv) if (modified) { bl.clear(); - osdmap.encode(bl); + osdmap.encode(bl, CEPH_FEATURES_SUPPORTED_DEFAULT | CEPH_FEATURE_RESERVED); // write it out cout << me << ": writing epoch " << osdmap.get_epoch() |