summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorSage Weil <sage@redhat.com>2014-12-16 17:49:44 +0100
committerSage Weil <sage@redhat.com>2014-12-16 17:49:44 +0100
commiteef1705354d7661914dd09adc9627d52e03ac8dc (patch)
tree72156eafd769bfdfe9dd4e6e34aa2a8e08f7039a
parentMerge pull request #3166 from ceph/wip-librados-fadvise (diff)
parentmon/OSDMonitor: dump inc and full maps we get a crc mismatch (diff)
downloadceph-eef1705354d7661914dd09adc9627d52e03ac8dc.tar.xz
ceph-eef1705354d7661914dd09adc9627d52e03ac8dc.zip
Merge pull request #2300 from ceph/wip-osdmap
add osdmap checksums sage-2014-12-16_06:27:10-rados:singleton-wip-sage-testing2-distro-basic-multi sage-2014-12-15_13:18:23-rados:thrash-wip-sage-testing2-distro-basic-multi Reviewed-by: Samuel Just <sjust@redhat.com>
-rw-r--r--src/common/config_opts.h1
-rw-r--r--src/include/buffer.h2
-rw-r--r--src/include/ceph_features.h20
-rw-r--r--src/include/ceph_fs.h1
-rw-r--r--src/messages/MMonGetOSDMap.h87
-rw-r--r--src/messages/MOSDMap.h7
-rw-r--r--src/messages/Makefile.am1
-rw-r--r--src/mon/Monitor.cc1
-rw-r--r--src/mon/OSDMonitor.cc113
-rw-r--r--src/mon/OSDMonitor.h2
-rw-r--r--src/msg/Message.cc4
-rw-r--r--src/osd/OSD.cc39
-rw-r--r--src/osd/OSDMap.cc130
-rw-r--r--src/osd/OSDMap.h14
-rw-r--r--src/test/encoding/ceph_dencoder.cc6
-rw-r--r--src/test/encoding/types.h4
-rw-r--r--src/tools/osdmaptool.cc2
17 files changed, 395 insertions, 39 deletions
diff --git a/src/common/config_opts.h b/src/common/config_opts.h
index 35eaf1975f2..102e937a2f0 100644
--- a/src/common/config_opts.h
+++ b/src/common/config_opts.h
@@ -495,6 +495,7 @@ OPTION(osd_map_max_advance, OPT_INT, 200) // make this < cache_size!
OPTION(osd_map_cache_size, OPT_INT, 500)
OPTION(osd_map_message_max, OPT_INT, 100) // max maps per MOSDMap message
OPTION(osd_map_share_max_epochs, OPT_INT, 100) // cap on # of inc maps we send to peers, clients
+OPTION(osd_inject_bad_map_crc_probability, OPT_FLOAT, 0)
OPTION(osd_op_threads, OPT_INT, 2) // 0 == no threading
OPTION(osd_peering_wq_batch_size, OPT_U64, 20)
OPTION(osd_op_pq_max_tokens_per_priority, OPT_U64, 4194304)
diff --git a/src/include/buffer.h b/src/include/buffer.h
index aac2f80a650..e26bfa03f9a 100644
--- a/src/include/buffer.h
+++ b/src/include/buffer.h
@@ -303,6 +303,8 @@ public:
iterator& operator++();
ptr get_current_ptr();
+ list& get_bl() { return *bl; }  // reference to the list object that this iterator's bl pointer refers to
+
// copy data out.
// note that these all _append_ to dest!
void copy(unsigned len, char *dest);
diff --git a/src/include/ceph_features.h b/src/include/ceph_features.h
index a193d5eb93c..d9594a369e3 100644
--- a/src/include/ceph_features.h
+++ b/src/include/ceph_features.h
@@ -56,6 +56,10 @@
#define CEPH_FEATURE_OSD_SET_ALLOC_HINT (1ULL<<45)
#define CEPH_FEATURE_OSD_FADVISE_FLAGS (1ULL<<46)
+#define CEPH_FEATURE_RESERVED2 (1ULL<<61) /* slow down, we are almost out... */
+#define CEPH_FEATURE_RESERVED (1ULL<<62) /* DO NOT USE THIS ... last bit! */
+#define CEPH_FEATURE_RESERVED_BROKEN (1ULL<<63) /* DO NOT USE THIS; see below */
+
/*
* The introduction of CEPH_FEATURE_OSD_SNAPMAPPER caused the feature
* vector to evaluate to 64 bit ~0. To cope, we designate 1ULL << 63
@@ -67,10 +71,10 @@
* and fixed by commit
* 4255b5c2fb54ae40c53284b3ab700fdfc7e61748 v0.65-263-g4255b5c
*/
-#define CEPH_FEATURE_RESERVED (1ULL<<63)
+#define CEPH_FEATURE_RESERVED_BROKEN (1ULL<<63)
static inline unsigned long long ceph_sanitize_features(unsigned long long f) {
- if (f & CEPH_FEATURE_RESERVED) {
+ if (f & CEPH_FEATURE_RESERVED_BROKEN) {
/* everything through OSD_SNAPMAPPER */
return 0x1ffffffffull;
} else {
@@ -143,4 +147,16 @@ static inline unsigned long long ceph_sanitize_features(unsigned long long f) {
CEPH_FEATURE_CRUSH_TUNABLES3 | \
CEPH_FEATURE_CRUSH_V2)
+/*
+ * make sure we don't try to use the reserved features
+ */
+
+/*
+ * Compile-time assertion usable from both C and C++.  A false condition
+ * produces a negative array bound, which is a hard compile error.  The
+ * passing case uses a bound of 1 rather than 0: zero-length arrays are a
+ * compiler extension and are rejected by strictly conforming compilers.
+ */
+#define CEPH_STATIC_ASSERT(x) (void)(sizeof(int[((x)==0) ? -1 : 1]))
+
+/*
+ * Has no run-time effect: the body is a single sizeof expression.  It
+ * exists so the assertion is evaluated whenever this header is compiled,
+ * failing the build if CEPH_FEATURES_ALL contains any reserved bit.
+ */
+static inline void ____build_time_check_for_reserved_bits(void) {
+  CEPH_STATIC_ASSERT((CEPH_FEATURES_ALL &
+                      (CEPH_FEATURE_RESERVED |
+                       CEPH_FEATURE_RESERVED2 |
+                       CEPH_FEATURE_RESERVED_BROKEN)) == 0);
+}
+
#endif
diff --git a/src/include/ceph_fs.h b/src/include/ceph_fs.h
index c4776ea6bbd..33a6da141f6 100644
--- a/src/include/ceph_fs.h
+++ b/src/include/ceph_fs.h
@@ -93,6 +93,7 @@ struct ceph_dir_layout {
/* client <-> monitor */
#define CEPH_MSG_MON_MAP 4
#define CEPH_MSG_MON_GET_MAP 5
+#define CEPH_MSG_MON_GET_OSDMAP 6
#define CEPH_MSG_STATFS 13
#define CEPH_MSG_STATFS_REPLY 14
#define CEPH_MSG_MON_SUBSCRIBE 15
diff --git a/src/messages/MMonGetOSDMap.h b/src/messages/MMonGetOSDMap.h
new file mode 100644
index 00000000000..859d253f631
--- /dev/null
+++ b/src/messages/MMonGetOSDMap.h
@@ -0,0 +1,87 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MMONGETOSDMAP_H
+#define CEPH_MMONGETOSDMAP_H
+
+#include "msg/Message.h"
+
+#include "include/types.h"
+
+/**
+ * Message asking the monitor for ranges of OSD maps.
+ *
+ * Two independent epoch ranges may be requested: one of full maps and
+ * one of incremental maps.  Both ranges start out as 0-0, and print()
+ * treats a range with a zero bound as "not requested".
+ */
+class MMonGetOSDMap : public PaxosServiceMessage {
+  epoch_t full_first, full_last;  ///< requested range of full maps
+  epoch_t inc_first, inc_last;    ///< requested range of incremental maps
+
+public:
+  MMonGetOSDMap()
+    : PaxosServiceMessage(CEPH_MSG_MON_GET_OSDMAP, 0),
+      full_first(0),
+      full_last(0),
+      inc_first(0),
+      inc_last(0) { }
+private:
+  ~MMonGetOSDMap() {}
+
+public:
+  /// Request full maps for epochs first..last; last must not precede first.
+  void request_full(epoch_t first, epoch_t last) {
+    assert(last >= first);
+    full_first = first;
+    full_last = last;
+  }
+  /// Request incremental maps for epochs first..last; last must not precede first.
+  void request_inc(epoch_t first, epoch_t last) {
+    assert(last >= first);
+    inc_first = first;
+    inc_last = last;
+  }
+  epoch_t get_full_first() const {
+    return full_first;
+  }
+  epoch_t get_full_last() const {
+    return full_last;
+  }
+  epoch_t get_inc_first() const {
+    return inc_first;
+  }
+  epoch_t get_inc_last() const {
+    return inc_last;
+  }
+
+  const char *get_type_name() const { return "mon_get_osdmap"; }
+
+  /// Human-readable summary, e.g. "mon_get_osdmap(full 3-5 inc 6-9)".
+  void print(ostream& out) const {
+    out << "mon_get_osdmap(";
+    // separator is only emitted between the two ranges, never before the first
+    const char *sep = "";
+    if (full_first && full_last) {
+      out << "full " << full_first << "-" << full_last;
+      sep = " ";
+    }
+    if (inc_first && inc_last)
+      out << sep << "inc " << inc_first << "-" << inc_last;
+    out << ")";
+  }
+
+  /// Encode the paxos header followed by the four epoch bounds.
+  /// The features argument is not consulted.
+  void encode_payload(uint64_t features) {
+    paxos_encode();
+    ::encode(full_first, payload);
+    ::encode(full_last, payload);
+    ::encode(inc_first, payload);
+    ::encode(inc_last, payload);
+  }
+  /// Decode in the same field order that encode_payload() writes.
+  void decode_payload() {
+    bufferlist::iterator p = payload.begin();
+    paxos_decode(p);
+    ::decode(full_first, p);
+    ::decode(full_last, p);
+    ::decode(inc_first, p);
+    ::decode(inc_last, p);
+  }
+};
+
+#endif
diff --git a/src/messages/MOSDMap.h b/src/messages/MOSDMap.h
index 7ed601c6b36..9c15290b7e9 100644
--- a/src/messages/MOSDMap.h
+++ b/src/messages/MOSDMap.h
@@ -57,13 +57,10 @@ class MOSDMap : public Message {
MOSDMap() : Message(CEPH_MSG_OSD_MAP, HEAD_VERSION) { }
- MOSDMap(const uuid_d &f, OSDMap *oc=0)
+ MOSDMap(const uuid_d &f)
: Message(CEPH_MSG_OSD_MAP, HEAD_VERSION),
fsid(f),
- oldest_map(0), newest_map(0) {
- if (oc)
- oc->encode(maps[oc->get_epoch()]);
- }
+ oldest_map(0), newest_map(0) { }
private:
~MOSDMap() {}
diff --git a/src/messages/Makefile.am b/src/messages/Makefile.am
index cac40482b00..38593f7e724 100644
--- a/src/messages/Makefile.am
+++ b/src/messages/Makefile.am
@@ -56,6 +56,7 @@ noinst_HEADERS += \
messages/MMonCommandAck.h \
messages/MMonElection.h \
messages/MMonGetMap.h \
+ messages/MMonGetOSDMap.h \
messages/MMonGetVersion.h \
messages/MMonGetVersionReply.h \
messages/MMonGlobalID.h \
diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc
index 4c01800057d..52753a62fb4 100644
--- a/src/mon/Monitor.cc
+++ b/src/mon/Monitor.cc
@@ -3173,6 +3173,7 @@ void Monitor::dispatch(MonSession *s, Message *m, const bool src_is_mon)
switch (m->get_type()) {
// OSDs
+ case CEPH_MSG_MON_GET_OSDMAP:
case MSG_OSD_MARK_ME_DOWN:
case MSG_OSD_FAILURE:
case MSG_OSD_BOOT:
diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc
index 67066e6c180..5f41ea875d8 100644
--- a/src/mon/OSDMonitor.cc
+++ b/src/mon/OSDMonitor.cc
@@ -31,6 +31,7 @@
#include "messages/MOSDFailure.h"
#include "messages/MOSDMarkMeDown.h"
#include "messages/MOSDMap.h"
+#include "messages/MMonGetOSDMap.h"
#include "messages/MOSDBoot.h"
#include "messages/MOSDAlive.h"
#include "messages/MPoolOp.h"
@@ -109,7 +110,9 @@ void OSDMonitor::create_initial()
newmap.created = newmap.modified = ceph_clock_now(g_ceph_context);
// encode into pending incremental
- newmap.encode(pending_inc.fullmap, mon->quorum_features);
+ newmap.encode(pending_inc.fullmap, mon->quorum_features | CEPH_FEATURE_RESERVED);
+ pending_inc.full_crc = newmap.get_crc();
+ dout(20) << " full crc " << pending_inc.full_crc << dendl;
}
void OSDMonitor::update_from_paxos(bool *need_bootstrap)
@@ -217,9 +220,37 @@ void OSDMonitor::update_from_paxos(bool *need_bootstrap)
if (!f)
f = -1;
bufferlist full_bl;
- osdmap.encode(full_bl, f);
+ osdmap.encode(full_bl, f | CEPH_FEATURE_RESERVED);
tx_size += full_bl.length();
+ // verify the crc is as expected
+ if (inc.have_crc &&
+ inc.full_crc != osdmap.crc) {
+ derr << "inc for epoch " << osdmap.get_epoch() << " has full_crc "
+ << inc.full_crc << " but actual is " << osdmap.crc
+ << " features " << f
+ << dendl;
+
+ derr << "full map dump (crc " << full_bl.crc32c(-1) << "):\n";
+ full_bl.hexdump(*_dout);
+ *_dout << "\ninc map dump (crc " << inc_bl.crc32c(-1) << "):\n";
+ inc_bl.hexdump(*_dout);
+ *_dout << dendl;
+ bufferlist prev_bl, prev_bl2, again_bl;
+ get_version_full(osdmap.epoch-1, prev_bl);
+ OSDMap pristine;
+ pristine.decode(prev_bl);
+ pristine.encode(prev_bl2, f | CEPH_FEATURE_RESERVED);
+ derr << "previous osdmap reload, raw crc " << prev_bl.crc32c(-1) << dendl;
+ derr << "reencode of that is " << prev_bl2.crc32c(-1) << dendl;
+ pristine.apply_incremental(inc);
+ pristine.encode(again_bl, f | CEPH_FEATURE_RESERVED);
+ derr << "again raw crc is " << again_bl.crc32c(-1) << dendl;
+ derr << " full_crc " << pristine.crc << dendl;
+
+ assert(0 == "got mismatched crc encoding full map");
+ }
+
put_version_full(t, osdmap.epoch, full_bl);
put_version_latest_full(t, osdmap.epoch);
@@ -642,9 +673,22 @@ void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t)
}
}
+ // determine the new map's crc
+ OSDMap tmp;
+ {
+ tmp.deepish_copy_from(osdmap);
+ tmp.apply_incremental(pending_inc);
+ bufferlist t;
+ ::encode(tmp, t, mon->quorum_features | CEPH_FEATURE_RESERVED);
+ pending_inc.full_crc = tmp.get_crc();
+ }
+
// encode
assert(get_last_committed() + 1 == pending_inc.epoch);
- ::encode(pending_inc, bl, mon->quorum_features);
+ ::encode(pending_inc, bl, mon->quorum_features | CEPH_FEATURE_RESERVED);
+
+ dout(20) << " full_crc " << tmp.get_crc()
+ << " inc_crc " << pending_inc.inc_crc << dendl;
/* put everything in the transaction */
put_version(t, pending_inc.epoch, bl);
@@ -746,6 +790,8 @@ bool OSDMonitor::preprocess_query(PaxosServiceMessage *m)
// READs
case MSG_MON_COMMAND:
return preprocess_command(static_cast<MMonCommand*>(m));
+ case CEPH_MSG_MON_GET_OSDMAP:
+ return preprocess_get_osdmap(static_cast<MMonGetOSDMap*>(m));
// damp updates
case MSG_OSD_MARK_ME_DOWN:
@@ -832,6 +878,32 @@ bool OSDMonitor::should_propose(double& delay)
// ---------------------------
// READs
+/**
+ * Answer an MMonGetOSDMap request with an MOSDMap reply.
+ *
+ * The requested full and incremental ranges are treated as inclusive:
+ * the message allows first == last, and such a single-epoch request must
+ * yield that epoch.  Each range is clamped to [first committed epoch,
+ * current osdmap epoch].  Both ranges draw on one shared budget of
+ * osd_map_message_max maps, full maps first.
+ *
+ * @param m  the request; its reference is dropped before returning
+ * @return   always true
+ */
+bool OSDMonitor::preprocess_get_osdmap(MMonGetOSDMap *m)
+{
+  dout(10) << __func__ << " " << *m << dendl;
+  MOSDMap *reply = new MOSDMap(mon->monmap->fsid);
+  epoch_t first = get_first_committed();
+  epoch_t last = osdmap.get_epoch();
+  int max = g_conf->osd_map_message_max;
+
+  // A last epoch of 0 means that kind of map was not requested; skip it
+  // so the inclusive bound below cannot match epoch 0 by accident.
+  if (m->get_full_last()) {
+    for (epoch_t e = MAX(first, m->get_full_first());
+         e <= MIN(last, m->get_full_last()) && max > 0;
+         ++e, --max) {
+      int r = get_version_full(e, reply->maps[e]);
+      assert(r >= 0);
+    }
+  }
+  if (m->get_inc_last()) {
+    for (epoch_t e = MAX(first, m->get_inc_first());
+         e <= MIN(last, m->get_inc_last()) && max > 0;
+         ++e, --max) {
+      int r = get_version(e, reply->incremental_maps[e]);
+      assert(r >= 0);
+    }
+  }
+  reply->oldest_map = get_first_committed();
+  reply->newest_map = osdmap.get_epoch();
+  mon->send_reply(m, reply);
+  m->put();
+  return true;
+}
+
// ---------------------------
// UPDATEs
@@ -1716,7 +1788,8 @@ void OSDMonitor::send_latest(PaxosServiceMessage *m, epoch_t start)
MOSDMap *OSDMonitor::build_latest_full()
{
- MOSDMap *r = new MOSDMap(mon->monmap->fsid, &osdmap);
+ MOSDMap *r = new MOSDMap(mon->monmap->fsid);
+ get_version_full(osdmap.get_epoch(), r->maps[osdmap.get_epoch()]);
r->oldest_map = get_first_committed();
r->newest_map = osdmap.get_epoch();
return r;
@@ -2308,21 +2381,27 @@ bool OSDMonitor::preprocess_command(MMonCommand *m)
int64_t epochnum;
cmd_getval(g_ceph_context, cmdmap, "epoch", epochnum, (int64_t)0);
epoch = epochnum;
+ if (!epoch)
+ epoch = osdmap.get_epoch();
- OSDMap *p = &osdmap;
- if (epoch) {
- bufferlist b;
- int err = get_version_full(epoch, b);
- if (err == -ENOENT) {
- r = -ENOENT;
- ss << "there is no map for epoch " << epoch;
- goto reply;
- }
- assert(err == 0);
- assert(b.length());
+ bufferlist osdmap_bl;
+ int err = get_version_full(epoch, osdmap_bl);
+ if (err == -ENOENT) {
+ r = -ENOENT;
+ ss << "there is no map for epoch " << epoch;
+ goto reply;
+ }
+ assert(err == 0);
+ assert(osdmap_bl.length());
+
+ OSDMap *p;
+ if (epoch == osdmap.get_epoch()) {
+ p = &osdmap;
+ } else {
p = new OSDMap;
- p->decode(b);
+ p->decode(osdmap_bl);
}
+
if (prefix == "osd dump") {
stringstream ds;
if (f) {
@@ -2369,7 +2448,7 @@ bool OSDMonitor::preprocess_command(MMonCommand *m)
}
rdata.append(ds);
} else if (prefix == "osd getmap") {
- p->encode(rdata, m->get_connection()->get_features());
+ rdata.append(osdmap_bl);
ss << "got osdmap epoch " << p->get_epoch();
} else if (prefix == "osd getcrushmap") {
p->crush->encode(rdata);
diff --git a/src/mon/OSDMonitor.h b/src/mon/OSDMonitor.h
index 411650dcddc..3f9d578e4ad 100644
--- a/src/mon/OSDMonitor.h
+++ b/src/mon/OSDMonitor.h
@@ -224,6 +224,8 @@ private:
bool check_source(PaxosServiceMessage *m, uuid_d fsid);
+ bool preprocess_get_osdmap(class MMonGetOSDMap *m);
+
bool preprocess_mark_me_down(class MOSDMarkMeDown *m);
friend class C_AckMarkedDown;
diff --git a/src/msg/Message.cc b/src/msg/Message.cc
index 7b1e331f086..ecda9a20b66 100644
--- a/src/msg/Message.cc
+++ b/src/msg/Message.cc
@@ -66,6 +66,7 @@ using namespace std;
#include "messages/MOSDSubOp.h"
#include "messages/MOSDSubOpReply.h"
#include "messages/MOSDMap.h"
+#include "messages/MMonGetOSDMap.h"
#include "messages/MOSDPGNotify.h"
#include "messages/MOSDPGQuery.h"
@@ -376,6 +377,9 @@ Message *decode_message(CephContext *cct, ceph_msg_header& header, ceph_msg_foot
case CEPH_MSG_MON_GET_MAP:
m = new MMonGetMap;
break;
+ case CEPH_MSG_MON_GET_OSDMAP:
+ m = new MMonGetOSDMap;
+ break;
case CEPH_MSG_MON_GET_VERSION:
m = new MMonGetVersion();
break;
diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc
index 157661488a4..014e2ab48a0 100644
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -71,6 +71,7 @@
#include "messages/MOSDPGTemp.h"
#include "messages/MOSDMap.h"
+#include "messages/MMonGetOSDMap.h"
#include "messages/MOSDPGNotify.h"
#include "messages/MOSDPGQuery.h"
#include "messages/MOSDPGLog.h"
@@ -6022,9 +6023,6 @@ void OSD::handle_osd_map(MOSDMap *m)
return;
}
- // even if this map isn't from a mon, we may have satisfied our subscription
- monc->sub_got("osdmap", last);
-
// missing some?
bool skip_maps = false;
if (first > osdmap->get_epoch() + 1) {
@@ -6083,8 +6081,7 @@ void OSD::handle_osd_map(MOSDMap *m)
OSDMap *o = new OSDMap;
if (e > 1) {
bufferlist obl;
- OSDMapRef prev = get_map(e - 1);
- prev->encode(obl);
+ get_map_bl(e - 1, obl);
o->decode(obl);
}
@@ -6100,7 +6097,27 @@ void OSD::handle_osd_map(MOSDMap *m)
last_marked_full = e;
bufferlist fbl;
- o->encode(fbl);
+ o->encode(fbl, inc.encode_features | CEPH_FEATURE_RESERVED);
+
+ bool injected_failure = false;
+ if (g_conf->osd_inject_bad_map_crc_probability > 0 &&
+ (rand() % 10000) < g_conf->osd_inject_bad_map_crc_probability*10000.0) {
+ derr << __func__ << " injecting map crc failure" << dendl;
+ injected_failure = true;
+ }
+
+ if (o->get_crc() != inc.full_crc || injected_failure) {
+ dout(2) << "got incremental " << e
+ << " but failed to encode full with correct crc; requesting"
+ << dendl;
+ clog->warn() << "failed to encode map e" << e << " with expected crc\n";
+ MMonGetOSDMap *req = new MMonGetOSDMap;
+ req->request_full(e, last);
+ monc->send_mon_message(req);
+ last = e - 1;
+ break;
+ }
+
hobject_t fulloid = get_osdmap_pobject_name(e);
t.write(META_COLL, fulloid, 0, fbl.length(), fbl);
@@ -6112,6 +6129,16 @@ void OSD::handle_osd_map(MOSDMap *m)
assert(0 == "MOSDMap lied about what maps it had?");
}
+ // even if this map isn't from a mon, we may have satisfied our subscription
+ monc->sub_got("osdmap", last);
+
+ if (last <= osdmap->get_epoch()) {
+ dout(10) << " no new maps here, dropping" << dendl;
+ delete _t;
+ m->put();
+ return;
+ }
+
if (superblock.oldest_map) {
int num = 0;
epoch_t min(
diff --git a/src/osd/OSDMap.cc b/src/osd/OSDMap.cc
index 3af187ba744..864d5542f8a 100644
--- a/src/osd/OSDMap.cc
+++ b/src/osd/OSDMap.cc
@@ -396,8 +396,19 @@ void OSDMap::Incremental::encode(bufferlist& bl, uint64_t features) const
return;
}
+ // only a select set of callers should *ever* be encoding new
+ // OSDMaps. others should be passing around the canonical encoded
+ // buffers from on high. select out those callers by passing in an
+ // "impossible" feature bit.
+ assert(features & CEPH_FEATURE_RESERVED);
+ features &= ~CEPH_FEATURE_RESERVED;
+
+ size_t start_offset = bl.length();
+ size_t tail_offset;
+ buffer::list::iterator crc_it;
+
// meta-encoding: how we include client-used and osd-specific data
- ENCODE_START(7, 7, bl);
+ ENCODE_START(8, 7, bl);
{
ENCODE_START(3, 1, bl); // client-usable data
@@ -441,8 +452,26 @@ void OSDMap::Incremental::encode(bufferlist& bl, uint64_t features) const
ENCODE_FINISH(bl); // osd-only data
}
+ ::encode((uint32_t)0, bl); // dummy inc_crc
+ crc_it = bl.end();
+ crc_it.advance(-4);
+ tail_offset = bl.length();
+
+ ::encode(full_crc, bl);
+
ENCODE_FINISH(bl); // meta-encoding wrapper
+ // fill in crc
+ bufferlist front;
+ front.substr_of(bl, start_offset, crc_it.get_off() - start_offset);
+ inc_crc = front.crc32c(-1);
+ bufferlist tail;
+ tail.substr_of(bl, tail_offset, bl.length() - tail_offset);
+ inc_crc = tail.crc32c(inc_crc);
+ ceph_le32 crc_le;
+ crc_le = inc_crc;
+ crc_it.copy_in(4, (char*)&crc_le);
+ have_crc = true;
}
void OSDMap::Incremental::decode_classic(bufferlist::iterator &p)
@@ -547,7 +576,11 @@ void OSDMap::Incremental::decode(bufferlist::iterator& bl)
* a struct_v < 7, we must rewind to the beginning and use our
* classic decoder.
*/
- DECODE_START_LEGACY_COMPAT_LEN(7, 7, 7, bl); // wrapper
+ size_t start_offset = bl.get_off();
+ size_t tail_offset = 0;
+ bufferlist crc_front, crc_tail;
+
+ DECODE_START_LEGACY_COMPAT_LEN(8, 7, 7, bl); // wrapper
if (struct_v < 7) {
int struct_v_size = sizeof(struct_v);
bl.advance(-struct_v_size);
@@ -608,7 +641,35 @@ void OSDMap::Incremental::decode(bufferlist::iterator& bl)
DECODE_FINISH(bl); // osd-only data
}
+ if (struct_v >= 8) {
+ have_crc = true;
+ crc_front.substr_of(bl.get_bl(), start_offset, bl.get_off() - start_offset);
+ ::decode(inc_crc, bl);
+ tail_offset = bl.get_off();
+ ::decode(full_crc, bl);
+ } else {
+ have_crc = false;
+ full_crc = 0;
+ inc_crc = 0;
+ }
+
DECODE_FINISH(bl); // wrapper
+
+ if (have_crc) {
+ // verify crc
+ uint32_t actual = crc_front.crc32c(-1);
+ if (tail_offset < bl.get_off()) {
+ bufferlist tail;
+ tail.substr_of(bl.get_bl(), tail_offset, bl.get_off() - tail_offset);
+ actual = tail.crc32c(actual);
+ }
+ if (inc_crc != actual) {
+ ostringstream ss;
+ ss << "bad crc, actual " << actual << " != expected " << inc_crc;
+ string s = ss.str();
+ throw buffer::malformed_input(s.c_str());
+ }
+ }
}
void OSDMap::Incremental::dump(Formatter *f) const
@@ -1786,8 +1847,20 @@ void OSDMap::encode(bufferlist& bl, uint64_t features) const
encode_classic(bl, features);
return;
}
+
+ // only a select set of callers should *ever* be encoding new
+ // OSDMaps. others should be passing around the canonical encoded
+ // buffers from on high. select out those callers by passing in an
+ // "impossible" feature bit.
+ assert(features & CEPH_FEATURE_RESERVED);
+ features &= ~CEPH_FEATURE_RESERVED;
+
+ size_t start_offset = bl.length();
+ size_t tail_offset;
+ buffer::list::iterator crc_it;
+
// meta-encoding: how we include client-used and osd-specific data
- ENCODE_START(7, 7, bl);
+ ENCODE_START(8, 7, bl);
{
ENCODE_START(3, 1, bl); // client-usable data
@@ -1847,7 +1920,26 @@ void OSDMap::encode(bufferlist& bl, uint64_t features) const
ENCODE_FINISH(bl); // osd-only data
}
+ ::encode((uint32_t)0, bl); // dummy crc
+ crc_it = bl.end();
+ crc_it.advance(-4);
+ tail_offset = bl.length();
+
ENCODE_FINISH(bl); // meta-encoding wrapper
+
+ // fill in crc
+ bufferlist front;
+ front.substr_of(bl, start_offset, crc_it.get_off() - start_offset);
+ crc = front.crc32c(-1);
+ if (tail_offset < bl.length()) {
+ bufferlist tail;
+ tail.substr_of(bl, tail_offset, bl.length() - tail_offset);
+ crc = tail.crc32c(crc);
+ }
+ ceph_le32 crc_le;
+ crc_le = crc;
+ crc_it.copy_in(4, (char*)&crc_le);
+ crc_defined = true;
}
void OSDMap::decode(bufferlist& bl)
@@ -1976,7 +2068,11 @@ void OSDMap::decode(bufferlist::iterator& bl)
* a struct_v < 7, we must rewind to the beginning and use our
* classic decoder.
*/
- DECODE_START_LEGACY_COMPAT_LEN(7, 7, 7, bl); // wrapper
+ size_t start_offset = bl.get_off();
+ size_t tail_offset = 0;
+ bufferlist crc_front, crc_tail;
+
+ DECODE_START_LEGACY_COMPAT_LEN(8, 7, 7, bl); // wrapper
if (struct_v < 7) {
int struct_v_size = sizeof(struct_v);
bl.advance(-struct_v_size);
@@ -2043,8 +2139,34 @@ void OSDMap::decode(bufferlist::iterator& bl)
DECODE_FINISH(bl); // osd-only data
}
+ if (struct_v >= 8) {
+ crc_front.substr_of(bl.get_bl(), start_offset, bl.get_off() - start_offset);
+ ::decode(crc, bl);
+ tail_offset = bl.get_off();
+ crc_defined = true;
+ } else {
+ crc_defined = false;
+ crc = 0;
+ }
+
DECODE_FINISH(bl); // wrapper
+ if (tail_offset) {
+ // verify crc
+ uint32_t actual = crc_front.crc32c(-1);
+ if (tail_offset < bl.get_off()) {
+ bufferlist tail;
+ tail.substr_of(bl.get_bl(), tail_offset, bl.get_off() - tail_offset);
+ actual = tail.crc32c(actual);
+ }
+ if (crc != actual) {
+ ostringstream ss;
+ ss << "bad crc, actual " << actual << " != expected " << crc;
+ string s = ss.str();
+ throw buffer::malformed_input(s.c_str());
+ }
+ }
+
post_decode();
}
diff --git a/src/osd/OSDMap.h b/src/osd/OSDMap.h
index 7d440affd12..1792ea69706 100644
--- a/src/osd/OSDMap.h
+++ b/src/osd/OSDMap.h
@@ -155,6 +155,10 @@ public:
string cluster_snapshot;
+ mutable bool have_crc; ///< crc values are defined
+ uint32_t full_crc; ///< crc of the resulting OSDMap
+ mutable uint32_t inc_crc; ///< crc of this incremental
+
int get_net_marked_out(const OSDMap *previous) const;
int get_net_marked_down(const OSDMap *previous) const;
int identify_osd(uuid_d u) const;
@@ -169,7 +173,8 @@ public:
Incremental(epoch_t e=0) :
encode_features(0),
- epoch(e), new_pool_max(-1), new_flags(-1), new_max_osd(-1) {
+ epoch(e), new_pool_max(-1), new_flags(-1), new_max_osd(-1),
+ have_crc(false), full_crc(0), inc_crc(0) {
memset(&fsid, 0, sizeof(fsid));
}
Incremental(bufferlist &bl) {
@@ -240,7 +245,13 @@ private:
string cluster_snapshot;
bool new_blacklist_entries;
+ mutable bool crc_defined;
+ mutable uint32_t crc;
+
public:
+ bool have_crc() const { return crc_defined; }  // whether the crc member currently holds a defined value
+ uint32_t get_crc() const { return crc; }  // stored crc; only meaningful when have_crc() is true
+
ceph::shared_ptr<CrushWrapper> crush; // hierarchical map
friend class OSDMonitor;
@@ -258,6 +269,7 @@ private:
osd_uuid(new vector<uuid_d>),
cluster_snapshot_epoch(0),
new_blacklist_entries(false),
+ crc_defined(false), crc(0),
crush(new CrushWrapper) {
memset(&fsid, 0, sizeof(fsid));
}
diff --git a/src/test/encoding/ceph_dencoder.cc b/src/test/encoding/ceph_dencoder.cc
index 8391f33cb2f..d5068a1a4d4 100644
--- a/src/test/encoding/ceph_dencoder.cc
+++ b/src/test/encoding/ceph_dencoder.cc
@@ -12,12 +12,14 @@
#define TYPE(t)
#define TYPEWITHSTRAYDATA(t)
#define TYPE_FEATUREFUL(t)
+#define TYPE_FEATUREFUL_STRAYDATA(t)
#define TYPE_NOCOPY(t)
#define MESSAGE(t)
#include "types.h"
#undef TYPE
#undef TYPEWITHSTRAYDATA
#undef TYPE_FEATUREFUL
+#undef TYPE_FEATUREFUL_STRAYDATA
#undef TYPE_NOCOPY
#undef MESSAGE
@@ -239,12 +241,14 @@ int main(int argc, const char **argv)
#define TYPE(t) dencoders[T_STRINGIFY(t)] = new DencoderImplNoFeature<t>(false);
#define TYPEWITHSTRAYDATA(t) dencoders[T_STRINGIFY(t)] = new DencoderImplNoFeature<t>(true);
#define TYPE_FEATUREFUL(t) dencoders[T_STRINGIFY(t)] = new DencoderImplFeatureful<t>(false);
+#define TYPE_FEATUREFUL_STRAYDATA(t) dencoders[T_STRINGIFY(t)] = new DencoderImplFeatureful<t>(true);
#define TYPE_NOCOPY(t) dencoders[T_STRINGIFY(t)] = new DencoderImplNoFeatureNoCopy<t>(false);
#define MESSAGE(t) dencoders[T_STRINGIFY(t)] = new MessageDencoderImpl<t>;
#include "types.h"
#undef TYPE
#undef TYPEWITHSTRAYDATA
#undef TYPE_FEATUREFUL
+#undef TYPE_FEATUREFUL_STRAYDATA
#undef T_STR
#undef T_STRINGIFY
@@ -312,7 +316,7 @@ int main(int argc, const char **argv)
usage(cerr);
exit(1);
}
- den->encode(encbl, features);
+ den->encode(encbl, features | CEPH_FEATURE_RESERVED); // hack for OSDMap
} else if (*i == string("decode")) {
if (!den) {
cerr << "must first select type with 'type <name>'" << std::endl;
diff --git a/src/test/encoding/types.h b/src/test/encoding/types.h
index 59a576e4521..d6934e95c7b 100644
--- a/src/test/encoding/types.h
+++ b/src/test/encoding/types.h
@@ -30,8 +30,8 @@ TYPE(entity_addr_t)
#include "osd/OSDMap.h"
TYPE(osd_info_t)
TYPE(osd_xinfo_t)
-TYPEWITHSTRAYDATA(OSDMap)
-TYPEWITHSTRAYDATA(OSDMap::Incremental)
+TYPE_FEATUREFUL_STRAYDATA(OSDMap)
+TYPE_FEATUREFUL_STRAYDATA(OSDMap::Incremental)
#include "crush/CrushWrapper.h"
TYPE_NOCOPY(CrushWrapper)
diff --git a/src/tools/osdmaptool.cc b/src/tools/osdmaptool.cc
index d84aeb8661c..a52eb6cae97 100644
--- a/src/tools/osdmaptool.cc
+++ b/src/tools/osdmaptool.cc
@@ -463,7 +463,7 @@ int main(int argc, const char **argv)
if (modified) {
bl.clear();
- osdmap.encode(bl);
+ osdmap.encode(bl, CEPH_FEATURES_SUPPORTED_DEFAULT | CEPH_FEATURE_RESERVED);
// write it out
cout << me << ": writing epoch " << osdmap.get_epoch()