summaryrefslogtreecommitdiffstats
path: root/src/mon
diff options
context:
space:
mode:
authorDaniel Baumann <daniel@debian.org>2024-11-09 17:08:52 +0100
committerDaniel Baumann <daniel@debian.org>2024-11-09 17:08:52 +0100
commit7ae5754b6d5f4f5ce4c4894a9d0f7247731e4d29 (patch)
tree4d551ffc1d3e175c528c69f06e22a9ac2ac41854 /src/mon
parentInitial commit. (diff)
downloadceph-19-7ae5754b6d5f4f5ce4c4894a9d0f7247731e4d29.tar.xz
ceph-19-7ae5754b6d5f4f5ce4c4894a9d0f7247731e4d29.zip
Adding upstream version 19.2.0.upstream/19.2.0upstream
Signed-off-by: Daniel Baumann <daniel@debian.org>
Diffstat (limited to 'src/mon')
-rw-r--r--src/mon/AuthMonitor.cc2269
-rw-r--r--src/mon/AuthMonitor.h283
-rw-r--r--src/mon/CMakeLists.txt46
-rw-r--r--src/mon/CommandHandler.cc43
-rw-r--r--src/mon/CommandHandler.h35
-rw-r--r--src/mon/ConfigMap.cc348
-rw-r--r--src/mon/ConfigMap.h172
-rw-r--r--src/mon/ConfigMonitor.cc981
-rw-r--r--src/mon/ConfigMonitor.h58
-rw-r--r--src/mon/ConnectionTracker.cc377
-rw-r--r--src/mon/ConnectionTracker.h205
-rw-r--r--src/mon/CreatingPGs.h246
-rw-r--r--src/mon/ElectionLogic.cc557
-rw-r--r--src/mon/ElectionLogic.h460
-rw-r--r--src/mon/Elector.cc807
-rw-r--r--src/mon/Elector.h406
-rw-r--r--src/mon/FSCommands.cc1972
-rw-r--r--src/mon/FSCommands.h98
-rw-r--r--src/mon/HealthMonitor.cc887
-rw-r--r--src/mon/HealthMonitor.h75
-rw-r--r--src/mon/KVMonitor.cc533
-rw-r--r--src/mon/KVMonitor.h69
-rw-r--r--src/mon/LogMonitor.cc1293
-rw-r--r--src/mon/LogMonitor.h210
-rw-r--r--src/mon/MDSMonitor.cc2547
-rw-r--r--src/mon/MDSMonitor.h162
-rw-r--r--src/mon/MgrMap.h693
-rw-r--r--src/mon/MgrMonitor.cc1499
-rw-r--r--src/mon/MgrMonitor.h147
-rw-r--r--src/mon/MgrStatMonitor.cc367
-rw-r--r--src/mon/MgrStatMonitor.h109
-rw-r--r--src/mon/MonCap.cc751
-rw-r--r--src/mon/MonCap.h231
-rw-r--r--src/mon/MonClient.cc2070
-rw-r--r--src/mon/MonClient.h789
-rw-r--r--src/mon/MonCommand.h194
-rw-r--r--src/mon/MonCommands.h1453
-rw-r--r--src/mon/MonMap.cc1028
-rw-r--r--src/mon/MonMap.h550
-rw-r--r--src/mon/MonOpRequest.h238
-rw-r--r--src/mon/MonSub.cc114
-rw-r--r--src/mon/MonSub.h46
-rw-r--r--src/mon/Monitor.cc6961
-rw-r--r--src/mon/Monitor.h1163
-rw-r--r--src/mon/MonitorDBStore.h823
-rw-r--r--src/mon/MonmapMonitor.cc1462
-rw-r--r--src/mon/MonmapMonitor.h111
-rw-r--r--src/mon/OSDMonitor.cc15036
-rw-r--r--src/mon/OSDMonitor.h883
-rw-r--r--src/mon/PGMap.cc4101
-rw-r--r--src/mon/PGMap.h570
-rw-r--r--src/mon/Paxos.cc1591
-rw-r--r--src/mon/Paxos.h1384
-rw-r--r--src/mon/PaxosFSMap.h101
-rw-r--r--src/mon/PaxosService.cc476
-rw-r--r--src/mon/PaxosService.h916
-rw-r--r--src/mon/Session.h295
-rw-r--r--src/mon/error_code.cc85
-rw-r--r--src/mon/error_code.h49
-rw-r--r--src/mon/health_check.h199
-rw-r--r--src/mon/mon_types.h723
61 files changed, 62347 insertions, 0 deletions
diff --git a/src/mon/AuthMonitor.cc b/src/mon/AuthMonitor.cc
new file mode 100644
index 000000000..88f843f3e
--- /dev/null
+++ b/src/mon/AuthMonitor.cc
@@ -0,0 +1,2269 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <sstream>
+
+#include "mon/AuthMonitor.h"
+#include "mon/Monitor.h"
+#include "mon/MonitorDBStore.h"
+#include "mon/OSDMonitor.h"
+#include "mon/MDSMonitor.h"
+#include "mon/ConfigMonitor.h"
+
+#include "messages/MMonCommand.h"
+#include "messages/MAuth.h"
+#include "messages/MAuthReply.h"
+#include "messages/MMonGlobalID.h"
+#include "messages/MMonUsedPendingKeys.h"
+#include "msg/Messenger.h"
+
+#include "auth/AuthServiceHandler.h"
+#include "auth/KeyRing.h"
+#include "include/stringify.h"
+#include "include/ceph_assert.h"
+
+#include "mds/MDSAuthCaps.h"
+#include "mgr/MgrCap.h"
+#include "osd/OSDCap.h"
+
+#define dout_subsys ceph_subsys_mon
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, mon, get_last_committed())
+using namespace TOPNSPC::common;
+
+using std::list;
+using std::map;
+using std::make_pair;
+using std::ostream;
+using std::ostringstream;
+using std::pair;
+using std::set;
+using std::string;
+using std::stringstream;
+using std::vector;
+
+using ceph::bufferlist;
+using ceph::decode;
+using ceph::encode;
+using ceph::Formatter;
+
+// Write the log-line prefix used by every dout() in this file:
+//   mon.<name>@<rank>(<state>).auth v<version><space>
+// Returns the stream so the caller can keep appending to it.
+static ostream& _prefix(ostream *_dout, Monitor &mon, version_t v) {
+  ostream& os = *_dout;
+  os << "mon." << mon.name << "@" << mon.rank;
+  os << "(" << mon.get_state_name() << ").auth v" << v << " ";
+  return os;
+}
+
+// Stream an AuthMonitor as the fixed tag "auth"; the monitor argument is
+// not inspected.
+ostream& operator<<(ostream &out, const AuthMonitor &pm)
+{
+  out << "auth";
+  return out;
+}
+
+// Ask the key server whether a rotating-key update is due.  When it is,
+// queue the update as a pending cephx incremental and return true; when
+// nothing needs to change return false and queue nothing.
+bool AuthMonitor::check_rotate()
+{
+  KeyServerData::Incremental rotating;
+  rotating.op = KeyServerData::AUTH_INC_SET_ROTATING;
+
+  if (!mon.key_server.prepare_rotating_update(rotating.rotating_bl)) {
+    return false;
+  }
+
+  dout(10) << __func__ << " updating rotating" << dendl;
+  push_cephx_inc(rotating);
+  return true;
+}
+
+// For every entity whose pending key was reported as used, promote that
+// pending key to be the entity's key.
+//
+// The auth record is taken from the key server and then overridden by any
+// not-yet-committed AUTH_INC_ADD for the same entity in pending_auth (the
+// last such entry wins).  The promotion is queued only if the record's
+// pending_key still matches the key that was reported.
+void AuthMonitor::process_used_pending_keys(
+  const map<EntityName,CryptoKey>& used_pending_keys)
+{
+  for (auto& entry : used_pending_keys) {
+    const EntityName& who = entry.first;
+    const CryptoKey& reported_key = entry.second;
+    dout(10) << __func__ << " used pending_key for " << who << dendl;
+
+    KeyServerData::Incremental promote;
+    promote.op = KeyServerData::AUTH_INC_ADD;
+    promote.name = who;
+
+    // committed state first (result deliberately not checked) ...
+    mon.key_server.get_auth(who, promote.auth);
+
+    // ... then let queued-but-uncommitted adds take precedence
+    for (auto& queued : pending_auth) {
+      if (queued.inc_type != AUTH_DATA) {
+        continue;
+      }
+      KeyServerData::Incremental queued_inc;
+      auto it = queued.auth_data.cbegin();
+      decode(queued_inc, it);
+      if (queued_inc.op == KeyServerData::AUTH_INC_ADD &&
+          queued_inc.name == who) {
+        dout(10) << __func__ << " starting with pending uncommitted" << dendl;
+        promote.auth = queued_inc.auth;
+      }
+    }
+
+    const bool still_pending =
+      stringify(promote.auth.pending_key) == stringify(reported_key);
+    if (still_pending) {
+      dout(10) << __func__ << " committing pending_key -> key for "
+               << who << dendl;
+      promote.auth.key = promote.auth.pending_key;
+      promote.auth.pending_key.clear();
+      push_cephx_inc(promote);
+    }
+  }
+}
+
+/*
+ * Periodic housekeeping; does nothing unless the service is active.
+ *
+ *  - If the global_id pool needs growing, the leader queues the increase
+ *    itself and a non-leader sends an MMonGlobalID request to the leader.
+ *  - If the monmap's min_mon_release is at least quincy and the key server
+ *    reports used pending keys, the leader processes them and a non-leader
+ *    forwards them to the leader in an MMonUsedPendingKeys message.
+ *  - On the leader only: queue a rotating-key update if one is due, then
+ *    propose whatever was queued above.
+ */
+
+void AuthMonitor::tick()
+{
+  if (!is_active()) return;
+
+  dout(10) << *this << dendl;
+
+  // increase global_id?
+  bool propose = false;
+  bool increase;
+  {
+    // _should_increase_max_global_id() asserts that auth_lock is held
+    std::lock_guard l(mon.auth_lock);
+    increase = _should_increase_max_global_id();
+  }
+  if (increase) {
+    if (mon.is_leader()) {
+      increase_max_global_id();
+      propose = true;
+    } else {
+      dout(10) << __func__ << " requesting more ids from leader" << dendl;
+      MMonGlobalID *req = new MMonGlobalID();
+      req->old_max_id = max_global_id;
+      mon.send_mon_message(req, mon.get_leader());
+    }
+  }
+
+  if (mon.monmap->min_mon_release >= ceph_release_t::quincy) {
+    auto used_pending_keys = mon.key_server.get_used_pending_keys();
+    if (!used_pending_keys.empty()) {
+      dout(10) << __func__ << " " << used_pending_keys.size() << " used pending_keys"
+               << dendl;
+      if (mon.is_leader()) {
+        process_used_pending_keys(used_pending_keys);
+        propose = true;
+      } else {
+        MMonUsedPendingKeys *req = new MMonUsedPendingKeys();
+        req->used_pending_keys = used_pending_keys;
+        mon.send_mon_message(req, mon.get_leader());
+      }
+    }
+  }
+
+  // everything below is leader-only
+  if (!mon.is_leader()) {
+    return;
+  }
+
+  if (check_rotate()) {
+    propose = true;
+  }
+
+  if (propose) {
+    propose_pending();
+  }
+}
+
+// Called when the service becomes active.  Only the leader does anything:
+// it starts the key server, drops any recorded used pending keys and, if
+// the service is writeable, queues a rotating-key update and/or a
+// max_global_id increase and proposes them.
+void AuthMonitor::on_active()
+{
+  dout(10) << "AuthMonitor::on_active()" << dendl;
+
+  if (!mon.is_leader())
+    return;
+
+  mon.key_server.start_server();
+  mon.key_server.clear_used_pending_keys();
+
+  if (!is_writeable()) {
+    return;
+  }
+
+  const bool rotated = check_rotate();
+
+  bool need_more_ids;
+  {
+    // _should_increase_max_global_id() asserts that auth_lock is held
+    std::lock_guard l(mon.auth_lock);
+    need_more_ids = _should_increase_max_global_id();
+  }
+  if (need_more_ids) {
+    increase_max_global_id();
+  }
+
+  if (rotated || need_more_ids) {
+    propose_pending();
+  }
+}
+
+// Return a bufferlist holding the encoded form of the capability string.
+bufferlist _encode_cap(const string& cap)
+{
+  bufferlist encoded;
+  encode(cap, encoded);
+  return encoded;
+}
+
+// Load the keyring stored under ("mkfs", "keyring") in the monitor store
+// into *keyring.  A missing entry (-ENOENT) leaves *keyring untouched; any
+// other store error trips an assertion.
+void AuthMonitor::get_initial_keyring(KeyRing *keyring)
+{
+  dout(10) << __func__ << dendl;
+  ceph_assert(keyring != nullptr);
+
+  bufferlist stored;
+  const int r = mon.store->get("mkfs", "keyring", stored);
+  if (r != -ENOENT) {
+    // fail hard only if there's an error we're not expecting to see
+    ceph_assert(r == 0);
+
+    auto it = stored.cbegin();
+    decode(*keyring, it);
+  }
+}
+
+// Append one freshly keyed (AES) entity per built-in bootstrap identity to
+// *auth_lst: client.admin with "allow *" on mon/osd/mds/mgr, and each
+// client.bootstrap-* entity with the mon profile of the same name.
+// Entries are appended in the sorted order of the entity ids.
+void _generate_bootstrap_keys(
+  list<pair<EntityName,EntityAuth> >* auth_lst)
+{
+  ceph_assert(auth_lst != nullptr);
+
+  static const char* const profiles[] = {
+    "bootstrap-osd",
+    "bootstrap-rgw",
+    "bootstrap-mds",
+    "bootstrap-mgr",
+    "bootstrap-rbd",
+    "bootstrap-rbd-mirror",
+  };
+
+  // entity id -> (service -> encoded cap); a map so iteration is sorted
+  map<string,map<string,bufferlist> > bootstrap;
+  bootstrap["admin"] = {
+    { "mon", _encode_cap("allow *") },
+    { "osd", _encode_cap("allow *") },
+    { "mds", _encode_cap("allow *") },
+    { "mgr", _encode_cap("allow *") }
+  };
+  for (const char* profile : profiles) {
+    bootstrap[profile] = {
+      { "mon", _encode_cap(string("allow profile ") + profile) }
+    };
+  }
+
+  for (const auto& [id, caps] : bootstrap) {
+    EntityName name;
+    name.from_str("client." + id);
+
+    EntityAuth auth;
+    auth.key.create(g_ceph_context, CEPH_CRYPTO_AES);
+    auth.caps = caps;
+
+    auth_lst->emplace_back(name, auth);
+  }
+}
+
+// Add the generated bootstrap entities to *keyring, skipping every entity
+// the keyring already contains so existing keys are never replaced.
+void AuthMonitor::create_initial_keys(KeyRing *keyring)
+{
+  dout(10) << __func__ << " with keyring" << dendl;
+  ceph_assert(keyring != nullptr);
+
+  list<pair<EntityName,EntityAuth> > generated;
+  _generate_bootstrap_keys(&generated);
+
+  for (auto &candidate : generated) {
+    if (!keyring->exists(candidate.first)) {
+      keyring->add(candidate.first, candidate.second);
+    }
+  }
+}
+
+// Build the pending state for the very first auth map: a rotating-key
+// incremental, the initial keyring (when a keyring is required) and the
+// starting max_global_id.  Also sets format_version to 3.
+void AuthMonitor::create_initial()
+{
+  dout(10) << "create_initial -- creating initial map" << dendl;
+
+  // initialize rotating keys; after this the rotating update must be the
+  // one and only pending incremental
+  mon.key_server.clear_secrets();
+  check_rotate();
+  ceph_assert(pending_auth.size() == 1);
+
+  if (mon.is_keyring_required()) {
+    KeyRing initial;
+    get_initial_keyring(&initial);   // existing mkfs-time keyring, if any
+    create_initial_keys(&initial);   // fill in whatever is missing
+    import_keyring(initial);         // queue the result
+  }
+
+  max_global_id = MIN_GLOBAL_ID;
+
+  Incremental gid_inc;
+  gid_inc.inc_type = GLOBAL_ID;
+  gid_inc.max_global_id = max_global_id;
+  pending_auth.push_back(gid_inc);
+
+  format_version = 3;
+}
+
+// Bring the in-memory key server up to the last committed paxos version.
+//
+// If a stashed full version newer than the key server exists it is loaded
+// first; after that every remaining incremental version is decoded and
+// applied in order.  need_bootstrap is not touched.
+void AuthMonitor::update_from_paxos(bool *need_bootstrap)
+{
+  dout(10) << __func__ << dendl;
+  load_health();
+
+  const version_t committed = get_last_committed();
+  version_t applied = mon.key_server.get_ver();
+  if (committed == applied)
+    return;
+  ceph_assert(committed > applied);
+
+  const version_t latest_full = get_version_latest_full();
+
+  dout(10) << __func__ << " version " << committed << " keys ver " << applied
+           << " latest " << latest_full << dendl;
+
+  // shortcut: jump straight to the newest full snapshot if it is ahead
+  if ((latest_full > 0) && (latest_full > applied)) {
+    bufferlist full_bl;
+    int err = get_version_full(latest_full, full_bl);
+    ceph_assert(err == 0);
+    ceph_assert(full_bl.length() != 0);
+    dout(7) << __func__ << " loading summary e " << latest_full << dendl;
+    dout(7) << __func__ << " latest length " << full_bl.length() << dendl;
+
+    auto it = full_bl.cbegin();
+    __u8 struct_v;
+    decode(struct_v, it);
+    decode(max_global_id, it);
+    decode(mon.key_server, it);
+    mon.key_server.set_ver(latest_full);
+    applied = latest_full;
+  }
+
+  dout(10) << __func__ << " key server version " << mon.key_server.get_ver() << dendl;
+
+  // walk through incrementals
+  while (applied < committed) {
+    const version_t next = applied + 1;
+
+    bufferlist inc_bl;
+    int ret = get_version(next, inc_bl);
+    ceph_assert(ret == 0);
+    ceph_assert(inc_bl.length());
+
+    // reset if we are moving to initial state.  we will normally have
+    // keys in here temporarily for bootstrapping that we need to
+    // clear out.
+    if (applied == 0)
+      mon.key_server.clear_secrets();
+
+    dout(20) << __func__ << " walking through version " << next
+             << " len " << inc_bl.length() << dendl;
+
+    auto it = inc_bl.cbegin();
+    __u8 v;
+    decode(v, it);
+    while (!it.end()) {
+      Incremental inc;
+      decode(inc, it);
+      if (inc.inc_type == GLOBAL_ID) {
+        max_global_id = inc.max_global_id;
+      } else if (inc.inc_type == AUTH_DATA) {
+        KeyServerData::Incremental auth_inc;
+        auto data_it = inc.auth_data.cbegin();
+        decode(auth_inc, data_it);
+        mon.key_server.apply_data_incremental(auth_inc);
+      }
+      // any other inc_type is ignored
+    }
+
+    applied = next;
+    mon.key_server.set_ver(applied);
+
+    // once version 1 is applied the mkfs-time keyring is erased from the store
+    if (applied == 1 && mon.is_keyring_required()) {
+      auto t(std::make_shared<MonitorDBStore::Transaction>());
+      t->erase("mkfs", "keyring");
+      mon.store->apply_transaction(t);
+    }
+  }
+
+  {
+    std::lock_guard l(mon.auth_lock);
+    if (last_allocated_id == 0) {
+      last_allocated_id = max_global_id;
+      dout(10) << __func__ << " last_allocated_id initialized to "
+               << max_global_id << dendl;
+    }
+  }
+
+  dout(10) << __func__ << " max_global_id=" << max_global_id
+           << " format_version " << format_version
+           << dendl;
+
+  mon.key_server.dump();
+}
+
+// Decide whether max_global_id should be raised.  The caller must hold
+// mon.auth_lock (asserted).
+//
+// True when max_global_id is smaller than one preallocation batch
+// (mon_globalid_prealloc), or when the next id to hand out has reached
+// max_global_id minus half a batch.
+bool AuthMonitor::_should_increase_max_global_id()
+{
+  ceph_assert(ceph_mutex_is_locked(mon.auth_lock));
+  auto num_prealloc = g_conf()->mon_globalid_prealloc;
+
+  // checked first so the subtraction below cannot be reached with
+  // max_global_id < num_prealloc
+  const bool below_one_batch = max_global_id < num_prealloc;
+  return below_one_batch ||
+    (last_allocated_id + 1) >= max_global_id - num_prealloc / 2;
+}
+
+// Queue a GLOBAL_ID incremental that raises max_global_id by one
+// preallocation batch (mon_globalid_prealloc).  Leader only (asserted);
+// max_global_id itself is not modified here.
+void AuthMonitor::increase_max_global_id()
+{
+  ceph_assert(mon.is_leader());
+
+  Incremental bump;
+  bump.inc_type = GLOBAL_ID;
+  bump.max_global_id = max_global_id + g_conf()->mon_globalid_prealloc;
+
+  dout(10) << "increasing max_global_id to " << bump.max_global_id << dendl;
+  pending_auth.push_back(bump);
+}
+
+// A proposal is wanted whenever at least one incremental is pending.
+// The delay out-parameter is left unchanged.
+bool AuthMonitor::should_propose(double& delay)
+{
+  const bool have_pending = !pending_auth.empty();
+  return have_pending;
+}
+
+// Start a new pending round by discarding every queued incremental.
+void AuthMonitor::create_pending()
+{
+  pending_auth.clear();
+
+  const version_t next = get_last_committed() + 1;
+  dout(10) << "create_pending v " << next << dendl;
+}
+
+// Encode every pending incremental as version last_committed + 1 into
+// transaction t, then recompute and encode the AUTH_BAD_CAPS health check.
+//
+// The health check covers the entities currently in the key server,
+// adjusted by the pending incrementals: a pending delete clears an
+// entity's findings, a pending add contributes its own.
+void AuthMonitor::encode_pending(MonitorDBStore::TransactionRef t)
+{
+  const version_t next_version = get_last_committed() + 1;
+  dout(10) << __func__ << " v " << next_version << dendl;
+
+  // payload: a one-byte version followed by each incremental in order
+  bufferlist bl;
+  __u8 v = 1;
+  encode(v, bl);
+  for (auto& pending : pending_auth) {
+    pending.encode(bl, mon.get_quorum_con_features());
+  }
+
+  put_version(t, next_version, bl);
+  put_last_committed(t, next_version);
+
+  // health
+  map<string,list<string>> bad_detail;  // entity -> details
+
+  // record one detail line per invalid cap of the given entity
+  auto note_bad_caps = [&](const auto& who, const auto& caps) {
+    for (auto& cap : caps) {
+      ostringstream why;
+      if (!valid_caps(cap.first, cap.second, &why)) {
+        ostringstream line;
+        line << who << " " << why.str();
+        bad_detail[who.to_str()].push_back(line.str());
+      }
+    }
+  };
+
+  for (auto i = mon.key_server.secrets_begin();
+       i != mon.key_server.secrets_end();
+       ++i) {
+    note_bad_caps(i->first, i->second.caps);
+  }
+
+  for (auto& inc : pending_auth) {
+    if (inc.inc_type != AUTH_DATA) {
+      continue;
+    }
+    KeyServerData::Incremental auth_inc;
+    auto iter = inc.auth_data.cbegin();
+    decode(auth_inc, iter);
+    if (auth_inc.op == KeyServerData::AUTH_INC_DEL) {
+      bad_detail.erase(auth_inc.name.to_str());
+    } else if (auth_inc.op == KeyServerData::AUTH_INC_ADD) {
+      note_bad_caps(auth_inc.name, auth_inc.auth.caps);
+    }
+  }
+
+  health_check_map_t next;
+  if (!bad_detail.empty()) {
+    ostringstream summary;
+    summary << bad_detail.size() << " auth entities have invalid capabilities";
+    health_check_t *check = &next.add("AUTH_BAD_CAPS", HEALTH_ERR,
+                                      summary.str(), bad_detail.size());
+    for (auto& per_entity : bad_detail) {
+      for (auto& line : per_entity.second) {
+        check->detail.push_back(line);
+      }
+    }
+  }
+  encode_health(next, t);
+}
+
+// Stash a full snapshot (version byte, max_global_id, key server) for the
+// key server's current version into transaction t and mark it as the
+// latest full version.  Nothing is written for version 0.
+void AuthMonitor::encode_full(MonitorDBStore::TransactionRef t)
+{
+  const version_t keys_ver = mon.key_server.get_ver();
+  // do not stash full version 0 as it will never be removed nor read
+  if (keys_ver == 0)
+    return;
+
+  dout(10) << __func__ << " auth v " << keys_ver << dendl;
+  ceph_assert(get_last_committed() == keys_ver);
+
+  bufferlist snapshot;
+  // the key server lock is held from here to the end of the function
+  std::scoped_lock l{mon.key_server.get_lock()};
+  dout(20) << __func__ << " key server has "
+           << (mon.key_server.has_secrets() ? "" : "no ")
+           << "secrets!" << dendl;
+
+  __u8 v = 1;
+  encode(v, snapshot);
+  encode(max_global_id, snapshot);
+  encode(mon.key_server, snapshot);
+
+  put_version_full(t, keys_ver, snapshot);
+  put_version_latest_full(t, keys_ver);
+}
+
+// Version up to which old auth versions may be trimmed.  On the leader,
+// keeps the most recent 2 * paxos_max_join_drift versions; returns 0 on a
+// non-leader or while there are not more versions than that.
+version_t AuthMonitor::get_trim_to() const
+{
+  const unsigned keep = g_conf()->paxos_max_join_drift * 2;
+  const version_t committed = get_last_committed();
+
+  if (!mon.is_leader() || committed <= keep)
+    return 0;
+  return committed - keep;
+}
+
+// First-stage dispatch of an incoming message by type.  Returns whatever
+// the per-type handler returns; global-id and used-pending-keys requests
+// are never handled at this stage (false).  Unknown types abort.
+bool AuthMonitor::preprocess_query(MonOpRequestRef op)
+{
+  auto m = op->get_req<PaxosServiceMessage>();
+  dout(10) << "preprocess_query " << *m << " from " << m->get_orig_source_inst() << dendl;
+
+  switch (m->get_type()) {
+  case MSG_MON_GLOBAL_ID:
+  case MSG_MON_USED_PENDING_KEYS:
+    return false;
+
+  case CEPH_MSG_AUTH:
+    return prep_auth(op, false);
+
+  case MSG_MON_COMMAND:
+    try {
+      return preprocess_command(op);
+    } catch (const bad_cmd_get& e) {
+      // malformed command arguments: answer with -EINVAL and the reason
+      bufferlist empty;
+      mon.reply_command(op, -EINVAL, e.what(), empty, get_last_committed());
+      return true;
+    }
+
+  default:
+    ceph_abort();
+    return true;
+  }
+}
+
+// Second-stage dispatch of an incoming message by type; each type is
+// forwarded to its prepare handler and that handler's result is returned.
+// Unknown types abort.
+bool AuthMonitor::prepare_update(MonOpRequestRef op)
+{
+  auto m = op->get_req<PaxosServiceMessage>();
+  dout(10) << "prepare_update " << *m << " from " << m->get_orig_source_inst() << dendl;
+
+  switch (m->get_type()) {
+  case CEPH_MSG_AUTH:
+    return prep_auth(op, true);
+
+  case MSG_MON_GLOBAL_ID:
+    return prepare_global_id(op);
+
+  case MSG_MON_USED_PENDING_KEYS:
+    return prepare_used_pending_keys(op);
+
+  case MSG_MON_COMMAND:
+    try {
+      return prepare_command(op);
+    } catch (const bad_cmd_get& e) {
+      // malformed command arguments: answer with -EINVAL and the reason
+      bufferlist empty;
+      mon.reply_command(op, -EINVAL, e.what(), empty, get_last_committed());
+      return true;
+    }
+
+  default:
+    ceph_abort();
+    return false;
+  }
+}
+
+// Record the monitor count and this monitor's rank, which
+// _assign_global_id() uses when computing ids.  The caller must hold
+// mon.auth_lock (asserted).
+void AuthMonitor::_set_mon_num_rank(int num, int rank)
+{
+  dout(10) << __func__ << " num " << num << " rank " << rank << dendl;
+  ceph_assert(ceph_mutex_is_locked(mon.auth_lock));
+
+  mon_num = num;
+  mon_rank = rank;
+}
+
+// Allocate the next global id for this monitor, or return 0 when none can
+// be handed out.  The caller must hold mon.auth_lock (asserted).
+//
+// The candidate is last_allocated_id + 1 rounded up to a multiple of
+// mon_num, plus mon_rank -- presumably so that monitors with different
+// ranks never hand out the same id.  Returns 0 when mon_num/mon_rank are
+// not set, when last_allocated_id is still 0, or when the candidate would
+// reach max_global_id.
+uint64_t AuthMonitor::_assign_global_id()
+{
+  ceph_assert(ceph_mutex_is_locked(mon.auth_lock));
+
+  if (mon_num < 1 || mon_rank < 0) {
+    dout(10) << __func__ << " inactive (num_mon " << mon_num
+             << " rank " << mon_rank << ")" << dendl;
+    return 0;
+  }
+  if (last_allocated_id == 0) {
+    dout(10) << __func__ << " last_allocated_id == 0" << dendl;
+    return 0;
+  }
+
+  uint64_t candidate = last_allocated_id + 1;
+  const int past_multiple = candidate % mon_num;
+  const int padding = past_multiple ? mon_num - past_multiple : 0;
+  candidate += padding + mon_rank;
+
+  if (candidate >= max_global_id) {
+    dout(10) << __func__ << " failed (max " << max_global_id << ")" << dendl;
+    return 0;
+  }
+
+  last_allocated_id = candidate;
+  dout(10) << __func__ << " " << candidate << " (max " << max_global_id << ")"
+           << dendl;
+  return candidate;
+}
+
+// Locking wrapper around _assign_global_id().  Returns the new id, or 0
+// when none could be assigned.
+//
+// When should_increase_max is true and this monitor is the leader, a
+// max_global_id increase is also queued if the pool is running low.
+uint64_t AuthMonitor::assign_global_id(bool should_increase_max)
+{
+  uint64_t id;
+  bool want_increase = false;
+  {
+    std::lock_guard l(mon.auth_lock);
+    id = _assign_global_id();
+    if (should_increase_max) {
+      want_increase = _should_increase_max_global_id();
+    }
+  }
+
+  // queued outside the lock
+  if (mon.is_leader() && want_increase) {
+    increase_max_global_id();
+  }
+  return id;
+}
+/*
+ * Handle one MAuth message for the sending session.
+ *
+ * paxos_writable is false when called from preprocess_query() and true
+ * when called from prepare_update().
+ *
+ * Outline: on the first message (protocol 0, no handler yet) decode the
+ * client's offer, pick an auth type and create the session's handler;
+ * assign a global_id if the connection has none; run the handler; send an
+ * MAuthReply carrying the result.
+ *
+ * Every path returns true except one: no global_id could be assigned,
+ * this monitor is the leader and paxos_writable is false; that path
+ * returns false.
+ */
+bool AuthMonitor::prep_auth(MonOpRequestRef op, bool paxos_writable)
+{
+ auto m = op->get_req<MAuth>();
+ dout(10) << "prep_auth() blob_size=" << m->get_auth_payload().length() << dendl;
+
+ MonSession *s = op->get_session();
+ if (!s) {
+ dout(10) << "no session, dropping" << dendl;
+ return true; // nothing is sent back
+ }
+
+ int ret = 0;
+ MAuthReply *reply;
+ bufferlist response_bl;
+ auto indata = m->auth_payload.cbegin();
+ __u32 proto = m->protocol;
+ bool start = false; // true when a handler was created by this call
+ bool finished = false; // true when this call completed authentication
+ EntityName entity_name;
+ bool is_new_global_id = false;
+
+ // set up handler? (first message of a session: protocol 0, no handler yet)
+ if (m->protocol == 0 && !s->auth_handler) {
+ set<__u32> supported;
+
+ try {
+ __u8 struct_v = 1;
+ decode(struct_v, indata);
+ decode(supported, indata);
+ decode(entity_name, indata);
+ decode(s->con->peer_global_id, indata);
+ } catch (const ceph::buffer::error &e) {
+ dout(10) << "failed to decode initial auth message" << dendl;
+ ret = -EINVAL;
+ goto reply;
+ }
+
+ // do we require cephx signatures? if the peer lacks a required feature, cephx is removed from its offer
+
+ if (!m->get_connection()->has_feature(CEPH_FEATURE_MSG_AUTH)) {
+ if (entity_name.get_type() == CEPH_ENTITY_TYPE_MON ||
+ entity_name.get_type() == CEPH_ENTITY_TYPE_OSD ||
+ entity_name.get_type() == CEPH_ENTITY_TYPE_MDS ||
+ entity_name.get_type() == CEPH_ENTITY_TYPE_MGR) {
+ if (g_conf()->cephx_cluster_require_signatures ||
+ g_conf()->cephx_require_signatures) {
+ dout(1) << m->get_source_inst()
+ << " supports cephx but not signatures and"
+ << " 'cephx [cluster] require signatures = true';"
+ << " disallowing cephx" << dendl;
+ supported.erase(CEPH_AUTH_CEPHX);
+ }
+ } else {
+ if (g_conf()->cephx_service_require_signatures ||
+ g_conf()->cephx_require_signatures) {
+ dout(1) << m->get_source_inst()
+ << " supports cephx but not signatures and"
+ << " 'cephx [service] require signatures = true';"
+ << " disallowing cephx" << dendl;
+ supported.erase(CEPH_AUTH_CEPHX);
+ }
+ }
+ } else if (!m->get_connection()->has_feature(CEPH_FEATURE_CEPHX_V2)) {
+ if (entity_name.get_type() == CEPH_ENTITY_TYPE_MON ||
+ entity_name.get_type() == CEPH_ENTITY_TYPE_OSD ||
+ entity_name.get_type() == CEPH_ENTITY_TYPE_MDS ||
+ entity_name.get_type() == CEPH_ENTITY_TYPE_MGR) {
+ if (g_conf()->cephx_cluster_require_version >= 2 ||
+ g_conf()->cephx_require_version >= 2) {
+ dout(1) << m->get_source_inst()
+ << " supports cephx but not v2 and"
+ << " 'cephx [cluster] require version >= 2';"
+ << " disallowing cephx" << dendl;
+ supported.erase(CEPH_AUTH_CEPHX);
+ }
+ } else {
+ if (g_conf()->cephx_service_require_version >= 2 ||
+ g_conf()->cephx_require_version >= 2) {
+ dout(1) << m->get_source_inst()
+ << " supports cephx but not v2 and"
+ << " 'cephx [service] require version >= 2';"
+ << " disallowing cephx" << dendl;
+ supported.erase(CEPH_AUTH_CEPHX);
+ }
+ }
+ }
+
+ int type; // auth type picked from what is left of the offer
+ if (entity_name.get_type() == CEPH_ENTITY_TYPE_MON ||
+ entity_name.get_type() == CEPH_ENTITY_TYPE_OSD ||
+ entity_name.get_type() == CEPH_ENTITY_TYPE_MDS ||
+ entity_name.get_type() == CEPH_ENTITY_TYPE_MGR)
+ type = mon.auth_cluster_required.pick(supported);
+ else
+ type = mon.auth_service_required.pick(supported);
+
+ s->auth_handler = get_auth_service_handler(type, g_ceph_context, &mon.key_server);
+ if (!s->auth_handler) {
+ dout(1) << "client did not provide supported auth type" << dendl;
+ ret = -ENOTSUP;
+ goto reply;
+ }
+ start = true;
+ proto = type; // the reply reports the picked type as the protocol
+ } else if (!s->auth_handler) {
+ dout(10) << "protocol specified but no s->auth_handler" << dendl;
+ ret = -EINVAL;
+ goto reply;
+ }
+
+ /* assign a new global_id? we assume this should only happen on the first
+ request. If a client tries to send it later, it'll screw up its auth
+ session */
+ if (!s->con->peer_global_id) {
+ s->con->peer_global_id = assign_global_id(paxos_writable);
+ if (!s->con->peer_global_id) {
+ // no id available: discard the handler; the retry paths below re-run this function from the top
+ delete s->auth_handler;
+ s->auth_handler = NULL;
+
+ if (mon.is_leader() && paxos_writable) {
+ dout(10) << "increasing global id, waitlisting message" << dendl;
+ wait_for_active(op, new C_RetryMessage(this, op));
+ goto done;
+ }
+
+ if (!mon.is_leader()) {
+ dout(10) << "not the leader, requesting more ids from leader" << dendl;
+ int leader = mon.get_leader();
+ MMonGlobalID *req = new MMonGlobalID();
+ req->old_max_id = max_global_id;
+ mon.send_mon_message(req, leader);
+ wait_for_finished_proposal(op, new C_RetryMessage(this, op));
+ return true;
+ }
+
+ ceph_assert(!paxos_writable); // only leader with !paxos_writable gets here
+ return false;
+ }
+ is_new_global_id = true;
+ }
+
+ try {
+ if (start) {
+ // new session
+ ret = s->auth_handler->start_session(entity_name,
+ s->con->peer_global_id,
+ is_new_global_id,
+ &response_bl,
+ &s->con->peer_caps_info);
+ } else {
+ // request on an already established handler
+ ret = s->auth_handler->handle_request(
+ indata,
+ 0, // no connection_secret needed
+ &response_bl,
+ &s->con->peer_caps_info,
+ nullptr, nullptr);
+ }
+ if (ret == -EIO) { // no reply is sent; the message is retried later
+ wait_for_active(op, new C_RetryMessage(this,op));
+ goto done;
+ }
+ if (ret > 0) { // a positive result is reported to the client as 0
+ if (!s->authenticated &&
+ mon.ms_handle_fast_authentication(s->con.get()) > 0) {
+ finished = true;
+ }
+ ret = 0;
+ }
+ } catch (const ceph::buffer::error &err) {
+ ret = -EINVAL;
+ dout(0) << "caught error when trying to handle auth request, probably malformed request" << dendl;
+ }
+
+reply:
+ reply = new MAuthReply(proto, &response_bl, ret, s->con->peer_global_id);
+ mon.send_reply(op, reply);
+ if (finished) {
+ // send the latest monmap if the client's epoch is older than ours.
+ if (m->monmap_epoch < mon.monmap->get_epoch())
+ mon.send_latest_monmap(m->get_connection().get());
+
+ mon.configmon()->check_sub(s);
+ }
+done:
+ return true;
+}
+
+// Handle the "auth ..." commands that only read state.
+//
+// Returns false, without replying, for the commands listed below so that
+// they are processed by the update path.  For everything else a reply is
+// always sent and true is returned:
+//   - unparsable command JSON or an invalid entity name -> -EINVAL
+//   - no session                                        -> -EACCES
+//   - "auth export", "auth get <entity>", "auth print-key" /
+//     "auth print_key" / "auth get-key", "auth list" / "auth ls"
+//                                                       -> 0 or -ENOENT
+//   - any other prefix                                  -> -EINVAL
+bool AuthMonitor::preprocess_command(MonOpRequestRef op)
+{
+  auto m = op->get_req<MMonCommand>();
+  int r = -1;
+  bufferlist rdata;
+  stringstream ss, ds;
+
+  cmdmap_t cmdmap;
+  if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
+    // ss has reason for failure
+    string rs = ss.str();
+    mon.reply_command(op, -EINVAL, rs, rdata, get_last_committed());
+    return true;
+  }
+
+  string prefix;
+  cmd_getval(cmdmap, "prefix", prefix);
+  if (prefix == "auth add" ||
+      prefix == "auth del" ||
+      prefix == "auth rm" ||
+      prefix == "auth get-or-create" ||
+      prefix == "auth get-or-create-key" ||
+      prefix == "auth get-or-create-pending" ||
+      prefix == "auth clear-pending" ||
+      prefix == "auth commit-pending" ||
+      prefix == "fs authorize" ||
+      prefix == "auth import" ||
+      prefix == "auth caps") {
+    return false;
+  }
+
+  MonSession *session = op->get_session();
+  if (!session) {
+    mon.reply_command(op, -EACCES, "access denied", rdata, get_last_committed());
+    return true;
+  }
+
+  // entity might not be supplied, but if it is, it should be valid
+  string entity_name;
+  cmd_getval(cmdmap, "entity", entity_name);
+  EntityName entity;
+  if (!entity_name.empty() && !entity.from_str(entity_name)) {
+    ss << "invalid entity_auth " << entity_name;
+    mon.reply_command(op, -EINVAL, ss.str(), get_last_committed());
+    return true;
+  }
+
+  // f is tested for null before each use; a null formatter selects the
+  // plaintext encoders
+  string format = cmd_getval_or<string>(cmdmap, "format", "plain");
+  boost::scoped_ptr<Formatter> f(Formatter::create(format));
+
+  if (prefix == "auth export") {
+    KeyRing keyring;
+    export_keyring(keyring);
+    if (!entity_name.empty()) {
+      // export a single entity
+      EntityAuth eauth;
+      if (keyring.get_auth(entity, eauth)) {
+        _encode_auth(entity, eauth, rdata, f.get());
+        r = 0;
+      } else {
+        // name the entity that was asked for; eauth is still
+        // default-constructed here and says nothing useful
+        ss << "no key for " << entity;
+        r = -ENOENT;
+      }
+    } else {
+      // export everything
+      if (f)
+        keyring.encode_formatted("auth", f.get(), rdata);
+      else
+        keyring.encode_plaintext(rdata);
+      r = 0;
+    }
+  } else if (prefix == "auth get" && !entity_name.empty()) {
+    EntityAuth entity_auth;
+    if (!mon.key_server.get_auth(entity, entity_auth)) {
+      ss << "failed to find " << entity_name << " in keyring";
+      r = -ENOENT;
+    } else {
+      _encode_auth(entity, entity_auth, rdata, f.get());
+      r = 0;
+    }
+  } else if (prefix == "auth print-key" ||
+             prefix == "auth print_key" ||
+             prefix == "auth get-key") {
+    EntityAuth auth;
+    if (!mon.key_server.get_auth(entity, auth)) {
+      ss << "don't have " << entity;
+      r = -ENOENT;
+    } else {
+      if (f) {
+        auth.key.encode_formatted("auth", f.get(), rdata);
+      } else {
+        auth.key.encode_plaintext(rdata);
+      }
+      r = 0;
+    }
+  } else if (prefix == "auth list" ||
+             prefix == "auth ls") {
+    if (f) {
+      mon.key_server.encode_formatted("auth", f.get(), rdata);
+    } else {
+      mon.key_server.encode_plaintext(rdata);
+    }
+    r = 0;
+  } else {
+    ss << "invalid command";
+    r = -EINVAL;
+  }
+
+  rdata.append(ds);
+  string rs;
+  getline(ss, rs, '\0');
+  mon.reply_command(op, r, rs, rdata, get_last_committed());
+  return true;
+}
+
+// Fill `keyring` from the key server; pure delegation to
+// KeyServer::export_keyring().
+void AuthMonitor::export_keyring(KeyRing& keyring)
+{
+  auto& key_server = mon.key_server;
+  key_server.export_keyring(keyring);
+}
+
+// Queue an add for every entity in `keyring`.
+//
+// Returns 0 on success, or -EINVAL as soon as an entity without any caps
+// is met.
+// NOTE(review): entities visited before the offending one have already
+// been queued when -EINVAL is returned -- confirm callers expect that.
+int AuthMonitor::import_keyring(KeyRing& keyring)
+{
+  dout(10) << __func__ << " " << keyring.size() << " keys" << dendl;
+
+  for (auto& entry : keyring.get_keys()) {
+    if (entry.second.caps.empty()) {
+      dout(0) << "import: no caps supplied" << dendl;
+      return -EINVAL;
+    }
+    int err = add_entity(entry.first, entry.second);
+    ceph_assert(err == 0);
+  }
+  return 0;
+}
+
+// Queue the deletion of `entity`.  Returns 0 when the deletion was
+// queued, -ENOENT when the key server does not contain the entity.
+int AuthMonitor::remove_entity(const EntityName &entity)
+{
+  dout(10) << __func__ << " " << entity << dendl;
+
+  if (mon.key_server.contains(entity)) {
+    KeyServerData::Incremental del_inc;
+    del_inc.name = entity;
+    del_inc.op = KeyServerData::AUTH_INC_DEL;
+    push_cephx_inc(del_inc);
+    return 0;
+  }
+  return -ENOENT;
+}
+
+// True when pending_auth already holds a not-yet-committed AUTH_INC_ADD
+// for `entity`, i.e. we are about to have it.
+bool AuthMonitor::entity_is_pending(EntityName& entity)
+{
+  for (auto& queued : pending_auth) {
+    if (queued.inc_type != AUTH_DATA) {
+      continue;
+    }
+
+    KeyServerData::Incremental queued_inc;
+    auto it = queued.auth_data.cbegin();
+    decode(queued_inc, it);
+
+    const bool adds_entity =
+      queued_inc.op == KeyServerData::AUTH_INC_ADD &&
+      queued_inc.name == entity;
+    if (adds_entity) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// Convenience overload: unpack an auth_entity_t and compare against the
+// entity's own caps.  See the five-argument overload for return values.
+int AuthMonitor::exists_and_matches_entity(
+  const auth_entity_t& entity,
+  bool has_secret,
+  stringstream& ss)
+{
+  const EntityAuth& wanted = entity.auth;
+  return exists_and_matches_entity(entity.name, wanted, wanted.caps,
+                                   has_secret, ss);
+}
+
+// Check whether `name` already exists in the key server with the same key
+// and caps.
+//
+// Returns:
+//   -ENOENT  the entity does not exist
+//   -EEXIST  has_secret is set and the stored key differs from auth.key
+//   -EINVAL  the cap sets differ (reason written to ss)
+//   0        the entity exists and matches
+// The key is compared only when has_secret is true.
+int AuthMonitor::exists_and_matches_entity(
+  const EntityName& name,
+  const EntityAuth& auth,
+  const map<string,bufferlist>& caps,
+  bool has_secret,
+  stringstream& ss)
+{
+
+  dout(20) << __func__ << " entity " << name << " auth " << auth
+           << " caps " << caps << " has_secret " << has_secret << dendl;
+
+  // does entry already exist?
+  EntityAuth existing_auth;
+  if (!mon.key_server.get_auth(name, existing_auth)) {
+    return -ENOENT;
+  }
+
+  // key match?
+  if (has_secret &&
+      existing_auth.key.get_secret().cmp(auth.key.get_secret())) {
+    ss << "entity " << name << " exists but key does not match";
+    return -EEXIST;
+  }
+
+  // caps match?  same number of entries, and every wanted cap present
+  // with identical contents
+  if (caps.size() != existing_auth.caps.size()) {
+    ss << "entity " << name << " exists but caps do not match";
+    return -EINVAL;
+  }
+  for (auto& wanted : caps) {
+    auto have = existing_auth.caps.find(wanted.first);
+    if (have == existing_auth.caps.end() ||
+        !have->second.contents_equal(wanted.second)) {
+      ss << "entity " << name << " exists but cap "
+         << wanted.first << " does not match";
+      return -EINVAL;
+    }
+  }
+
+  // they match, no-op
+  return 0;
+}
+
+// Queue an AUTH_INC_ADD for (name, auth).  No existence check is done
+// here; always returns 0.
+int AuthMonitor::add_entity(
+  const EntityName& name,
+  const EntityAuth& auth)
+{
+  KeyServerData::Incremental add_inc;
+  add_inc.op = KeyServerData::AUTH_INC_ADD;
+  add_inc.name = name;
+  add_inc.auth = auth;
+
+  dout(10) << " add auth entity " << add_inc.name << dendl;
+  dout(30) << "    " << add_inc.auth << dendl;
+
+  push_cephx_inc(add_inc);
+  return 0;
+}
+
+// Work out which auth entities belong to an OSD that is being destroyed.
+//
+// Fills cephx_entity with "osd.<id>" and lockbox_entity with
+// "client.osd-lockbox.<uuid>".  Paxos must be plugged (asserted).
+//
+// Returns -EINVAL (reason in ss) if either name fails to parse, -ENOENT
+// if the key server contains neither entity, 0 otherwise.
+int AuthMonitor::validate_osd_destroy(
+  int32_t id,
+  const uuid_d& uuid,
+  EntityName& cephx_entity,
+  EntityName& lockbox_entity,
+  stringstream& ss)
+{
+  ceph_assert(paxos.is_plugged());
+
+  dout(10) << __func__ << " id " << id << " uuid " << uuid << dendl;
+
+  const string cephx_str = "osd." + stringify(id);
+  const string lockbox_str = "client.osd-lockbox." + stringify(uuid);
+
+  if (!cephx_entity.from_str(cephx_str)) {
+    dout(10) << __func__ << " invalid cephx entity '"
+             << cephx_str << "'" << dendl;
+    ss << "invalid cephx key entity '" << cephx_str << "'";
+    return -EINVAL;
+  }
+
+  if (!lockbox_entity.from_str(lockbox_str)) {
+    dout(10) << __func__ << " invalid lockbox entity '"
+             << lockbox_str << "'" << dendl;
+    ss << "invalid lockbox key entity '" << lockbox_str << "'";
+    return -EINVAL;
+  }
+
+  const bool any_known =
+    mon.key_server.contains(cephx_entity) ||
+    mon.key_server.contains(lockbox_entity);
+  return any_known ? 0 : -ENOENT;
+}
+
+// Queue removal of an OSD's cephx and lockbox entities and, if at least
+// one of them existed, call propose_pending().  Requires paxos to be
+// plugged and the service to be writeable (both asserted).
+void AuthMonitor::do_osd_destroy(
+  const EntityName& cephx_entity,
+  const EntityName& lockbox_entity)
+{
+  ceph_assert(paxos.is_plugged());
+  ceph_assert(is_writeable());
+
+  dout(10) << __func__ << " cephx " << cephx_entity
+           << " lockbox " << lockbox_entity << dendl;
+
+  const bool cephx_missing = (remove_entity(cephx_entity) == -ENOENT);
+  if (cephx_missing) {
+    dout(10) << __func__ << " " << cephx_entity << " does not exist" << dendl;
+  }
+
+  const bool lockbox_missing = (remove_entity(lockbox_entity) == -ENOENT);
+  if (lockbox_missing) {
+    dout(10) << __func__ << " " << lockbox_entity << " does not exist" << dendl;
+  }
+
+  if (cephx_missing && lockbox_missing) {
+    dout(10) << __func__ << " entities do not exist -- no-op." << dendl;
+    return;
+  }
+
+  // given we have paxos plugged, this will not result in a proposal
+  // being triggered, but it will still be needed so that we get our
+  // pending state encoded into the paxos' pending transaction.
+  propose_pending();
+}
+
+ // Populate `auth` from a base64-encoded secret and a set of encoded caps.
+ // Returns -EINVAL when `key` is empty or decode_base64() throws a
+ // ceph::buffer::error; the caps are only assigned once the key has been
+ // decoded. Returns 0 on success.
+ int _create_auth(
+   EntityAuth& auth,
+   const string& key,
+   const map<string,bufferlist>& caps)
+ {
+   bool decoded = false;
+   if (!key.empty()) {
+     try {
+       auth.key.decode_base64(key);
+       decoded = true;
+     } catch (ceph::buffer::error&) {
+       // malformed secret: reported as -EINVAL below.
+     }
+   }
+
+   if (!decoded) {
+     return -EINVAL;
+   }
+
+   auth.caps = caps;
+   return 0;
+ }
+
+ // Validate the secrets supplied for a new OSD and prepare the auth entities
+ // that go with them.
+ //
+ // cephx_entity and lockbox_entity are outputs: their names are parsed from
+ // "osd.<id>" and "client.osd-lockbox.<uuid>", and their auth records are
+ // built from the supplied secrets plus a fixed set of caps. The lockbox
+ // entity is only handled when lockbox_secret is non-empty.
+ //
+ // Returns:
+ //   -EINVAL  a name or secret is invalid (reason appended to ss)
+ //   -EAGAIN  one of the entities has a pending change; retry later
+ //   <0       any other error from exists_and_matches_entity()
+ //   EEXIST   (positive) every requested entity already exists and matches
+ //   0        at least one entity still has to be created
+ int AuthMonitor::validate_osd_new(
+   int32_t id,
+   const uuid_d& uuid,
+   const string& cephx_secret,
+   const string& lockbox_secret,
+   auth_entity_t& cephx_entity,
+   auth_entity_t& lockbox_entity,
+   stringstream& ss)
+ {
+   dout(10) << __func__ << " osd." << id << " uuid " << uuid << dendl;
+
+   // caps handed to the OSD's own key
+   map<string,bufferlist> osd_caps;
+   osd_caps["osd"] = _encode_cap("allow *");
+   osd_caps["mon"] = _encode_cap("allow profile osd");
+   osd_caps["mgr"] = _encode_cap("allow profile osd");
+
+   // the lockbox key may only fetch this OSD's dm-crypt config-key
+   map<string,bufferlist> box_caps;
+   box_caps["mon"] = _encode_cap("allow command \"config-key get\" "
+                                 "with key=\"dm-crypt/osd/" +
+                                 stringify(uuid) +
+                                 "/luks\"");
+
+   const bool has_lockbox = !lockbox_secret.empty();
+
+   const string osd_name = "osd." + stringify(id);
+   const string box_name = "client.osd-lockbox." + stringify(uuid);
+
+   if (!cephx_entity.name.from_str(osd_name)) {
+     dout(10) << __func__ << " invalid cephx entity '"
+              << osd_name << "'" << dendl;
+     ss << "invalid cephx key entity '" << osd_name << "'";
+     return -EINVAL;
+   }
+
+   if (has_lockbox && !lockbox_entity.name.from_str(box_name)) {
+     dout(10) << __func__ << " invalid cephx lockbox entity '"
+              << box_name << "'" << dendl;
+     ss << "invalid cephx lockbox entity '" << box_name << "'";
+     return -EINVAL;
+   }
+
+   // If we have pending entities for either the cephx secret or the
+   // lockbox secret, then our safest bet is to retry the command at
+   // a later time. These entities may be pending because an `osd new`
+   // command has been run (which is unlikely, due to the nature of
+   // the operation, which will force a paxos proposal), or (more likely)
+   // because a competing client created those entities before we handled
+   // the `osd new` command. Regardless, let's wait and see.
+   if (entity_is_pending(cephx_entity.name) ||
+       (has_lockbox && entity_is_pending(lockbox_entity.name))) {
+     return -EAGAIN;
+   }
+
+   if (!is_valid_cephx_key(cephx_secret)) {
+     ss << "invalid cephx secret.";
+     return -EINVAL;
+   }
+
+   if (has_lockbox && !is_valid_cephx_key(lockbox_secret)) {
+     ss << "invalid cephx lockbox secret.";
+     return -EINVAL;
+   }
+
+   // the secret was validated just above, so building the auth must succeed.
+   int err = _create_auth(cephx_entity.auth, cephx_secret, osd_caps);
+   ceph_assert(0 == err);
+
+   bool osd_matches = false;
+   bool box_matches = false;
+
+   err = exists_and_matches_entity(cephx_entity, true, ss);
+   if (err == -ENOENT) {
+     // not there yet; it will have to be created.
+   } else if (err < 0) {
+     return err;
+   } else {
+     ceph_assert(0 == err);
+     osd_matches = true;
+   }
+
+   if (has_lockbox) {
+     err = _create_auth(lockbox_entity.auth, lockbox_secret, box_caps);
+     ceph_assert(err == 0);
+
+     err = exists_and_matches_entity(lockbox_entity, true, ss);
+     if (err == -ENOENT) {
+       // not there yet; it will have to be created.
+     } else if (err < 0) {
+       return err;
+     } else {
+       ceph_assert(0 == err);
+       box_matches = true;
+     }
+   }
+
+   // NOTE(review): this is a positive EEXIST, unlike the negative error
+   // codes above; presumably callers rely on that to tell "already there
+   // and identical" apart from a failure -- confirm before changing.
+   if (osd_matches && (!has_lockbox || box_matches)) {
+     return EEXIST;
+   }
+
+   return 0;
+ }
+
+ // Add the cephx entity (and, when has_lockbox is set, the lockbox entity)
+ // of a new OSD unless the key server already contains it, then call
+ // propose_pending(). Asserts that paxos is plugged. Always returns 0.
+ int AuthMonitor::do_osd_new(
+   const auth_entity_t& cephx_entity,
+   const auth_entity_t& lockbox_entity,
+   bool has_lockbox)
+ {
+   ceph_assert(paxos.is_plugged());
+
+   // the log line is assembled in several steps; the dout / *_dout / dendl
+   // sequence below is kept exactly as it was.
+   dout(10) << __func__ << " cephx " << cephx_entity.name
+            << " lockbox ";
+   if (has_lockbox) {
+     *_dout << lockbox_entity.name;
+   } else {
+     *_dout << "n/a";
+   }
+   *_dout << dendl;
+
+   // we must have validated before reaching this point.
+   // if keys exist, then this means they also match; otherwise we would
+   // have failed before calling this function.
+   if (!mon.key_server.contains(cephx_entity.name)) {
+     int r = add_entity(cephx_entity.name, cephx_entity.auth);
+     ceph_assert(0 == r);
+   }
+
+   const bool add_lockbox =
+     has_lockbox && !mon.key_server.contains(lockbox_entity.name);
+   if (add_lockbox) {
+     int r = add_entity(lockbox_entity.name, lockbox_entity.auth);
+     ceph_assert(0 == r);
+   }
+
+   // given we have paxos plugged, this will not result in a proposal
+   // being triggered, but it will still be needed so that we get our
+   // pending state encoded into the paxos' pending transaction.
+   propose_pending();
+   return 0;
+ }
+
+ // Try to parse `caps` with the cap parser CAP_ENTITY_CLASS. `entity` is
+ // only used for the debug log line; `out` is passed through to parse().
+ // Returns true iff parsing succeeded.
+ template<typename CAP_ENTITY_CLASS>
+ bool AuthMonitor::_was_parsing_fine(const string& entity, const string& caps,
+   ostream* out)
+ {
+   CAP_ENTITY_CLASS parser;
+
+   if (parser.parse(caps, out)) {
+     return true;
+   }
+
+   dout(20) << "Parsing " << entity << " caps failed. " << entity <<
+     " cap: " << caps << dendl;
+   return false;
+ }
+
+ // Check a single cap string for the cap type `entity`.
+ //
+ // "mon" caps are always parsed. For every other type the result depends on
+ // the mon_auth_validate_all_caps option: when it is off the cap is accepted
+ // unparsed; when it is on, "mgr", "osd" and "mds" caps are parsed and any
+ // other type is rejected with a message written to `out` (if non-null).
+ bool AuthMonitor::valid_caps(const string& entity, const string& caps,
+   ostream *out)
+ {
+   if (entity == "mon") {
+     return _was_parsing_fine<MonCap>(entity, caps, out);
+   }
+
+   const bool validate_all =
+     g_conf().get_val<bool>("mon_auth_validate_all_caps");
+   if (!validate_all) {
+     return true;
+   }
+
+   if (entity == "mgr") {
+     return _was_parsing_fine<MgrCap>(entity, caps, out);
+   }
+   if (entity == "osd") {
+     return _was_parsing_fine<OSDCap>(entity, caps, out);
+   }
+   if (entity == "mds") {
+     return _was_parsing_fine<MDSAuthCaps>(entity, caps, out);
+   }
+
+   if (out) {
+     *out << "unknown cap type '" << entity << "'";
+   }
+   return false;
+ }
+
+ // Check every (cap type, cap string) pair in `caps`; stops at and reports
+ // the first one that valid_caps() rejects. An empty map is valid.
+ bool AuthMonitor::valid_caps(const map<string, string>& caps, ostream *out)
+ {
+   for (const auto& [cap_type, cap_str] : caps) {
+     const bool ok = valid_caps(cap_type, cap_str, out);
+     if (!ok) {
+       return false;
+     }
+   }
+   return true;
+ }
+
+ // Handle the mutating auth commands: "auth import", "auth add",
+ // "auth get-or-create[-key|-pending]", "auth clear-pending",
+ // "auth commit-pending", "fs authorize", "auth caps" and "auth del|rm".
+ //
+ // Returns true when the reply was already sent from an early-exit path or
+ // has been deferred through wait_for_commit(); returns false after
+ // replying immediately through the `done` label at the bottom. Commands
+ // that match no branch fall through to `done` with err == -EINVAL.
+ bool AuthMonitor::prepare_command(MonOpRequestRef op)
+ {
+   auto m = op->get_req<MMonCommand>();
+   stringstream ss, ds;
+   bufferlist rdata; // holds data that'll be printed on client's stdout
+   string rs;
+   int err = -EINVAL;
+
+   cmdmap_t cmdmap;
+   if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
+     // ss has reason for failure
+     rs = ss.str();
+     mon.reply_command(op, -EINVAL, rs, rdata, get_last_committed());
+     return true;
+   }
+
+   string prefix;
+   vector<string> caps_vec;
+   map<string, string> ceph_caps;
+   string entity_name;
+   EntityName entity;
+
+   cmd_getval(cmdmap, "prefix", prefix);
+
+   string format = cmd_getval_or<string>(cmdmap, "format", "plain");
+   boost::scoped_ptr<Formatter> f(Formatter::create(format));
+
+   MonSession *session = op->get_session();
+   if (!session) {
+     mon.reply_command(op, -EACCES, "access denied", rdata, get_last_committed());
+     return true;
+   }
+
+   cmd_getval(cmdmap, "caps", caps_vec);
+   // fs authorize command's can have odd number of caps arguments
+   if (prefix != "fs authorize") {
+     if ((caps_vec.size() % 2) != 0) {
+       ss << "bad capabilities request; odd number of arguments";
+       err = -EINVAL;
+       goto done;
+     } else {
+       // caps arrive as a flat list: <type> <cap> <type> <cap> ...
+       for (size_t i = 0; i < caps_vec.size(); i += 2) {
+         ceph_caps.insert({caps_vec[i], caps_vec[i + 1]});
+       }
+     }
+   }
+
+   cmd_getval(cmdmap, "entity", entity_name);
+   if (!entity_name.empty() && !entity.from_str(entity_name)) {
+     ss << "bad entity name";
+     err = -EINVAL;
+     goto done;
+   }
+
+   if (prefix == "auth import") {
+     bufferlist bl = m->get_data();
+     if (bl.length() == 0) {
+       ss << "auth import: no data supplied";
+       getline(ss, rs);
+       mon.reply_command(op, -EINVAL, rs, get_last_committed());
+       return true;
+     }
+     auto iter = bl.cbegin();
+     KeyRing keyring;
+     try {
+       decode(keyring, iter);
+     } catch (const ceph::buffer::error &ex) {
+       ss << "error decoding keyring" << " " << ex.what();
+       err = -EINVAL;
+       goto done;
+     }
+     err = import_keyring(keyring);
+     if (err < 0) {
+       ss << "auth import: no caps supplied";
+       getline(ss, rs);
+       mon.reply_command(op, -EINVAL, rs, get_last_committed());
+       return true;
+     }
+     err = 0;
+     wait_for_commit(op, new Monitor::C_Command(mon, op, 0, rs,
+                                                get_last_committed() + 1));
+     return true;
+   } else if (prefix == "auth add" && !entity_name.empty()) {
+     /* expected behavior:
+      * - if command reproduces current state, return 0.
+      * - if command adds brand new entity, handle it.
+      * - if command adds new state to existing entity, return error.
+      */
+     KeyServerData::Incremental auth_inc;
+     auth_inc.name = entity;
+     bufferlist bl = m->get_data();
+     bool has_keyring = (bl.length() > 0);
+
+     KeyRing new_keyring;
+     if (has_keyring) {
+       auto iter = bl.cbegin();
+       try {
+         decode(new_keyring, iter);
+       } catch (const ceph::buffer::error &ex) {
+         ss << "error decoding keyring";
+         err = -EINVAL;
+         goto done;
+       }
+     }
+
+     map<string, bufferlist> encoded_caps;
+     if (err = _check_and_encode_caps(ceph_caps, encoded_caps, ss); err < 0) {
+       goto done;
+     }
+
+     // are we about to have it?
+     if (entity_is_pending(entity)) {
+       wait_for_commit(op,
+           new Monitor::C_Command(mon, op, 0, rs, get_last_committed() + 1));
+       return true;
+     }
+
+     // pull info out of provided keyring
+     EntityAuth new_inc;
+     if (has_keyring) {
+       if (!new_keyring.get_auth(auth_inc.name, new_inc)) {
+         ss << "key for " << auth_inc.name
+            << " not found in provided keyring";
+         err = -EINVAL;
+         goto done;
+       }
+       if (!encoded_caps.empty() && !new_inc.caps.empty()) {
+         ss << "caps cannot be specified both in keyring and in command";
+         err = -EINVAL;
+         goto done;
+       }
+       if (encoded_caps.empty()) {
+         encoded_caps = new_inc.caps;
+       }
+     }
+
+     err = exists_and_matches_entity(auth_inc.name, new_inc,
+                                     encoded_caps, has_keyring, ss);
+     // if entity/key/caps do not exist in the keyring, just fall through
+     // and add the entity; otherwise, make sure everything matches (in
+     // which case it's a no-op), because if not we must fail.
+     if (err != -ENOENT) {
+       if (err < 0) {
+         goto done;
+       }
+       // no-op.
+       ceph_assert(err == 0);
+       goto done;
+     }
+     err = 0;
+
+     // okay, add it.
+     if (!has_keyring) {
+       dout(10) << "AuthMonitor::prepare_command generating random key for "
+                << auth_inc.name << dendl;
+       new_inc.key.create(g_ceph_context, CEPH_CRYPTO_AES);
+     }
+     new_inc.caps = encoded_caps;
+
+     err = add_entity(auth_inc.name, new_inc);
+     ceph_assert(err == 0);
+
+     ss << "added key for " << auth_inc.name;
+     getline(ss, rs);
+     wait_for_commit(op, new Monitor::C_Command(mon, op, 0, rs,
+                                                get_last_committed() + 1));
+     return true;
+   } else if ((prefix == "auth get-or-create-pending" ||
+               prefix == "auth clear-pending" ||
+               prefix == "auth commit-pending")) {
+     if (mon.monmap->min_mon_release < ceph_release_t::quincy) {
+       err = -EPERM;
+       ss << "pending_keys are not available until after upgrading to quincy";
+       goto done;
+     }
+
+     EntityAuth entity_auth;
+     if (!mon.key_server.get_auth(entity, entity_auth)) {
+       ss << "entity " << entity << " does not exist";
+       err = -ENOENT;
+       goto done;
+     }
+
+     // is there an uncommitted pending_key? (or any change for this entity)
+     for (auto& p : pending_auth) {
+       if (p.inc_type == AUTH_DATA) {
+         KeyServerData::Incremental auth_inc;
+         auto q = p.auth_data.cbegin();
+         decode(auth_inc, q);
+         if (auth_inc.op == KeyServerData::AUTH_INC_ADD &&
+             auth_inc.name == entity) {
+           wait_for_commit(op, new Monitor::C_Command(mon, op, 0, rs,
+                                                      get_last_committed() + 1));
+           return true;
+         }
+       }
+     }
+
+     if (prefix == "auth get-or-create-pending") {
+       KeyRing kr;
+       bool exists = false;
+       if (!entity_auth.pending_key.empty()) {
+         // a pending key is already there; just report it.
+         kr.add(entity, entity_auth.key, entity_auth.pending_key);
+         err = 0;
+         exists = true;
+       } else {
+         KeyServerData::Incremental auth_inc;
+         auth_inc.op = KeyServerData::AUTH_INC_ADD;
+         auth_inc.name = entity;
+         auth_inc.auth = entity_auth;
+         auth_inc.auth.pending_key.create(g_ceph_context, CEPH_CRYPTO_AES);
+         // queue the incremental once; it used to be pushed a second time
+         // after kr.add(), which only duplicated the same change.
+         push_cephx_inc(auth_inc);
+         kr.add(entity, auth_inc.auth.key, auth_inc.auth.pending_key);
+       }
+       if (f) {
+         kr.encode_formatted("auth", f.get(), rdata);
+       } else {
+         kr.encode_plaintext(rdata);
+       }
+       if (exists) {
+         goto done;
+       }
+     } else if (prefix == "auth clear-pending") {
+       if (entity_auth.pending_key.empty()) {
+         err = 0;
+         goto done;
+       }
+       KeyServerData::Incremental auth_inc;
+       auth_inc.op = KeyServerData::AUTH_INC_ADD;
+       auth_inc.name = entity;
+       auth_inc.auth = entity_auth;
+       auth_inc.auth.pending_key.clear();
+       push_cephx_inc(auth_inc);
+     } else if (prefix == "auth commit-pending") {
+       if (entity_auth.pending_key.empty()) {
+         err = 0;
+         ss << "no pending key";
+         goto done;
+       }
+       // promote the pending key to be the entity's key.
+       KeyServerData::Incremental auth_inc;
+       auth_inc.op = KeyServerData::AUTH_INC_ADD;
+       auth_inc.name = entity;
+       auth_inc.auth = entity_auth;
+       auth_inc.auth.key = auth_inc.auth.pending_key;
+       auth_inc.auth.pending_key.clear();
+       push_cephx_inc(auth_inc);
+     }
+     wait_for_commit(op, new Monitor::C_Command(mon, op, 0, rs, rdata,
+                                                get_last_committed() + 1));
+     return true;
+   } else if ((prefix == "auth get-or-create-key" ||
+               prefix == "auth get-or-create") &&
+              !entity_name.empty()) {
+     // auth get-or-create <name> [mon osdcapa osd osdcapb ...]
+
+     map<string, bufferlist> wanted_caps;
+     if (err = _check_and_encode_caps(ceph_caps, wanted_caps, ss); err < 0) {
+       goto done;
+     }
+
+     // do we have it?
+     EntityAuth entity_auth;
+     if (mon.key_server.get_auth(entity, entity_auth)) {
+       // every requested cap must match what the entity already holds.
+       for (const auto &sys_cap : wanted_caps) {
+         if (entity_auth.caps.count(sys_cap.first) == 0 ||
+             !entity_auth.caps[sys_cap.first].contents_equal(sys_cap.second)) {
+           ss << "key for " << entity << " exists but cap " << sys_cap.first
+              << " does not match";
+           err = -EINVAL;
+           goto done;
+         }
+       }
+
+       if (prefix == "auth get-or-create-key") {
+         if (f) {
+           entity_auth.key.encode_formatted("auth", f.get(), rdata);
+         } else {
+           ds << entity_auth.key;
+         }
+       } else {
+         _encode_key(entity, entity_auth, rdata, f.get(), true,
+                     &entity_auth.caps);
+       }
+       err = 0;
+       goto done;
+     }
+
+     // ...or are we about to?
+     for (vector<Incremental>::iterator p = pending_auth.begin();
+          p != pending_auth.end();
+          ++p) {
+       if (p->inc_type == AUTH_DATA) {
+         KeyServerData::Incremental auth_inc;
+         auto q = p->auth_data.cbegin();
+         decode(auth_inc, q);
+         if (auth_inc.op == KeyServerData::AUTH_INC_ADD &&
+             auth_inc.name == entity) {
+           wait_for_commit(op, new Monitor::C_Command(mon, op, 0, rs,
+                                                      get_last_committed() + 1));
+           return true;
+         }
+       }
+     }
+
+     // create it
+     KeyServerData::Incremental auth_inc;
+     auth_inc.op = KeyServerData::AUTH_INC_ADD;
+     auth_inc.name = entity;
+     auth_inc.auth.key.create(g_ceph_context, CEPH_CRYPTO_AES);
+     auth_inc.auth.caps = wanted_caps;
+
+     push_cephx_inc(auth_inc);
+
+     if (prefix == "auth get-or-create-key") {
+       if (f) {
+         auth_inc.auth.key.encode_formatted("auth", f.get(), rdata);
+       } else {
+         ds << auth_inc.auth.key;
+       }
+     } else {
+       _encode_key(entity, auth_inc.auth, rdata, f.get(), false,
+                   &wanted_caps);
+     }
+
+     rdata.append(ds);
+     getline(ss, rs);
+     wait_for_commit(op, new Monitor::C_Command(mon, op, 0, rs, rdata,
+                                                get_last_committed() + 1));
+     return true;
+   } else if (prefix == "fs authorize") {
+     string filesystem;
+     cmd_getval(cmdmap, "filesystem", filesystem);
+     string mon_cap_string = "allow r";
+     string mds_cap_string, osd_cap_string;
+     string osd_cap_wanted = "r";
+
+     // "*" and "all" mean every filesystem; otherwise it must exist.
+     const Filesystem* fs = nullptr;
+     if (filesystem != "*" && filesystem != "all") {
+       const auto& fsmap = mon.mdsmon()->get_fsmap();
+       fs = fsmap.get_filesystem(filesystem);
+       if (fs == nullptr) {
+         ss << "filesystem " << filesystem << " does not exist.";
+         err = -EINVAL;
+         goto done;
+       } else {
+         mon_cap_string += " fsname=" + string(fs->get_mds_map().get_fs_name());
+       }
+     }
+
+     // caps_vec is a sequence of <path> <cap> [root_squash] groups.
+     for (auto it = caps_vec.begin();
+          it != caps_vec.end() && (it + 1) != caps_vec.end();
+          it += 2) {
+       const string &path = *it;
+       const string &cap = *(it + 1);
+       bool root_squash = false;
+       if ((it + 2) != caps_vec.end() && *(it + 2) == "root_squash") {
+         root_squash = true;
+         // skip the extra token; the loop's += 2 then lands on the next group.
+         ++it;
+       }
+
+       if (cap.compare(0, 2, "rw") == 0)
+         osd_cap_wanted = "rw";
+
+       // flags after the first two characters must be known and strictly
+       // ascending.
+       char last='\0';
+       for (size_t i = 2; i < cap.size(); ++i) {
+         char c = cap.at(i);
+         if (last >= c) {
+           ss << "Permission flags (except 'rw') must be specified in alphabetical order.";
+           err = -EINVAL;
+           goto done;
+         }
+         switch (c) {
+         case 'p':
+           break;
+         case 's':
+           break;
+         default:
+           ss << "Unknown permission flag '" << c << "'.";
+           err = -EINVAL;
+           goto done;
+         }
+         // remember this flag; without this the ordering check above
+         // compares against '\0' forever and can never trigger.
+         last = c;
+       }
+
+       mds_cap_string += mds_cap_string.empty() ? "" : ", ";
+       mds_cap_string += "allow " + cap;
+
+       if (filesystem != "*" && filesystem != "all" && fs != nullptr) {
+         mds_cap_string += " fsname=" + string(fs->get_mds_map().get_fs_name());
+       }
+
+       if (path != "/") {
+         mds_cap_string += " path=" + path;
+       }
+
+       if (root_squash) {
+         mds_cap_string += " root_squash";
+       }
+     }
+
+     osd_cap_string += osd_cap_string.empty() ? "" : ", ";
+     osd_cap_string += "allow " + osd_cap_wanted
+       + " tag " + pg_pool_t::APPLICATION_NAME_CEPHFS
+       + " data=" + filesystem;
+
+     map<string, bufferlist> encoded_caps;
+     map<string, string> newcaps = {
+       {"mon", mon_cap_string},
+       {"osd", osd_cap_string},
+       {"mds", mds_cap_string}
+     };
+     if (err = _check_and_encode_caps(newcaps, encoded_caps, ss); err < 0) {
+       goto done;
+     }
+
+     EntityAuth entity_auth;
+     if (mon.key_server.get_auth(entity, entity_auth)) {
+       // existing entity: merge the new caps into the ones it holds.
+       int rv = _gen_wanted_caps(entity_auth, newcaps, ss);
+       ceph_assert(rv == CAPS_UPDATE_REQD or rv == CAPS_UPDATE_NOT_REQD or
+                   rv == CAPS_PARSING_ERR);
+       if (rv == CAPS_PARSING_ERR) {
+         // err is 0 here (the caps check above passed), so it has to be
+         // set explicitly or the failure would be reported as success.
+         err = -EINVAL;
+         goto done;
+       } else if (rv == CAPS_UPDATE_NOT_REQD) {
+         ss << "no update for caps of " << entity;
+         err = 0;
+         goto done;
+       }
+
+       dout(20) << "caps that will be enforced -" << dendl;
+       for (const auto& it : newcaps) {
+         dout(20) << it.first << " cap = \"" << it.second << "\"" << dendl;
+       }
+
+       err = _update_caps(entity, newcaps, op, ss, ds, &rdata, f.get());
+       if (err == 0) {
+         return true;
+       } else {
+         goto done;
+       }
+     }
+
+     err = _create_entity(entity, newcaps, op, ss, ds, &rdata, f.get());
+     if (err == 0) {
+       return true;
+     } else {
+       goto done;
+     }
+   } else if (prefix == "auth caps" && !entity_name.empty()) {
+     err = _update_caps(entity, ceph_caps, op, ss, ds, &rdata, f.get());
+     if (err == 0) {
+       return true;
+     } else {
+       goto done;
+     }
+   } else if ((prefix == "auth del" || prefix == "auth rm") &&
+              !entity_name.empty()) {
+     KeyServerData::Incremental auth_inc;
+     auth_inc.name = entity;
+     if (!mon.key_server.contains(auth_inc.name)) {
+       // removing an unknown entity is a successful no-op.
+       err = 0;
+       goto done;
+     }
+     auth_inc.op = KeyServerData::AUTH_INC_DEL;
+     push_cephx_inc(auth_inc);
+
+     wait_for_commit(op, new Monitor::C_Command(mon, op, 0, rs,
+                                                get_last_committed() + 1));
+     return true;
+   }
+ done:
+   rdata.append(ds);
+   getline(ss, rs, '\0');
+   mon.reply_command(op, err, rs, rdata, get_last_committed());
+   return false;
+ }
+
+ // Merge the cap string `new_cap_str` into `cur_cap_str` for the cap type
+ // `cap_entity`, using CAP_ENTITY_CLASS to parse and merge.
+ //
+ // newcaps[cap_entity] is set to the merged cap string when merge() reports
+ // a change, and to cur_cap_str when it does not. newcaps is left untouched
+ // if either string fails to parse; in that case a message is written to
+ // `out` and CAPS_PARSING_ERR is returned.
+ template<typename CAP_ENTITY_CLASS>
+ AuthMonitor::caps_update AuthMonitor::_merge_caps(const string& cap_entity,
+   const string& new_cap_str, const string& cur_cap_str,
+   map<string, string>& newcaps, ostream& out)
+ {
+   CAP_ENTITY_CLASS cur_cap, new_cap;
+
+   if (not cur_cap.parse(cur_cap_str, &out)) {
+     // the space before "caps" keeps the type from running into the word.
+     out << "error parsing " << cap_entity << " caps client already holds";
+     return CAPS_PARSING_ERR;
+   }
+   if (not new_cap.parse(new_cap_str, &out)) {
+     out << "error parsing new " << cap_entity << " caps";
+     return CAPS_PARSING_ERR;
+   }
+
+   if (cur_cap.merge(new_cap)) {
+     newcaps[cap_entity] = cur_cap.to_string();
+     return CAPS_UPDATE_REQD;
+   } else {
+     newcaps[cap_entity] = cur_cap_str;
+     return CAPS_UPDATE_NOT_REQD;
+   }
+ }
+
+ /* Generate the caps that should be present in the entity's auth keyring
+  * after running the "fs authorize" command. This is done by merging the
+  * caps already present in the client's auth keyring with the new caps
+  * provided by the user at "fs authorize" command.
+  *
+  * newcaps is updated in place with the merged cap strings.
+  *
+  * Returns CAPS_PARSING_ERR as soon as one cap fails to parse (message in
+  * out), CAPS_UPDATE_REQD if the entity holds no caps at all or at least
+  * one cap type is missing or changed by the merge, and
+  * CAPS_UPDATE_NOT_REQD otherwise.
+  */
+ AuthMonitor::caps_update AuthMonitor::_gen_wanted_caps(EntityAuth& e_auth,
+   map<string, string>& newcaps, ostream& out)
+ {
+   if (e_auth.caps.empty()) {
+     return CAPS_UPDATE_REQD;
+   }
+
+   // "update required" is sticky: a later cap type that needs no update
+   // must not cancel an earlier one that does.
+   bool update_reqd = false;
+
+   // new_cap_str is the new cap to be added to the current cap
+   for (const auto& [cap_entity, new_cap_str] : newcaps) {
+     const auto cur = e_auth.caps.find(cap_entity);
+     if (cur == e_auth.caps.end()) {
+       // the entity holds no cap of this type yet.
+       update_reqd = true;
+       continue;
+     }
+
+     string cur_cap_str; // current cap held by entity's auth keyring
+     auto iter = cur->second.cbegin();
+     decode(cur_cap_str, iter);
+     if (cur_cap_str == new_cap_str) {
+       continue;
+     }
+
+     // cap types other than mon/osd/mds are left as they are.
+     caps_update rv = CAPS_UPDATE_NOT_REQD;
+     if (cap_entity == "mon") {
+       rv = _merge_caps<MonCap>(cap_entity, new_cap_str,
+                                cur_cap_str, newcaps, out);
+     } else if (cap_entity == "osd") {
+       rv = _merge_caps<OSDCap>(cap_entity, new_cap_str,
+                                cur_cap_str, newcaps, out);
+     } else if (cap_entity == "mds") {
+       rv = _merge_caps<MDSAuthCaps>(cap_entity, new_cap_str,
+                                     cur_cap_str, newcaps, out);
+     }
+
+     if (rv == CAPS_PARSING_ERR) {
+       // fail right away so that a later cap type cannot mask the error.
+       return CAPS_PARSING_ERR;
+     }
+     if (rv == CAPS_UPDATE_REQD) {
+       update_reqd = true;
+     }
+   }
+
+   return update_reqd ? CAPS_UPDATE_REQD : CAPS_UPDATE_NOT_REQD;
+ }
+
+ // Encode keyring `kr` into rdata: as plain text when no formatter is
+ // given, otherwise through the formatter under the "auth" section. `caps`
+ // (if non-null) are attached to `entity` in the formatted case only.
+ void AuthMonitor::_encode_keyring(KeyRing& kr, const EntityName& entity,
+   bufferlist& rdata, Formatter* fmtr, map<string, bufferlist>* caps)
+ {
+   if (fmtr == nullptr) {
+     kr.encode_plaintext(rdata);
+     return;
+   }
+
+   if (caps != nullptr) {
+     kr.set_caps(entity, *caps);
+   }
+   kr.encode_formatted("auth", fmtr, rdata);
+ }
+
+ // Put `entity` into a temporary keyring and encode it into rdata via
+ // _encode_keyring(). With pending_key set, the keyring entry is built from
+ // the key and the pending key; otherwise from the whole EntityAuth.
+ void AuthMonitor::_encode_auth(const EntityName& entity,
+   const EntityAuth& eauth, bufferlist& rdata, Formatter* fmtr,
+   bool pending_key, map<string, bufferlist>* caps)
+ {
+   KeyRing keyring;
+
+   if (pending_key) {
+     keyring.add(entity, eauth.key, eauth.pending_key);
+   } else {
+     keyring.add(entity, eauth);
+   }
+
+   _encode_keyring(keyring, entity, rdata, fmtr, caps);
+ }
+
+ // Like _encode_auth(), but the keyring entry is built from the key alone
+ // (or the key plus the pending key when pending_key is set) rather than
+ // from the whole EntityAuth.
+ void AuthMonitor::_encode_key(const EntityName& entity,
+   const EntityAuth& eauth, bufferlist& rdata, Formatter* fmtr,
+   bool pending_key, map<string, bufferlist>* caps)
+ {
+   KeyRing keyring;
+
+   if (pending_key) {
+     keyring.add(entity, eauth.key, eauth.pending_key);
+   } else {
+     keyring.add(entity, eauth.key);
+   }
+
+   _encode_keyring(keyring, entity, rdata, fmtr, caps);
+ }
+
+ // Validate `caps` with valid_caps() and, if they pass, store the encoded
+ // form of every cap string in encoded_caps under the same key (replacing
+ // any entry already there). Returns -EINVAL when validation fails, with
+ // the reason in ss and encoded_caps untouched; 0 otherwise.
+ int AuthMonitor::_check_and_encode_caps(const map<string, string>& caps,
+   map<string, bufferlist>& encoded_caps, stringstream& ss)
+ {
+   const bool caps_ok = valid_caps(caps, &ss);
+   if (!caps_ok) {
+     return -EINVAL;
+   }
+
+   for (const auto& [cap_type, cap_str] : caps) {
+     bufferlist encoded;
+     encode(cap_str, encoded);
+     encoded_caps[cap_type] = encoded;
+   }
+
+   return 0;
+ }
+
+ // Update or create an entity, depending on the value of parameter
+ // create_entity.
+ //
+ // Pass both, rdata as well as fmtr, to enable printing of the key after
+ // update. rdata may be null, in which case nothing is encoded and the
+ // deferred reply carries no data.
+ //
+ // Returns -EEXIST when asked to create an entity that already exists,
+ // -ENOENT when asked to update one that does not, a negative error from
+ // _check_and_encode_caps() for bad caps, and 0 once the change has been
+ // queued and the reply deferred through wait_for_commit().
+ int AuthMonitor::_update_or_create_entity(const EntityName& entity,
+   const map<string, string>& caps, MonOpRequestRef op, stringstream& ss,
+   stringstream& ds, bufferlist* rdata, Formatter* fmtr, bool create_entity)
+ {
+   KeyServerData::Incremental auth_inc;
+   auth_inc.name = entity;
+
+   // for an update this also loads the current auth (key included) into
+   // auth_inc.auth, so only the caps get replaced below.
+   const bool exists = mon.key_server.get_auth(auth_inc.name, auth_inc.auth);
+
+   // if entity to be created is already present.
+   if (create_entity && exists) {
+     ss << "entity already exists " << auth_inc.name;
+     return -EEXIST;
+   }
+
+   // if entity to be updated is absent.
+   if (!create_entity && !exists) {
+     ss << "couldn't find entry " << auth_inc.name;
+     return -ENOENT;
+   }
+
+   map<string, bufferlist> encoded_caps;
+   if (auto err = _check_and_encode_caps(caps, encoded_caps, ss); err < 0) {
+     return err;
+   }
+
+   auth_inc.op = KeyServerData::AUTH_INC_ADD;
+   auth_inc.auth.caps = encoded_caps;
+   if (create_entity) {
+     auth_inc.auth.key.create(g_ceph_context, CEPH_CRYPTO_AES);
+   }
+
+   push_cephx_inc(auth_inc);
+
+   if (!create_entity) {
+     ss << "updated caps for " << auth_inc.name;
+   }
+
+   string rs;
+   getline(ss, rs);
+
+   if (rdata != nullptr) {
+     _encode_auth(entity, auth_inc.auth, *rdata, fmtr, false, &encoded_caps);
+     rdata->append(ds);
+     wait_for_commit(op, new Monitor::C_Command(mon, op, 0, rs,
+                                                *rdata, get_last_committed() + 1));
+   } else {
+     // no output buffer was supplied: reply without data instead of
+     // dereferencing a null pointer.
+     wait_for_commit(op, new Monitor::C_Command(mon, op, 0, rs,
+                                                get_last_committed() + 1));
+   }
+   return 0;
+ }
+
+ // Replace the caps of an existing entity; forwards to
+ // _update_or_create_entity() in update mode and returns its result.
+ int AuthMonitor::_update_caps(const EntityName& entity,
+   const map<string, string>& caps, MonOpRequestRef op, stringstream& ss,
+   stringstream& ds, bufferlist* rdata, Formatter* fmtr)
+ {
+   const bool create = false;
+   return _update_or_create_entity(entity, caps, op, ss, ds, rdata, fmtr,
+                                   create);
+ }
+
+ // Create a new entity with the given caps; forwards to
+ // _update_or_create_entity() in create mode and returns its result.
+ int AuthMonitor::_create_entity(const EntityName& entity,
+   const map<string, string>& caps, MonOpRequestRef op, stringstream& ss,
+   stringstream& ds, bufferlist* rdata, Formatter* fmtr)
+ {
+   const bool create = true;
+   return _update_or_create_entity(entity, caps, op, ss, ds, rdata, fmtr,
+                                   create);
+ }
+
+ // Prepare step for a global-id request: calls increase_max_global_id().
+ // `op` is not inspected. Always returns true.
+ bool AuthMonitor::prepare_global_id(MonOpRequestRef op)
+ {
+   dout(10) << "AuthMonitor::prepare_global_id" << dendl;
+
+   increase_max_global_id();
+
+   return true;
+ }
+
+ // Prepare step for an MMonUsedPendingKeys message: passes the message's
+ // used_pending_keys to process_used_pending_keys(). Always returns true.
+ bool AuthMonitor::prepare_used_pending_keys(MonOpRequestRef op)
+ {
+   dout(10) << __func__ << " " << op << dendl;
+
+   auto req = op->get_req<MMonUsedPendingKeys>();
+   process_used_pending_keys(req->used_pending_keys);
+
+   return true;
+ }
+
+ // Format 0 -> 1 upgrade: rewrite legacy mon caps into profiles.
+ //
+ //  - osd / mds entities whose mon cap is exactly "allow rwx" get
+ //    "allow profile <type>";
+ //  - client.bootstrap-osd and client.bootstrap-mds get their bootstrap
+ //    profile, provided they have a mon cap at all.
+ //
+ // Entities without a mon cap, or whose mon cap cannot be decoded, are
+ // skipped. Returns true if at least one entity was queued for update.
+ bool AuthMonitor::_upgrade_format_to_dumpling()
+ {
+   dout(1) << __func__ << " upgrading from format 0 to 1" << dendl;
+   ceph_assert(format_version == 0);
+
+   bool changed = false;
+   for (map<EntityName, EntityAuth>::iterator p = mon.key_server.secrets_begin();
+        p != mon.key_server.secrets_end();
+        ++p) {
+     // grab mon caps, if any
+     auto mon_cap = p->second.caps.find("mon");
+     if (mon_cap == p->second.caps.end()) {
+       continue;
+     }
+
+     string old_caps;
+     try {
+       auto it = mon_cap->second.cbegin();
+       decode(old_caps, it);
+     }
+     catch (const ceph::buffer::error&) {
+       dout(10) << __func__ << " unable to parse mon cap for "
+                << p->first << dendl;
+       continue;
+     }
+
+     const string n = p->first.to_str();
+     string new_caps;
+
+     if (n == "client.bootstrap-osd") {
+       // update bootstrap keys (these win over the daemon rule below)
+       new_caps = "allow profile bootstrap-osd";
+     } else if (n == "client.bootstrap-mds") {
+       new_caps = "allow profile bootstrap-mds";
+     } else if ((p->first.is_osd() || p->first.is_mds()) &&
+                old_caps == "allow rwx") {
+       // set daemon profiles
+       new_caps = string("allow profile ") + string(p->first.get_type_name());
+     }
+
+     if (new_caps.empty()) {
+       continue;
+     }
+
+     dout(5) << __func__ << " updating " << p->first << " mon cap from "
+             << old_caps << " to " << new_caps << dendl;
+
+     bufferlist bl;
+     encode(new_caps, bl);
+
+     KeyServerData::Incremental auth_inc;
+     auth_inc.op = KeyServerData::AUTH_INC_ADD;
+     auth_inc.name = p->first;
+     auth_inc.auth = p->second;
+     auth_inc.auth.caps["mon"] = bl;
+     push_cephx_inc(auth_inc);
+     changed = true;
+   }
+   return changed;
+ }
+
+ // Format 1 -> 2 upgrade:
+ //
+ //  - give existing entities a mgr cap: "allow *" for client.admin,
+ //    "allow profile <type>" for osd./mds./mon. daemons, and "allow r" for
+ //    anything else that has a mon cap;
+ //  - replace an "allow *" mon cap on mgr.* entities with
+ //    "allow profile mgr";
+ //  - create client.bootstrap-mgr if it does not exist yet.
+ //
+ // Both per-entity fix-ups are applied to one copy of the entity's auth and
+ // queued as a single add, so neither of them is lost.
+ //
+ // Returns true if anything was queued.
+ bool AuthMonitor::_upgrade_format_to_luminous()
+ {
+   dout(1) << __func__ << " upgrading from format 1 to 2" << dendl;
+   ceph_assert(format_version == 1);
+
+   bool changed = false;
+   map<EntityName, EntityAuth>::iterator p;
+   for (p = mon.key_server.secrets_begin();
+        p != mon.key_server.secrets_end();
+        ++p) {
+     string n = p->first.to_str();
+
+     // working copy that collects every change for this entity
+     EntityAuth auth = p->second;
+     bool modified = false;
+
+     string newcap;
+     if (n == "client.admin") {
+       // admin gets it all
+       newcap = "allow *";
+     } else if (n.find("osd.") == 0 ||
+                n.find("mds.") == 0 ||
+                n.find("mon.") == 0) {
+       // daemons follow their profile
+       string type = n.substr(0, 3);
+       newcap = "allow profile " + type;
+     } else if (p->second.caps.count("mon")) {
+       // if there are any mon caps, give them 'r' mgr caps
+       newcap = "allow r";
+     }
+
+     if (newcap.length() > 0) {
+       dout(5) << " giving " << n << " mgr '" << newcap << "'" << dendl;
+       bufferlist bl;
+       encode(newcap, bl);
+
+       auth.caps["mgr"] = bl;
+       modified = true;
+     }
+
+     if (n.find("mgr.") == 0 &&
+         p->second.caps.count("mon")) {
+       // the kraken ceph-mgr@.service set the mon cap to 'allow *'.
+       auto blp = p->second.caps["mon"].cbegin();
+       string oldcaps;
+       decode(oldcaps, blp);
+       if (oldcaps == "allow *") {
+         dout(5) << " fixing " << n << " mon cap to 'allow profile mgr'"
+                 << dendl;
+         bufferlist bl;
+         encode("allow profile mgr", bl);
+
+         // the corrected auth used to be built and then dropped: the
+         // unmodified p->second was what got added.
+         auth.caps["mon"] = bl;
+         modified = true;
+       }
+     }
+
+     if (modified) {
+       add_entity(p->first, auth);
+       changed = true;
+     }
+   }
+
+   // add bootstrap key if it does not already exist
+   // (might have already been get-or-create'd by
+   // ceph-create-keys)
+   EntityName bootstrap_mgr_name;
+   int r = bootstrap_mgr_name.from_str("client.bootstrap-mgr");
+   ceph_assert(r);
+   if (!mon.key_server.contains(bootstrap_mgr_name)) {
+
+     EntityName name = bootstrap_mgr_name;
+     EntityAuth auth;
+     encode("allow profile bootstrap-mgr", auth.caps["mon"]);
+     auth.key.create(g_ceph_context, CEPH_CRYPTO_AES);
+     add_entity(name, auth);
+     changed = true;
+   }
+   return changed;
+ }
+
+ // Format 2 -> 3 upgrade: add every entity produced by
+ // _generate_bootstrap_keys() that the key server does not contain yet.
+ // Returns true if at least one entity was added.
+ bool AuthMonitor::_upgrade_format_to_mimic()
+ {
+   dout(1) << __func__ << " upgrading from format 2 to 3" << dendl;
+   ceph_assert(format_version == 2);
+
+   list<pair<EntityName,EntityAuth> > bootstrap_keys;
+   _generate_bootstrap_keys(&bootstrap_keys);
+
+   bool changed = false;
+   for (auto& [name, auth] : bootstrap_keys) {
+     if (!mon.key_server.contains(name)) {
+       int err = add_entity(name, auth);
+       ceph_assert(err == 0);
+       changed = true;
+     }
+   }
+
+   return changed;
+ }
+
+ // Bring the stored auth format up to what the current quorum supports.
+ //
+ // The target format is derived from the quorum's mon features: DUMPLING
+ // without the luminous feature, LUMINOUS without the mimic feature, MIMIC
+ // otherwise. If format_version is below the target, the upgrade step that
+ // matches the current format_version is run and, if it changed anything,
+ // format_version is set to the target and the change is proposed.
+ void AuthMonitor::upgrade_format()
+ {
+   constexpr unsigned int FORMAT_NONE = 0;
+   constexpr unsigned int FORMAT_DUMPLING = 1;
+   constexpr unsigned int FORMAT_LUMINOUS = 2;
+   constexpr unsigned int FORMAT_MIMIC = 3;
+
+   // when upgrading from the current format to a new format, ensure that
+   // the new format doesn't break the older format. I.e., if a given format N
+   // changes or adds something, ensure that when upgrading from N-1 to N+1, we
+   // still observe the changes for format N if those have not been superseded
+   // by N+1.
+
+   unsigned int current;
+   if (!mon.get_quorum_mon_features().contains_all(
+         ceph::features::mon::FEATURE_LUMINOUS)) {
+     // pre-luminous quorum
+     current = FORMAT_DUMPLING;
+   } else if (!mon.get_quorum_mon_features().contains_all(
+                ceph::features::mon::FEATURE_MIMIC)) {
+     // pre-mimic quorum
+     current = FORMAT_LUMINOUS;
+   } else {
+     current = FORMAT_MIMIC;
+   }
+
+   if (format_version >= current) {
+     dout(20) << __func__ << " format " << format_version
+              << " is current" << dendl;
+     return;
+   }
+
+   // perform a rolling upgrade of the new format, if necessary.
+   // i.e., if we are moving from format NONE to MIMIC, we will first upgrade
+   // to DUMPLING, then to LUMINOUS, and finally to MIMIC, in several different
+   // proposals.
+
+   bool changed = false;
+   if (format_version == FORMAT_NONE) {
+     changed = _upgrade_format_to_dumpling();
+   } else if (format_version == FORMAT_DUMPLING) {
+     changed = _upgrade_format_to_luminous();
+   } else if (format_version == FORMAT_LUMINOUS) {
+     changed = _upgrade_format_to_mimic();
+   }
+
+   if (!changed) {
+     return;
+   }
+
+   // note new format
+   dout(10) << __func__ << " proposing update from format " << format_version
+            << " -> " << current << dendl;
+   format_version = current;
+   propose_pending();
+ }
+
+ // Dump the first/last committed versions and the number of secrets held
+ // by the key server into an "auth" object section of `f`.
+ void AuthMonitor::dump_info(Formatter *f)
+ {
+   /*** WARNING: do not include any privileged information here! ***/
+   f->open_object_section("auth");
+
+   // committed version range of this service
+   f->dump_unsigned("first_committed", get_first_committed());
+   f->dump_unsigned("last_committed", get_last_committed());
+
+   // only a count; no key material is emitted
+   f->dump_unsigned("num_secrets", mon.key_server.get_num_secrets());
+
+   f->close_section();
+ }
diff --git a/src/mon/AuthMonitor.h b/src/mon/AuthMonitor.h
new file mode 100644
index 000000000..0495e20c4
--- /dev/null
+++ b/src/mon/AuthMonitor.h
@@ -0,0 +1,283 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_AUTHMONITOR_H
+#define CEPH_AUTHMONITOR_H
+
+#include <map>
+#include <set>
+
+#include "global/global_init.h"
+#include "include/ceph_features.h"
+#include "include/types.h"
+#include "mon/PaxosService.h"
+#include "mon/MonitorDBStore.h"
+
+class MAuth;
+class KeyRing;
+class Monitor;
+
+#define MIN_GLOBAL_ID 0x1000 // NOTE(review): presumably the lowest global_id ever handed out; confirm against AuthMonitor.cc
+
+/**
+ * PaxosService implementation for the monitor's auth subsystem.
+ *
+ * This header declares the interface plus a few small inline helpers; the
+ * remaining member functions are defined out of line.
+ */
+class AuthMonitor : public PaxosService {
+
+public:
+  /// Outcome type returned by _gen_wanted_caps() and _merge_caps().
+  typedef enum {
+    CAPS_UPDATE_NOT_REQD, CAPS_UPDATE_REQD, CAPS_PARSING_ERR
+  } caps_update;
+
+  /// Selects which payload an Incremental carries.
+  enum IncType {
+    GLOBAL_ID,
+    AUTH_DATA,
+  };
+
+  /**
+   * One queued change. A GLOBAL_ID increment carries max_global_id; any
+   * other increment carries auth_type plus an already-encoded auth_data
+   * blob.
+   */
+  struct Incremental {
+    IncType inc_type;
+    uint64_t max_global_id;
+    uint32_t auth_type;
+    ceph::buffer::list auth_data;
+
+    Incremental() : inc_type(GLOBAL_ID), max_global_id(0), auth_type(0) {}
+
+    /// Encode the type tag followed by only the fields that belong to it.
+    /// @p features is part of the signature but is not consulted here.
+    void encode(ceph::buffer::list& bl, uint64_t features=-1) const {
+      using ceph::encode;
+      ENCODE_START(2, 2, bl);
+      __u32 _type = (__u32)inc_type;
+      encode(_type, bl);
+      if (_type == GLOBAL_ID) {
+        encode(max_global_id, bl);
+      } else {
+        encode(auth_type, bl);
+        encode(auth_data, bl);
+      }
+      ENCODE_FINISH(bl);
+    }
+
+    /// Inverse of encode(); asserts that the decoded type tag is a known
+    /// IncType before reading the type-specific fields.
+    void decode(ceph::buffer::list::const_iterator& bl) {
+      using ceph::decode;  // mirrors the using-declaration in encode()
+      DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
+      __u32 _type;
+      decode(_type, bl);
+      inc_type = (IncType)_type;
+      ceph_assert(inc_type >= GLOBAL_ID && inc_type <= AUTH_DATA);
+      if (_type == GLOBAL_ID) {
+        decode(max_global_id, bl);
+      } else {
+        decode(auth_type, bl);
+        decode(auth_data, bl);
+      }
+      DECODE_FINISH(bl);
+    }
+
+    /// Dump the scalar fields and the length (not the content) of auth_data.
+    void dump(ceph::Formatter *f) const {
+      f->dump_int("type", inc_type);
+      // max_global_id and the buffer length are unsigned; emit them as such
+      // so that large values are not rendered as negative numbers.
+      f->dump_unsigned("max_global_id", max_global_id);
+      f->dump_int("auth_type", auth_type);
+      f->dump_unsigned("auth_data_len", auth_data.length());
+    }
+
+    /// Append three sample instances to @p ls: a default one, a GLOBAL_ID
+    /// one and an AUTH_DATA one. The pointers are handed to the caller.
+    static void generate_test_instances(std::list<Incremental*>& ls) {
+      ls.push_back(new Incremental);
+      ls.push_back(new Incremental);
+      ls.back()->inc_type = GLOBAL_ID;
+      ls.back()->max_global_id = 1234;
+      ls.push_back(new Incremental);
+      ls.back()->inc_type = AUTH_DATA;
+      ls.back()->auth_type = 12;
+      ls.back()->auth_data.append("foo");
+    }
+  };
+
+  /// An entity name paired with its auth record.
+  struct auth_entity_t {
+    EntityName name;
+    EntityAuth auth;
+  };
+
+
+private:
+  std::vector<Incremental> pending_auth;  ///< increments queued by push_cephx_inc()
+  uint64_t max_global_id;
+  uint64_t last_allocated_id;
+
+  // these are protected by mon->auth_lock
+  int mon_num = 0, mon_rank = 0;
+
+  bool _upgrade_format_to_dumpling();
+  bool _upgrade_format_to_luminous();
+  bool _upgrade_format_to_mimic();
+  void upgrade_format() override;
+
+  void export_keyring(KeyRing& keyring);
+  int import_keyring(KeyRing& keyring);
+
+  /// Wrap @p auth_inc in an AUTH_DATA increment tagged CEPH_AUTH_CEPHX and
+  /// queue it on pending_auth.
+  void push_cephx_inc(KeyServerData::Incremental& auth_inc) {
+    Incremental inc;
+    inc.inc_type = AUTH_DATA;
+    encode(auth_inc, inc.auth_data);
+    inc.auth_type = CEPH_AUTH_CEPHX;
+    pending_auth.push_back(inc);
+  }
+
+  template<typename CAP_ENTITY_CLASS>
+  bool _was_parsing_fine(const std::string& entity, const std::string& caps,
+    std::ostream* out);
+  /* validate mon/osd/mgr/mds caps; fail on unrecognized service/type */
+  bool valid_caps(const std::string& entity, const std::string& caps,
+    std::ostream *out);
+  /// Decode a string from @p bl and validate it via the string overload.
+  /// A decode failure is reported through @p out and yields false.
+  bool valid_caps(const std::string& type, const ceph::buffer::list& bl, std::ostream *out) {
+    auto p = bl.begin();
+    std::string v;
+    try {
+      using ceph::decode;
+      decode(v, p);
+    } catch (const ceph::buffer::error&) {
+      *out << "corrupt capability encoding";
+      return false;
+    }
+    return valid_caps(type, v, out);
+  }
+  bool valid_caps(const std::map<std::string, std::string>& caps,
+    std::ostream *out);
+
+  void on_active() override;
+  bool should_propose(double& delay) override;
+  void get_initial_keyring(KeyRing *keyring);
+  void create_initial_keys(KeyRing *keyring);
+  void create_initial() override;
+  void update_from_paxos(bool *need_bootstrap) override;
+  void create_pending() override;  // prepare a new pending
+  bool prepare_global_id(MonOpRequestRef op);
+  bool _should_increase_max_global_id(); ///< called under mon->auth_lock
+  void increase_max_global_id();
+  uint64_t assign_global_id(bool should_increase_max);
+public:
+  uint64_t _assign_global_id(); ///< called under mon->auth_lock
+  void _set_mon_num_rank(int num, int rank); ///< called under mon->auth_lock
+
+private:
+  bool prepare_used_pending_keys(MonOpRequestRef op);
+
+  // propose pending update to peers
+  void encode_pending(MonitorDBStore::TransactionRef t) override;
+  void encode_full(MonitorDBStore::TransactionRef t) override;
+  version_t get_trim_to() const override;
+
+  bool preprocess_query(MonOpRequestRef op) override;  // true if processed.
+  bool prepare_update(MonOpRequestRef op) override;
+
+  bool prep_auth(MonOpRequestRef op, bool paxos_writable);
+
+  bool preprocess_command(MonOpRequestRef op);
+  bool prepare_command(MonOpRequestRef op);
+
+  void _encode_keyring(KeyRing& kr, const EntityName& entity,
+    bufferlist& rdata, Formatter* fmtr,
+    std::map<std::string, bufferlist>* wanted_caps=nullptr);
+  void _encode_auth(const EntityName& entity, const EntityAuth& eauth,
+    bufferlist& rdata, Formatter* fmtr, bool pending_key=false,
+    std::map<std::string, bufferlist>* caps=nullptr);
+  void _encode_key(const EntityName& entity, const EntityAuth& eauth,
+    bufferlist& rdata, Formatter* fmtr, bool pending_key=false,
+    std::map<std::string, bufferlist>* caps=nullptr);
+
+  int _check_and_encode_caps(const std::map<std::string, std::string>& caps,
+    std::map<std::string, bufferlist>& encoded_caps, std::stringstream& ss);
+
+  int _update_or_create_entity(const EntityName& entity,
+    const std::map<std::string, std::string>& caps, MonOpRequestRef op,
+    std::stringstream& ss, std::stringstream& ds, bufferlist* rdata=nullptr,
+    Formatter* fmtr=nullptr, bool create_entity=false);
+  int _create_entity(const EntityName& entity,
+    const std::map<std::string, std::string>& caps, MonOpRequestRef op,
+    std::stringstream& ss, std::stringstream& ds, bufferlist* rdata,
+    Formatter* fmtr);
+  int _update_caps(const EntityName& entity,
+    const std::map<std::string, std::string>& caps, MonOpRequestRef op,
+    std::stringstream& ss, std::stringstream& ds, bufferlist* rdata,
+    Formatter* fmtr);
+
+  caps_update _gen_wanted_caps(EntityAuth& e_auth,
+    std::map<std::string, std::string>& newcaps, std::ostream& out);
+  template<typename CAP_ENTITY_CLASS>
+  caps_update _merge_caps(const std::string& cap_entity,
+    const std::string& new_cap_str, const std::string& cur_cap_str,
+    std::map<std::string, std::string>& newcaps, std::ostream& out);
+
+  bool check_rotate();
+  void process_used_pending_keys(const std::map<EntityName,CryptoKey>& keys);
+
+  bool entity_is_pending(EntityName& entity);
+  int exists_and_matches_entity(
+    const auth_entity_t& entity,
+    bool has_secret,
+    std::stringstream& ss);
+  int exists_and_matches_entity(
+    const EntityName& name,
+    const EntityAuth& auth,
+    const std::map<std::string,ceph::buffer::list>& caps,
+    bool has_secret,
+    std::stringstream& ss);
+  int remove_entity(const EntityName &entity);
+  int add_entity(
+    const EntityName& name,
+    const EntityAuth& auth);
+
+ public:
+  AuthMonitor(Monitor &mn, Paxos &p, const std::string& service_name)
+    : PaxosService(mn, p, service_name),
+      max_global_id(0),
+      last_allocated_id(0)
+  {}
+
+  void pre_auth(MAuth *m);
+
+  void tick() override;  // check state, take actions
+
+  int validate_osd_destroy(
+    int32_t id,
+    const uuid_d& uuid,
+    EntityName& cephx_entity,
+    EntityName& lockbox_entity,
+    std::stringstream& ss);
+  void do_osd_destroy(
+    const EntityName& cephx_entity,
+    const EntityName& lockbox_entity);
+
+  int do_osd_new(
+    const auth_entity_t& cephx_entity,
+    const auth_entity_t& lockbox_entity,
+    bool has_lockbox);
+  int validate_osd_new(
+    int32_t id,
+    const uuid_d& uuid,
+    const std::string& cephx_secret,
+    const std::string& lockbox_secret,
+    auth_entity_t& cephx_entity,
+    auth_entity_t& lockbox_entity,
+    std::stringstream& ss);
+
+  void dump_info(ceph::Formatter *f);
+
+  /// @return true iff @p k is non-empty and decodes as a base64 key without
+  /// raising ceph::buffer::error.
+  bool is_valid_cephx_key(const std::string& k) {
+    if (k.empty())
+      return false;
+
+    EntityAuth ea;
+    try {
+      ea.key.decode_base64(k);
+      return true;
+    } catch (const ceph::buffer::error&) { /* fallthrough */ }
+    return false;
+  }
+};
+
+
+WRITE_CLASS_ENCODER_FEATURES(AuthMonitor::Incremental) // NOTE(review): macro defined elsewhere; presumably generates the free encode()/decode() wrappers for Incremental
+
+#endif
diff --git a/src/mon/CMakeLists.txt b/src/mon/CMakeLists.txt
new file mode 100644
index 000000000..3f2b86053
--- /dev/null
+++ b/src/mon/CMakeLists.txt
@@ -0,0 +1,46 @@
+set(lib_mon_srcs # source list for the static "mon" library defined at the bottom of this file
+ ${CMAKE_SOURCE_DIR}/src/auth/cephx/CephxKeyServer.cc
+ ${CMAKE_SOURCE_DIR}/src/auth/cephx/CephxServiceHandler.cc
+ ${CMAKE_SOURCE_DIR}/src/auth/AuthServiceHandler.cc
+ Paxos.cc
+ PaxosService.cc
+ OSDMonitor.cc
+ MDSMonitor.cc
+ CommandHandler.cc
+ FSCommands.cc
+ MgrMonitor.cc
+ MgrStatMonitor.cc
+ Monitor.cc
+ MonmapMonitor.cc
+ LogMonitor.cc
+ AuthMonitor.cc
+ ConfigMap.cc
+ ConfigMonitor.cc
+ Elector.cc
+ ElectionLogic.cc
+ ConnectionTracker.cc
+ HealthMonitor.cc
+ KVMonitor.cc
+ ../mds/MDSAuthCaps.cc
+ ../mgr/mgr_commands.cc
+ ../osd/OSDCap.cc
+ ../mgr/MgrCap.cc)
+
+if(TARGET mgr_cap_obj) # when that target exists, its object files are added to the list as well
+ list(APPEND lib_mon_srcs $<TARGET_OBJECTS:mgr_cap_obj>)
+endif()
+
+if(HAVE_GSSAPI) # KrbServiceHandler is only compiled when HAVE_GSSAPI is set
+ list(APPEND lib_mon_srcs
+ ${CMAKE_SOURCE_DIR}/src/auth/krb/KrbServiceHandler.cpp)
+endif()
+
+add_library(mon STATIC # static library built from everything collected in lib_mon_srcs
+ ${lib_mon_srcs})
+target_link_libraries(mon
+ kv
+ heap_profiler
+ ${FMT_LIB})
+if(WITH_JAEGER) # jaeger_base is linked only when WITH_JAEGER is set
+ target_link_libraries(mon jaeger_base)
+endif()
diff --git a/src/mon/CommandHandler.cc b/src/mon/CommandHandler.cc
new file mode 100644
index 000000000..903d35927
--- /dev/null
+++ b/src/mon/CommandHandler.cc
@@ -0,0 +1,43 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 Red Hat Ltd
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "CommandHandler.h"
+
+#include "common/strtol.h"
+#include "include/ceph_assert.h"
+
+#include <ostream>
+#include <string>
+#include <string_view>
+
+/**
+ * Interpret @p str as a boolean.
+ *
+ * "false", "no" and the integer 0 map to false; "true", "yes" and the
+ * integer 1 map to true.
+ *
+ * @param str     text to interpret
+ * @param result  receives the parsed value; must not be null (asserted)
+ * @param ss      receives an explanatory message when parsing fails
+ * @return 0 on success, -EINVAL if @p str is none of the accepted forms
+ */
+int CommandHandler::parse_bool(std::string_view str, bool* result, std::ostream& ss)
+{
+  ceph_assert(result != nullptr);
+
+  // A string_view is not guaranteed to be NUL-terminated, so give the
+  // integer parser an owned, terminated copy rather than the raw data()
+  // pointer (which could be read past the end of the view).
+  const std::string terminated(str);
+  std::string interr;
+  const int64_t n = strict_strtoll(terminated.c_str(), 10, &interr);
+  const bool is_integer = interr.empty();
+
+  if (str == "false" || str == "no" || (is_integer && n == 0)) {
+    *result = false;
+    return 0;
+  }
+  if (str == "true" || str == "yes" || (is_integer && n == 1)) {
+    *result = true;
+    return 0;
+  }
+
+  ss << "value must be false|no|0 or true|yes|1";
+  return -EINVAL;
+}
diff --git a/src/mon/CommandHandler.h b/src/mon/CommandHandler.h
new file mode 100644
index 000000000..167b4587f
--- /dev/null
+++ b/src/mon/CommandHandler.h
@@ -0,0 +1,35 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 Red Hat Ltd
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef COMMAND_HANDLER_H_
+#define COMMAND_HANDLER_H_
+
+#include <ostream>
+#include <string_view>
+
+class CommandHandler
+{
+public:
+ /**
+ * Parse a boolean from `str`; the accepted spellings are
+ * false|no|0 and true|yes|1.
+ * `result` must be non-null.
+ * `ss` will be populated with error message on error.
+ *
+ * @return 0 on success, else -EINVAL
+ */
+ int parse_bool(std::string_view str, bool* result, std::ostream& ss);
+};
+
+#endif
diff --git a/src/mon/ConfigMap.cc b/src/mon/ConfigMap.cc
new file mode 100644
index 000000000..86528c1de
--- /dev/null
+++ b/src/mon/ConfigMap.cc
@@ -0,0 +1,348 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <boost/algorithm/string/split.hpp>
+
+#include "ConfigMap.h"
+#include "crush/CrushWrapper.h"
+#include "common/entity_name.h"
+
+#define dout_subsys ceph_subsys_mon
+#undef dout_prefix
+#include "common/dout.h"
+
+using namespace std::literals;
+
+using std::cerr;
+using std::cout;
+using std::dec;
+using std::hex;
+using std::list;
+using std::map;
+using std::make_pair;
+using std::ostream;
+using std::ostringstream;
+using std::pair;
+using std::set;
+using std::setfill;
+using std::string;
+using std::stringstream;
+using std::to_string;
+using std::vector;
+using std::unique_ptr;
+
+using ceph::bufferlist;
+using ceph::decode;
+using ceph::encode;
+using ceph::Formatter;
+using ceph::JSONFormatter;
+using ceph::mono_clock;
+using ceph::mono_time;
+using ceph::timespan_str;
+
+/**
+ * Rank how specific this option's mask is; smaller means more precise.
+ *
+ * A location mask whose type is known to @p crush ranks at that type's id.
+ * Otherwise a device-class mask ranks at the number of crush type names,
+ * and anything else ranks one above that.
+ */
+int MaskedOption::get_precision(const CrushWrapper *crush)
+{
+  // 0 = most precise
+  if (!mask.location_type.empty()) {
+    const int type_id = crush->get_type_id(mask.location_type);
+    if (type_id >= 0) {
+      return type_id;
+    }
+    // unknown type name: fall through as if no location had been given
+  }
+
+  const int type_count = crush->get_num_type_names();
+  return mask.device_class.empty() ? type_count + 1 : type_count;
+}
+
+/**
+ * Dump the non-empty parts of the mask: location_type/location_value when a
+ * location type is set, device_class when a device class is set.
+ */
+void OptionMask::dump(Formatter *f) const
+{
+  const bool has_location = !location_type.empty();
+  const bool has_class = !device_class.empty();
+
+  if (has_location) {
+    f->dump_string("location_type", location_type);
+    f->dump_string("location_value", location_value);
+  }
+  if (has_class) {
+    f->dump_string("device_class", device_class);
+  }
+}
+
+/**
+ * Dump name, raw value, level, runtime-updatability and the mask (both as
+ * a single string and as individual fields) into @p f.
+ */
+void MaskedOption::dump(Formatter *f) const
+{
+  const Option& option = *opt;
+
+  f->dump_string("name", localized_name);
+  f->dump_string("value", raw_value);
+  f->dump_string("level", Option::level_to_str(option.level));
+  f->dump_bool("can_update_at_runtime", option.can_update_at_runtime());
+
+  // the mask appears twice: flattened, then field by field
+  f->dump_string("mask", mask.to_str());
+  mask.dump(f);
+}
+
+/// Print the option name followed by "@<type>=<value>" and/or
+/// "@class=<device class>" for whichever mask parts are set.
+ostream& operator<<(ostream& out, const MaskedOption& o)
+{
+  const OptionMask& m = o.mask;
+
+  out << o.localized_name;
+  if (!m.location_type.empty()) {
+    out << "@" << m.location_type << '=' << m.location_value;
+  }
+  if (!m.device_class.empty()) {
+    out << "@class=" << m.device_class;
+  }
+  return out;
+}
+
+// ----------
+
+/// Dump every option of this section as an object keyed by its name.
+void Section::dump(Formatter *f) const
+{
+  for (const auto& [opt_name, option] : options) {
+    f->dump_object(opt_name.c_str(), option);
+  }
+}
+
+/**
+ * Render the options flagged FLAG_NO_MON_UPDATE or FLAG_MINIMAL_CONF as
+ * tab-indented "name = value" lines. Options carrying a mask are emitted
+ * as a comment line instead of a real setting.
+ */
+std::string Section::get_minimal_conf() const
+{
+  std::string conf;
+  for (const auto& [opt_name, option] : options) {
+    const bool wanted =
+      option.opt->has_flag(Option::FLAG_NO_MON_UPDATE) ||
+      option.opt->has_flag(Option::FLAG_MINIMAL_CONF);
+    if (!wanted) {
+      continue;
+    }
+    conf += option.mask.empty() ? "\t" : "\t# masked option excluded: ";
+    conf += opt_name;
+    conf += " = ";
+    conf += option.raw_value;
+    conf += "\n";
+  }
+  return conf;
+}
+
+
+// ------------
+
+/**
+ * Dump the global section, then the per-type sections under "by_type" and
+ * the per-id sections under "by_id".
+ */
+void ConfigMap::dump(Formatter *f) const
+{
+  // by_type and by_id have the same shape, so one helper serves both
+  auto dump_sections = [f](const char *title, const auto& sections) {
+    f->open_object_section(title);
+    for (const auto& [sec_name, section] : sections) {
+      f->dump_object(sec_name.c_str(), section);
+    }
+    f->close_section();
+  };
+
+  f->dump_object("global", global);
+  dump_sections("by_type", by_type);
+  dump_sections("by_id", by_id);
+}
+
+/**
+ * Compute the effective option values for one entity.
+ *
+ * Sections are visited from least to most specific (global, daemon type,
+ * each '.'-separated name prefix, full name); a value found in a later
+ * section replaces one found earlier. Options whose mask does not match
+ * @p device_class or @p crush_location are ignored.
+ *
+ * @param name            entity to resolve the configuration for
+ * @param crush_location  crush type -> value map for the entity
+ * @param crush           used to rank location masks by precision
+ * @param device_class    device class of the entity (may be empty)
+ * @param src             if non-null, receives for every returned option
+ *                        the section and MaskedOption that supplied the
+ *                        value actually returned
+ * @return option name -> raw value
+ */
+std::map<std::string,std::string,std::less<>>
+ConfigMap::generate_entity_map(
+  const EntityName& name,
+  const map<std::string,std::string>& crush_location,
+  const CrushWrapper *crush,
+  const std::string& device_class,
+  std::unordered_map<std::string, ValueSource> *src)
+{
+  // global, then by type, then by name prefix component(s), then name.
+  // name prefix components are .-separated,
+  // e.g. client.a.b.c -> [global, client, client.a, client.a.b, client.a.b.c]
+  vector<pair<string,Section*>> sections = { make_pair("global", &global) };
+  if (auto t = by_type.find(name.get_type_name()); t != by_type.end()) {
+    sections.emplace_back(name.get_type_name(), &t->second);
+  }
+  vector<std::string> name_bits;
+  boost::split(name_bits, name.to_str(), [](char c){ return c == '.'; });
+  std::string tname;
+  for (unsigned bit = 0; bit < name_bits.size(); ++bit) {
+    if (bit) {
+      tname += '.';
+    }
+    tname += name_bits[bit];
+    if (auto q = by_id.find(tname); q != by_id.end()) {
+      sections.emplace_back(tname, &q->second);
+    }
+  }
+
+  std::map<std::string,std::string,std::less<>> out;
+  // last accepted option; only compared against while the name repeats
+  MaskedOption *prev = nullptr;
+  for (const auto& [section_name, section] : sections) {
+    for (auto& [opt_name, o] : section->options) {
+      // match against crush location, class
+      if (!o.mask.device_class.empty() &&
+          o.mask.device_class != device_class) {
+        continue;
+      }
+      if (!o.mask.location_type.empty()) {
+        auto loc = crush_location.find(o.mask.location_type);
+        if (loc == crush_location.end() ||
+            loc->second != o.mask.location_value) {
+          continue;
+        }
+      }
+      if (prev && prev->opt->name != opt_name) {
+        prev = nullptr;
+      }
+      // a less precise mask does not displace the entry accepted just before
+      if (prev &&
+          prev->get_precision(crush) < o.get_precision(crush)) {
+        continue;
+      }
+      out[opt_name] = o.raw_value;
+      if (src) {
+        // Must replace, not merely insert: emplace() would keep the source
+        // recorded for an earlier section even though the value in `out`
+        // has just been overridden.
+        src->insert_or_assign(opt_name,
+                              ConfigMap::ValueSource(section_name, &o));
+      }
+      prev = &o;
+    }
+  }
+  return out;
+}
+
+/**
+ * Split a '/'-separated "who" string into a section name and a mask.
+ *
+ * Each component is one of:
+ *  - "global"            -> *section = "global"
+ *  - "class:<name>"      -> mask->device_class
+ *  - "<type>:<value>"    -> mask->location_type / mask->location_value
+ *  - "<daemon>[.<id>]"   -> *section, provided the part before the first
+ *                           '.' is a recognised entity type
+ *
+ * @return false as soon as a component names an unrecognised entity type;
+ *         outputs written for earlier components are left in place.
+ */
+bool ConfigMap::parse_mask(
+  const std::string& who,
+  std::string *section,
+  OptionMask *mask)
+{
+  vector<std::string> parts;
+  boost::split(parts, who, [](char c){ return c == '/'; });
+  for (const auto& part : parts) {
+    if (part == "global") {
+      *section = "global";
+      continue;
+    }
+    if (const auto delim = part.find(':'); delim != std::string::npos) {
+      const string key = part.substr(0, delim);
+      if (key == "class") {
+        mask->device_class = part.substr(delim + 1);
+      } else {
+        mask->location_type = key;
+        mask->location_value = part.substr(delim + 1);
+      }
+      continue;
+    }
+    // the daemon type is everything before the first '.', or the whole
+    // component when there is no '.' (substr clamps npos to the length)
+    const string type = part.substr(0, part.find('.'));
+    if (EntityName::str_to_ceph_entity_type(type) == CEPH_ENTITY_TYPE_ANY) {
+      return false;
+    }
+    *section = part;
+  }
+  return true;
+}
+
+/**
+ * Split a stored key into the option name and the "who" prefix.
+ *
+ *  - no '/' at all:       name = key, who = ""
+ *  - contains "/mgr/":    who = text before it, name = "mgr/..." onwards
+ *  - otherwise:           split at the last '/'
+ *
+ * Both outputs are always assigned.
+ */
+void ConfigMap::parse_key(
+  const std::string& key,
+  std::string *name,
+  std::string *who)
+{
+  const auto last_slash = key.rfind('/');
+  if (last_slash == std::string::npos) {
+    *name = key;
+    // do not leave a stale value behind in the caller's string
+    who->clear();
+  } else if (auto mgrpos = key.find("/mgr/"); mgrpos != std::string::npos) {
+    *name = key.substr(mgrpos + 1);
+    *who = key.substr(0, mgrpos);
+  } else {
+    *name = key.substr(last_slash + 1);
+    *who = key.substr(0, last_slash);
+  }
+}
+
+/**
+ * Record one option value in the section selected by @p who.
+ *
+ * @param cct         context used for logging
+ * @param name        option name
+ * @param who         section/mask string understood by parse_mask(); empty
+ *                    selects the global section
+ * @param orig_value  raw value; pre_validate() is run on a copy, and a
+ *                    failure there is only logged
+ * @param get_opt     lookup returning the Option schema, or null if the
+ *                    name is unknown (a string-typed placeholder is then
+ *                    created and kept in stray_options)
+ * @return 0 on success; -EINVAL if @p who cannot be parsed or the option is
+ *         flagged FLAG_NO_MON_UPDATE (nothing is stored in either case)
+ */
+int ConfigMap::add_option(
+  CephContext *cct,
+  const std::string& name,
+  const std::string& who,
+  const std::string& orig_value,
+  std::function<const Option *(const std::string&)> get_opt)
+{
+  const Option *opt = get_opt(name);
+  if (!opt) {
+    ldout(cct, 10) << __func__ << " unrecognized option '" << name << "'" << dendl;
+    stray_options.push_back(
+      std::make_unique<Option>(name, Option::TYPE_STR, Option::LEVEL_UNKNOWN));
+    opt = stray_options.back().get();
+  }
+
+  string err;
+  string value = orig_value;
+  int r = opt->pre_validate(&value, &err);
+  if (r < 0) {
+    // logged only; the value is still stored below
+    ldout(cct, 10) << __func__ << " pre-validate failed on '" << name << "' = '"
+                   << value << "' for " << who << ": " << err << dendl;
+  }
+
+  int ret = 0;
+  MaskedOption mopt(opt);
+  mopt.raw_value = value;
+  mopt.localized_name = name;
+  string section_name;
+  if (who.size() &&
+      !ConfigMap::parse_mask(who, &section_name, &mopt.mask)) {
+    lderr(cct) << __func__ << " invalid mask for option " << name << " mask " << who
+               << dendl;
+    ret = -EINVAL;
+  } else if (opt->has_flag(Option::FLAG_NO_MON_UPDATE)) {
+    ldout(cct, 10) << __func__ << " NO_MON_UPDATE option '"
+                   << name << "' = '" << value << "' for " << who
+                   << dendl;
+    ret = -EINVAL;
+  } else {
+    Section *section = &global;
+    if (section_name.size() && section_name != "global") {
+      // a '.' marks a specific daemon ("osd.3"); otherwise it is a type
+      if (section_name.find('.') != std::string::npos) {
+        section = &by_id[section_name];
+      } else {
+        section = &by_type[section_name];
+      }
+    }
+    section->options.emplace(name, std::move(mopt));
+  }
+  return ret;
+}
+
+
+// --------------
+
+/**
+ * Dump version, timestamp and name, followed by a "changes" array with one
+ * object per changed key. previous_value / new_value are emitted only when
+ * the corresponding side of the change is present.
+ */
+void ConfigChangeSet::dump(Formatter *f) const
+{
+  f->dump_int("version", version);
+  f->dump_stream("timestamp") << stamp;
+  f->dump_string("name", name);
+
+  f->open_array_section("changes");
+  for (const auto& [key, change] : diff) {
+    const auto& [before, after] = change;
+    f->open_object_section("change");
+    f->dump_string("name", key);
+    if (before) {
+      f->dump_string("previous_value", *before);
+    }
+    if (after) {
+      f->dump_string("new_value", *after);
+    }
+    f->close_section();
+  }
+  f->close_section();
+}
+
+/**
+ * Print a diff-like text form: a "--- version --- stamp [--- name] ---"
+ * header, then "- key = old" / "+ key = new" lines for whichever sides of
+ * each change are present.
+ */
+void ConfigChangeSet::print(ostream& out) const
+{
+  out << "--- " << version << " --- " << stamp;
+  if (!name.empty()) {
+    out << " --- " << name;
+  }
+  out << " ---\n";
+
+  for (const auto& [key, change] : diff) {
+    const auto& [before, after] = change;
+    if (before) {
+      out << "- " << key << " = " << *before << "\n";
+    }
+    if (after) {
+      out << "+ " << key << " = " << *after << "\n";
+    }
+  }
+}
diff --git a/src/mon/ConfigMap.h b/src/mon/ConfigMap.h
new file mode 100644
index 000000000..5a14d089a
--- /dev/null
+++ b/src/mon/ConfigMap.h
@@ -0,0 +1,172 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <map>
+#include <optional>
+#include <ostream>
+#include <string>
+
+#include "include/utime.h"
+#include "common/options.h"
+#include "common/entity_name.h"
+
+class CrushWrapper;
+
+// the precedence is thus:
+//
+// global
+// crush location (coarse to fine, ordered by type id)
+// daemon type (e.g., osd)
+// device class (osd only)
+// crush location (coarse to fine, ordered by type id)
+// daemon name (e.g., mds.foo)
+//
+// Note that this means that if we have
+//
+// config/host:foo/a = 1
+// config/osd/rack:foo/a = 2
+//
+// then we get a = 2. The osd-level config wins, even though rack
+// is less precise than host, because the crush limiters are only
+// resolved within a section (global, per-daemon, per-instance).
+
+/// Restricts an option to a crush location and/or a device class.
+struct OptionMask {
+  std::string location_type, location_value; ///< matches crush_location
+  std::string device_class; ///< matches device class
+
+  /// True when no restriction at all is set.
+  bool empty() const {
+    return location_type.empty()
+      && location_value.empty()
+      && device_class.empty();
+  }
+
+  /// Text form: "<type>:<value>", "class:<name>", or both joined by '/'.
+  /// Empty when neither a location type nor a device class is set.
+  std::string to_str() const {
+    std::string text;
+    if (!location_type.empty()) {
+      text.append(location_type).append(":").append(location_value);
+    }
+    if (!device_class.empty()) {
+      if (!text.empty()) {
+        text.append("/");
+      }
+      text.append("class:").append(device_class);
+    }
+    return text;
+  }
+  void dump(ceph::Formatter *f) const;
+};
+
+/**
+ * A raw option value together with its schema and the mask restricting
+ * where it applies. Movable, but neither copy- nor move-assignable.
+ */
+struct MaskedOption {
+  std::string raw_value;               ///< raw, unparsed, unvalidated value
+  const Option *opt;                   ///< the option
+  OptionMask mask;
+  std::unique_ptr<const Option> unknown_opt;  ///< if fabricated for an unknown option
+  std::string localized_name;          ///< localized name for the option
+
+  /// @param o    schema for this option
+  /// @param fab  when true this object takes ownership of @p o
+  MaskedOption(const Option *o, bool fab=false) : opt(o) {
+    if (fab) {
+      unknown_opt.reset(o);
+    }
+  }
+  // Initialise every member directly from the source instead of
+  // default-constructing and then assigning (which also left `opt`
+  // indeterminate for a moment). Every member move is non-throwing, so the
+  // constructor is noexcept.
+  MaskedOption(MaskedOption&& o) noexcept
+    : raw_value(std::move(o.raw_value)),
+      opt(o.opt),
+      mask(std::move(o.mask)),
+      unknown_opt(std::move(o.unknown_opt)),
+      localized_name(std::move(o.localized_name)) {}
+  const MaskedOption& operator=(const MaskedOption& o) = delete;
+  const MaskedOption& operator=(MaskedOption&& o) = delete;
+
+  /// return a precision metric (smaller is more precise)
+  int get_precision(const CrushWrapper *crush);
+
+  friend std::ostream& operator<<(std::ostream& out, const MaskedOption& o);
+
+  void dump(ceph::Formatter *f) const;
+};
+
+/// The options of one configuration section. A multimap is used, so an
+/// option name may appear more than once.
+struct Section {
+  std::multimap<std::string,MaskedOption> options;
+
+  /// Drop every option of this section.
+  void clear() { options.clear(); }
+
+  void dump(ceph::Formatter *f) const;
+  std::string get_minimal_conf() const;
+};
+
+/**
+ * All configuration sections: the global one, one per daemon type and one
+ * per daemon id, plus storage for placeholder Option objects created for
+ * unrecognised names.
+ */
+struct ConfigMap {
+  /// Records where an effective value came from.
+  struct ValueSource {
+    std::string section;
+    const MaskedOption *option = nullptr;
+    ValueSource() {}
+    ValueSource(const std::string& s, const MaskedOption *o)
+      : section(s), option(o) {}
+  };
+
+  Section global;
+  std::map<std::string,Section, std::less<>> by_type;
+  std::map<std::string,Section, std::less<>> by_id;
+  std::list<std::unique_ptr<Option>> stray_options;
+
+  /// Look @p name up as "global", then as a type section, then as an id
+  /// section. @return the section, or nullptr when nothing matches.
+  Section *find_section(const std::string& name) {
+    if (name == "global") {
+      return &global;
+    }
+    if (auto t = by_type.find(name); t != by_type.end()) {
+      return &t->second;
+    }
+    if (auto d = by_id.find(name); d != by_id.end()) {
+      return &d->second;
+    }
+    return nullptr;
+  }
+
+  /// Empty every section and release the placeholder options.
+  void clear() {
+    global.clear();
+    by_type.clear();
+    by_id.clear();
+    stray_options.clear();
+  }
+  void dump(ceph::Formatter *f) const;
+
+  std::map<std::string,std::string,std::less<>> generate_entity_map(
+    const EntityName& name,
+    const std::map<std::string,std::string>& crush_location,
+    const CrushWrapper *crush,
+    const std::string& device_class,
+    std::unordered_map<std::string,ValueSource> *src = nullptr);
+
+  void parse_key(
+    const std::string& key,
+    std::string *name,
+    std::string *who);
+  static bool parse_mask(
+    const std::string& in,
+    std::string *section,
+    OptionMask *mask);
+
+  int add_option(
+    CephContext *cct,
+    const std::string& name,
+    const std::string& who,
+    const std::string& value,
+    std::function<const Option *(const std::string&)> get_opt);
+};
+
+
+/// One recorded configuration change: version, timestamp, a free-form name
+/// and the per-key old/new values.
+struct ConfigChangeSet {
+  // value-initialised so dump()/print() never read an indeterminate number
+  version_t version = 0;
+  utime_t stamp;
+  std::string name;
+
+  // key -> (old value, new value)
+  std::map<std::string,std::pair<std::optional<std::string>,std::optional<std::string>>> diff;
+
+  void dump(ceph::Formatter *f) const;
+  void print(std::ostream& out) const;
+};
diff --git a/src/mon/ConfigMonitor.cc b/src/mon/ConfigMonitor.cc
new file mode 100644
index 000000000..648067492
--- /dev/null
+++ b/src/mon/ConfigMonitor.cc
@@ -0,0 +1,981 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <boost/algorithm/string/predicate.hpp>
+
+#include "mon/Monitor.h"
+#include "mon/ConfigMonitor.h"
+#include "mon/KVMonitor.h"
+#include "mon/MgrMonitor.h"
+#include "mon/OSDMonitor.h"
+#include "messages/MConfig.h"
+#include "messages/MGetConfig.h"
+#include "messages/MMonCommand.h"
+#include "common/Formatter.h"
+#include "common/TextTable.h"
+#include "common/cmdparse.h"
+#include "include/stringify.h"
+
+#define dout_subsys ceph_subsys_mon
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, mon, this)
+using namespace TOPNSPC::common;
+
+using namespace std::literals;
+
+using std::cerr;
+using std::cout;
+using std::dec;
+using std::hex;
+using std::list;
+using std::map;
+using std::make_pair;
+using std::ostream;
+using std::ostringstream;
+using std::pair;
+using std::set;
+using std::setfill;
+using std::string;
+using std::stringstream;
+using std::to_string;
+using std::vector;
+using std::unique_ptr;
+
+using ceph::bufferlist;
+using ceph::decode;
+using ceph::encode;
+using ceph::Formatter;
+using ceph::JSONFormatter;
+using ceph::mono_clock;
+using ceph::mono_time;
+using ceph::timespan_str;
+/// Log-line prefix used by dout in this file:
+/// "mon.<name>@<rank>(<state>).config ". @p hmon is unused.
+static ostream& _prefix(std::ostream *_dout, const Monitor &mon,
+                        const ConfigMonitor *hmon) {
+  std::ostream& os = *_dout;
+  os << "mon." << mon.name << "@" << mon.rank
+     << "(" << mon.get_state_name() << ").config ";
+  return os;
+}
+
+const string KEY_PREFIX("config/"); // prepended to an option key to form the key enqueued on the KVMonitor
+const string HISTORY_PREFIX("config-history/"); // root of the per-version history keys written by encode_pending_to_kvmon()
+
+/// Forwards all arguments to the PaxosService base; no other setup is done.
+ConfigMonitor::ConfigMonitor(Monitor &m, Paxos &p, const string& service_name)
+  : PaxosService(m, p, service_name)
+{}
+
+/// Only logs that it was called.
+void ConfigMonitor::init()
+{
+  dout(10) << __func__ << dendl;
+}
+
+/// Start from version 0 with an empty pending state.
+void ConfigMonitor::create_initial()
+{
+  dout(10) << __func__ << dendl;
+
+  version = 0;
+  create_pending();
+}
+
+/**
+ * Catch up with the last committed version: when it differs from ours,
+ * adopt it, reload the configuration and re-check all subscriptions.
+ * @p need_bootstrap is not touched.
+ */
+void ConfigMonitor::update_from_paxos(bool *need_bootstrap)
+{
+  const auto committed = get_last_committed();
+  if (version == committed) {
+    return;  // already up to date
+  }
+
+  version = committed;
+  dout(10) << __func__ << " " << version << dendl;
+  load_config();
+  check_all_subs();
+}
+
+/// Reset everything accumulated for the next proposal: pending changes,
+/// pending cleanups and the pending description.
+void ConfigMonitor::create_pending()
+{
+  dout(10) << " " << version << dendl;
+
+  pending.clear();
+  pending_cleanup.clear();
+  pending_description.clear();
+}
+
+/// Record version+1 as the last committed version in @p t. The config
+/// payload itself is not written here.
+void ConfigMonitor::encode_pending(MonitorDBStore::TransactionRef t)
+{
+  const auto next = version + 1;
+  dout(10) << " " << next << dendl;
+  put_last_committed(t, next);
+  // NOTE: caller should have done encode_pending_to_kvmon() and
+  // kvmon->propose_pending() to commit the actual config changes.
+}
+
+/**
+ * Hand the pending changes to the KVMonitor.
+ *
+ * Cleanup entries are folded into `pending` first (explicit pending
+ * changes win). A history record for version+1 is then written: a metadata
+ * entry (timestamp + description), a "-<key>" entry with the old value of
+ * every key that changes or disappears, and a "+<key>" entry with every
+ * new value. Entries that would not change anything are skipped.
+ */
+void ConfigMonitor::encode_pending_to_kvmon()
+{
+  // we need to pass our data through KVMonitor so that it is properly
+  // versioned and shared with subscribers.
+  for (auto& [key, value] : pending_cleanup) {
+    if (pending.count(key) != 0) {
+      continue;  // an explicit pending change takes precedence
+    }
+    derr << __func__ << " repair: adjusting config key '" << key << "'"
+         << dendl;
+    pending[key] = value;
+  }
+  pending_cleanup.clear();
+
+  // TODO: record changed sections (osd, mds.foo, rack:bar, ...)
+
+  const string history = HISTORY_PREFIX + stringify(version+1) + "/";
+  {
+    bufferlist metabl;
+    ::encode(ceph_clock_now(), metabl);
+    ::encode(pending_description, metabl);
+    mon.kvmon()->enqueue_set(history, metabl);
+  }
+
+  for (auto& [name, value] : pending) {
+    const string key = KEY_PREFIX + name;
+    const auto cur = current.find(name);
+    const bool exists = (cur != current.end());
+
+    if (exists) {
+      if (value && *value == cur->second) {
+        continue;  // set to the value it already has
+      }
+      mon.kvmon()->enqueue_set(history + "-" + name, cur->second);
+    } else if (!value) {
+      continue;  // removal of a key that is not there
+    }
+
+    if (value) {
+      dout(20) << __func__ << " set " << key << dendl;
+      mon.kvmon()->enqueue_set(key, *value);
+      mon.kvmon()->enqueue_set(history + "+" + name, *value);
+    } else {
+      dout(20) << __func__ << " rm " << key << dendl;
+      mon.kvmon()->enqueue_rm(key);
+    }
+  }
+}
+
+/// @return version - 5 once the version exceeds 5, otherwise 0.
+version_t ConfigMonitor::get_trim_to() const
+{
+  // we don't actually need *any* old states, but keep a few.
+  return version > 5 ? version - 5 : 0;
+}
+
+/**
+ * Dispatch MSG_MON_COMMAND requests to preprocess_command(). A bad_cmd_get
+ * raised while handling one is answered with -EINVAL and the exception
+ * text, and counts as handled.
+ *
+ * @return the result of preprocess_command() for commands (true after a
+ *         bad_cmd_get), false for every other message type
+ */
+bool ConfigMonitor::preprocess_query(MonOpRequestRef op)
+{
+  if (op->get_req()->get_type() != MSG_MON_COMMAND) {
+    return false;
+  }
+
+  try {
+    return preprocess_command(op);
+  } catch (const bad_cmd_get& e) {
+    bufferlist bl;
+    mon.reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
+    return true;
+  }
+}
+
+bool ConfigMonitor::preprocess_command(MonOpRequestRef op)
+{
+ auto m = op->get_req<MMonCommand>();
+ std::stringstream ss;
+ int err = 0;
+
+ cmdmap_t cmdmap;
+ if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
+ string rs = ss.str();
+ mon.reply_command(op, -EINVAL, rs, get_last_committed());
+ return true;
+ }
+ string format = cmd_getval_or<string>(cmdmap, "format", "plain");
+ boost::scoped_ptr<Formatter> f(Formatter::create(format));
+
+ string prefix;
+ cmd_getval(cmdmap, "prefix", prefix);
+
+ bufferlist odata;
+ if (prefix == "config help") {
+ stringstream ss;
+ string name;
+ cmd_getval(cmdmap, "key", name);
+ name = ConfFile::normalize_key_name(name);
+ const Option *opt = g_conf().find_option(name);
+ if (!opt) {
+ opt = mon.mgrmon()->find_module_option(name);
+ }
+ if (opt) {
+ if (f) {
+ f->dump_object("option", *opt);
+ } else {
+ opt->print(&ss);
+ }
+ } else {
+ ss << "configuration option '" << name << "' not recognized";
+ err = -ENOENT;
+ goto reply;
+ }
+ if (f) {
+ f->flush(odata);
+ } else {
+ odata.append(ss.str());
+ }
+ } else if (prefix == "config ls") {
+ ostringstream ss;
+ if (f) {
+ f->open_array_section("options");
+ }
+ for (auto& i : ceph_options) {
+ if (f) {
+ f->dump_string("option", i.name);
+ } else {
+ ss << i.name << "\n";
+ }
+ }
+ for (auto& i : mon.mgrmon()->get_mgr_module_options()) {
+ if (f) {
+ f->dump_string("option", i.first);
+ } else {
+ ss << i.first << "\n";
+ }
+ }
+ if (f) {
+ f->close_section();
+ f->flush(odata);
+ } else {
+ odata.append(ss.str());
+ }
+ } else if (prefix == "config dump") {
+ list<pair<string,Section*>> sections = {
+ make_pair("global", &config_map.global)
+ };
+ for (string type : { "mon", "mgr", "osd", "mds", "client" }) {
+ auto i = config_map.by_type.find(type);
+ if (i != config_map.by_type.end()) {
+ sections.push_back(make_pair(i->first, &i->second));
+ }
+ auto j = config_map.by_id.lower_bound(type);
+ while (j != config_map.by_id.end() &&
+ j->first.find(type) == 0) {
+ sections.push_back(make_pair(j->first, &j->second));
+ ++j;
+ }
+ }
+ TextTable tbl;
+ if (!f) {
+ tbl.define_column("WHO", TextTable::LEFT, TextTable::LEFT);
+ tbl.define_column("MASK", TextTable::LEFT, TextTable::LEFT);
+ tbl.define_column("LEVEL", TextTable::LEFT, TextTable::LEFT);
+ tbl.define_column("OPTION", TextTable::LEFT, TextTable::LEFT);
+ tbl.define_column("VALUE", TextTable::LEFT, TextTable::LEFT);
+ tbl.define_column("RO", TextTable::LEFT, TextTable::LEFT);
+ } else {
+ f->open_array_section("config");
+ }
+ for (auto& [sec_name, section] : sections) {
+ for (auto& [opt_name, masked_opt] : section->options) {
+ if (!f) {
+ tbl << sec_name;
+ tbl << masked_opt.mask.to_str();
+ tbl << Option::level_to_str(masked_opt.opt->level);
+ tbl << opt_name;
+ tbl << masked_opt.raw_value;
+ tbl << (masked_opt.opt->can_update_at_runtime() ? "" : "*");
+ tbl << TextTable::endrow;
+ } else {
+ f->open_object_section("option");
+ f->dump_string("section", sec_name);
+ masked_opt.dump(f.get());
+ f->close_section();
+ }
+ }
+ }
+ if (!f) {
+ odata.append(stringify(tbl));
+ } else {
+ f->close_section();
+ f->flush(odata);
+ }
+ } else if (prefix == "config get") {
+ string who, name;
+ cmd_getval(cmdmap, "who", who);
+
+ EntityName entity;
+ if (!entity.from_str(who) &&
+ !entity.from_str(who + ".")) {
+ ss << "unrecognized entity '" << who << "'";
+ err = -EINVAL;
+ goto reply;
+ }
+
+ map<string,string> crush_location;
+ string device_class;
+ if (entity.is_osd()) {
+ mon.osdmon()->osdmap.crush->get_full_location(who, &crush_location);
+ int id = atoi(entity.get_id().c_str());
+ const char *c = mon.osdmon()->osdmap.crush->get_item_class(id);
+ if (c) {
+ device_class = c;
+ }
+ dout(10) << __func__ << " crush_location " << crush_location
+ << " class " << device_class << dendl;
+ }
+
+ std::unordered_map<std::string,ConfigMap::ValueSource> src;
+ auto config = config_map.generate_entity_map(
+ entity,
+ crush_location,
+ mon.osdmon()->osdmap.crush.get(),
+ device_class,
+ &src);
+
+ if (cmd_getval(cmdmap, "key", name)) {
+ name = ConfFile::normalize_key_name(name);
+ const Option *opt = g_conf().find_option(name);
+ if (!opt) {
+ opt = mon.mgrmon()->find_module_option(name);
+ }
+ if (!opt) {
+ ss << "unrecognized key '" << name << "'";
+ err = -ENOENT;
+ goto reply;
+ }
+ if (opt->has_flag(Option::FLAG_NO_MON_UPDATE)) {
+ // handle special options
+ if (name == "fsid") {
+ odata.append(stringify(mon.monmap->get_fsid()));
+ odata.append("\n");
+ goto reply;
+ }
+ err = -EINVAL;
+ ss << name << " is special and cannot be stored by the mon";
+ goto reply;
+ }
+ // get a single value
+ auto p = config.find(name);
+ if (p != config.end()) {
+ odata.append(p->second);
+ odata.append("\n");
+ goto reply;
+ }
+ if (!entity.is_client() &&
+ opt->daemon_value != Option::value_t{}) {
+ odata.append(Option::to_str(opt->daemon_value));
+ } else {
+ odata.append(Option::to_str(opt->value));
+ }
+ odata.append("\n");
+ } else {
+ // dump all (non-default) values for this entity
+ TextTable tbl;
+ if (!f) {
+ tbl.define_column("WHO", TextTable::LEFT, TextTable::LEFT);
+ tbl.define_column("MASK", TextTable::LEFT, TextTable::LEFT);
+ tbl.define_column("LEVEL", TextTable::LEFT, TextTable::LEFT);
+ tbl.define_column("OPTION", TextTable::LEFT, TextTable::LEFT);
+ tbl.define_column("VALUE", TextTable::LEFT, TextTable::LEFT);
+ tbl.define_column("RO", TextTable::LEFT, TextTable::LEFT);
+ } else {
+ f->open_object_section("config");
+ }
+ auto p = config.begin();
+ auto q = src.begin();
+ for (; p != config.end(); ++p, ++q) {
+ if (name.size() && p->first != name) {
+ continue;
+ }
+ if (!f) {
+ tbl << q->second.section;
+ tbl << q->second.option->mask.to_str();
+ tbl << Option::level_to_str(q->second.option->opt->level);
+ tbl << p->first;
+ tbl << p->second;
+ tbl << (q->second.option->opt->can_update_at_runtime() ? "" : "*");
+ tbl << TextTable::endrow;
+ } else {
+ f->open_object_section(p->first.c_str());
+ f->dump_string("value", p->second);
+ f->dump_string("section", q->second.section);
+ f->dump_object("mask", q->second.option->mask);
+ f->dump_bool("can_update_at_runtime",
+ q->second.option->opt->can_update_at_runtime());
+ f->close_section();
+ }
+ }
+ if (!f) {
+ odata.append(stringify(tbl));
+ } else {
+ f->close_section();
+ f->flush(odata);
+ }
+ }
+ } else if (prefix == "config log") {
+ int64_t num = 10;
+ cmd_getval(cmdmap, "num", num);
+ ostringstream ds;
+ if (f) {
+ f->open_array_section("changesets");
+ }
+ for (version_t v = version; v > version - std::min(version, (version_t)num); --v) {
+ ConfigChangeSet ch;
+ load_changeset(v, &ch);
+ if (f) {
+ f->dump_object("changeset", ch);
+ } else {
+ ch.print(ds);
+ }
+ }
+ if (f) {
+ f->close_section();
+ f->flush(odata);
+ } else {
+ odata.append(ds.str());
+ }
+ } else if (prefix == "config generate-minimal-conf") {
+ ostringstream conf;
+ conf << "# minimal ceph.conf for " << mon.monmap->get_fsid() << "\n";
+
+ // the basics
+ conf << "[global]\n";
+ conf << "\tfsid = " << mon.monmap->get_fsid() << "\n";
+ conf << "\tmon_host = ";
+ for (auto i = mon.monmap->mon_info.begin();
+ i != mon.monmap->mon_info.end();
+ ++i) {
+ if (i != mon.monmap->mon_info.begin()) {
+ conf << " ";
+ }
+ if (i->second.public_addrs.size() == 1 &&
+ i->second.public_addrs.front().is_legacy() &&
+ i->second.public_addrs.front().get_port() == CEPH_MON_PORT_LEGACY) {
+ // if this is a legacy addr on the legacy default port, then
+ // use the legacy-compatible formatting so that old clients
+ // can use this config. new code will see the :6789 and correctly
+ // interpret this as a v1 address.
+ conf << i->second.public_addrs.get_legacy_str();
+ } else {
+ conf << i->second.public_addrs;
+ }
+ }
+ conf << "\n";
+ conf << config_map.global.get_minimal_conf();
+ for (auto m : { &config_map.by_type, &config_map.by_id }) {
+ for (auto& i : *m) {
+ auto s = i.second.get_minimal_conf();
+ if (s.size()) {
+ conf << "\n[" << i.first << "]\n" << s;
+ }
+ }
+ }
+ odata.append(conf.str());
+ err = 0;
+ } else {
+ return false;
+ }
+
+ reply:
+ mon.reply_command(op, err, ss.str(), odata, get_last_committed());
+ return true;
+}
+
+ // Answer an MGetConfig request: build the effective configuration for the
+ // requesting entity (the message's name), using the CRUSH location of the
+ // message's host and its device_class, and send it back as an MConfig on
+ // the connection the request arrived on.
+ void ConfigMonitor::handle_get_config(MonOpRequestRef op)
+ {
+   auto req = op->get_req<MGetConfig>();
+   dout(10) << __func__ << " " << req->name << " host " << req->host << dendl;
+
+   const OSDMap& osdmap = mon.osdmon()->osdmap;
+
+   // resolve the requester's host into its CRUSH location
+   map<string,string> host_location;
+   osdmap.crush->get_full_location(req->host, &host_location);
+
+   auto entity_config = config_map.generate_entity_map(
+     req->name, host_location, osdmap.crush.get(), req->device_class);
+   dout(20) << " config is " << entity_config << dendl;
+
+   req->get_connection()->send_message(new MConfig{std::move(entity_config)});
+ }
+
+ // Dispatch a mutating request.  Only MSG_MON_COMMAND is handled; it is
+ // forwarded to prepare_command().  A bad_cmd_get thrown while handling the
+ // command is reported to the client as -EINVAL (with the exception text)
+ // and true is returned.  Every other message type yields false.
+ bool ConfigMonitor::prepare_update(MonOpRequestRef op)
+ {
+   Message *m = op->get_req();
+   dout(7) << "prepare_update " << *m
+           << " from " << m->get_orig_source_inst() << dendl;
+   if (m->get_type() != MSG_MON_COMMAND) {
+     return false;
+   }
+   try {
+     return prepare_command(op);
+   } catch (const bad_cmd_get& e) {
+     bufferlist empty;
+     mon.reply_command(op, -EINVAL, e.what(), empty, get_last_committed());
+     return true;
+   }
+ }
+
+ // Handle the mutating "config ..." commands.
+ //
+ //   "config set" / "config rm"  stage one key in `pending` (a value for set,
+ //                               an empty optional for rm)
+ //   "config reset"              walk the changesets newer than "num" from
+ //                               newest to oldest and stage, per key, the
+ //                               recorded '-' value (or a removal when only a
+ //                               '+' value was recorded)
+ //   "config assimilate-conf"    parse the message payload as a conf file,
+ //                               stage the options that can be taken over and
+ //                               return the remaining ones as reply data
+ //
+ // Staged entries that would not change `current` are dropped.  If anything
+ // is left it is proposed immediately (together with the KV monitor) and the
+ // client is answered once the commit completes.
+ //
+ // Returns true when a proposal was started, or when the command JSON could
+ // not be parsed (the error reply has been sent already).  Returns false when
+ // the op was parked until the KV monitor is writeable, or when it was
+ // answered right away without any change being proposed.
+ bool ConfigMonitor::prepare_command(MonOpRequestRef op)
+ {
+   auto m = op->get_req<MMonCommand>();
+   std::stringstream ss;
+   int err = -EINVAL;
+
+   // make sure kv is writeable.
+   if (!mon.kvmon()->is_writeable()) {
+     dout(10) << __func__ << " waiting for kv mon to be writeable" << dendl;
+     mon.kvmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
+     return false;
+   }
+
+   cmdmap_t cmdmap;
+   if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
+     string rs = ss.str();
+     mon.reply_command(op, -EINVAL, rs, get_last_committed());
+     return true;
+   }
+
+   string prefix;
+   cmd_getval(cmdmap, "prefix", prefix);
+   bufferlist odata;
+
+   if (prefix == "config set" ||
+       prefix == "config rm") {
+     string who;
+     string name, value;
+     bool force = false;
+     cmd_getval(cmdmap, "who", who);
+     cmd_getval(cmdmap, "name", name);
+     cmd_getval(cmdmap, "value", value);
+     cmd_getval(cmdmap, "force", force);
+     name = ConfFile::normalize_key_name(name);
+
+     // validation only applies to a non-forced "config set"
+     if (prefix == "config set" && !force) {
+       const Option *opt = g_conf().find_option(name);
+       if (!opt) {
+         opt = mon.mgrmon()->find_module_option(name);
+       }
+       if (!opt) {
+         ss << "unrecognized config option '" << name << "'";
+         err = -EINVAL;
+         goto reply;
+       }
+
+       // parse_value() also rewrites `value` through its last argument
+       Option::value_t real_value;
+       string errstr;
+       err = opt->parse_value(value, &real_value, &errstr, &value);
+       if (err < 0) {
+         ss << "error parsing value: " << errstr;
+         goto reply;
+       }
+
+       if (opt->has_flag(Option::FLAG_NO_MON_UPDATE)) {
+         err = -EINVAL;
+         ss << name << " is special and cannot be stored by the mon";
+         goto reply;
+       }
+     }
+
+     string section;
+     OptionMask mask;
+     if (!ConfigMap::parse_mask(who, &section, &mask)) {
+       ss << "unrecognized config target '" << who << "'";
+       err = -EINVAL;
+       goto reply;
+     }
+
+     // key layout: <section or "global">/[<mask>/]<name>
+     string key;
+     if (section.size()) {
+       key += section + "/";
+     } else {
+       key += "global/";
+     }
+     string mask_str = mask.to_str();
+     if (mask_str.size()) {
+       key += mask_str + "/";
+     }
+     key += name;
+
+     if (prefix == "config set") {
+       bufferlist bl;
+       bl.append(value);
+       pending[key] = bl;
+     } else {
+       // an empty optional marks the key for removal
+       pending[key].reset();
+     }
+     goto update;
+   } else if (prefix == "config reset") {
+     int64_t revert_to = -1;
+     cmd_getval(cmdmap, "num", revert_to);
+     if (revert_to < 0 ||
+         revert_to > (int64_t)version) {
+       err = -EINVAL;
+       ss << "must specify a valid historical version to revert to; "
+          << "see 'ceph config log' for a list of available configuration "
+          << "historical versions";
+       goto reply;
+     }
+     if (revert_to == (int64_t)version) {
+       // already at the requested version
+       err = 0;
+       goto reply;
+     }
+     // newest first, so the value staged last for a key is the one taken
+     // from the oldest changeset being undone
+     for (int64_t v = version; v > revert_to; --v) {
+       ConfigChangeSet ch;
+       load_changeset(v, &ch);
+       for (auto& i : ch.diff) {
+         if (i.second.first) {
+           bufferlist bl;
+           bl.append(*i.second.first);
+           pending[i.first] = bl;
+         } else if (i.second.second) {
+           pending[i.first].reset();
+         }
+       }
+     }
+     pending_description = string("reset to ") + stringify(revert_to);
+     goto update;
+   } else if (prefix == "config assimilate-conf") {
+     ConfFile cf;
+     bufferlist bl = m->get_data();
+     err = cf.parse_bufferlist(&bl, &ss);
+     if (err < 0) {
+       goto reply;
+     }
+     bool updated = false;
+     // collects every option that was NOT taken over, in conf-file syntax
+     ostringstream newconf;
+     for (auto& [section, s] : cf) {
+       dout(20) << __func__ << " [" << section << "]" << dendl;
+       bool did_section = false;
+       for (auto& [key, val] : s) {
+         Option::value_t real_value;
+         string value;
+         string errstr;
+         if (key.empty()) {
+           continue;
+         }
+         // a known and worthy option?
+         const Option *o = g_conf().find_option(key);
+         if (!o) {
+           o = mon.mgrmon()->find_module_option(key);
+         }
+         if (!o ||
+             (o->flags & Option::FLAG_NO_MON_UPDATE) ||
+             (o->flags & Option::FLAG_CLUSTER_CREATE)) {
+           goto skip;
+         }
+         // normalize
+         err = o->parse_value(val, &real_value, &errstr, &value);
+         if (err < 0) {
+           dout(20) << __func__ << " failed to parse " << key << " = '"
+                    << val << "'" << dendl;
+           goto skip;
+         }
+         // does it conflict with an existing value?
+         {
+           const Section *s = config_map.find_section(section);
+           if (s) {
+             auto k = s->options.find(key);
+             if (k != s->options.end()) {
+               if (value != k->second.raw_value) {
+                 dout(20) << __func__ << " have " << key
+                          << " = " << k->second.raw_value
+                          << " (not " << value << ")" << dendl;
+                 goto skip;
+               }
+               dout(20) << __func__ << " already have " << key
+                        << " = " << k->second.raw_value << dendl;
+               continue;
+             }
+           }
+         }
+         dout(20) << __func__ << " add " << key << " = " << value
+                  << " (" << val << ")" << dendl;
+         {
+           bufferlist bl;
+           bl.append(value);
+           pending[section + "/" + key] = bl;
+           updated = true;
+         }
+         continue;
+
+       skip:
+         dout(20) << __func__ << " skip " << key << " = " << value
+                  << " (" << val << ")" << dendl;
+         if (!did_section) {
+           newconf << "\n[" << section << "]\n";
+           did_section = true;
+         }
+         newconf << "\t" << key << " = " << val << "\n";
+       }
+     }
+     odata.append(newconf.str());
+     if (updated) {
+       goto update;
+     }
+   } else {
+     ss << "unknown command " << prefix;
+     err = -EINVAL;
+   }
+
+ reply:
+   // answer immediately; nothing is proposed on this path
+   mon.reply_command(op, err, ss.str(), odata, get_last_committed());
+   return false;
+
+ update:
+   // see if there is an actual change
+   auto p = pending.begin();
+   while (p != pending.end()) {
+     auto q = current.find(p->first);
+     if (p->second && q != current.end() && *p->second == q->second) {
+       // set to same value
+       p = pending.erase(p);
+     } else if (!p->second && q == current.end()) {
+       // erasing non-existent value
+       p = pending.erase(p);
+     } else {
+       ++p;
+     }
+   }
+   if (pending.empty()) {
+     err = 0;
+     goto reply;
+   }
+   // immediately propose *with* KV mon
+   encode_pending_to_kvmon();
+   paxos.plug();
+   mon.kvmon()->propose_pending();
+   paxos.unplug();
+   force_immediate_propose();
+   wait_for_commit(
+     op,
+     new Monitor::C_Command(
+       mon, op, 0, ss.str(), odata,
+       get_last_committed() + 1));
+   return true;
+ }
+
+ // Periodic work, performed only on an active leader: when load_config()
+ // has queued entries in pending_cleanup and the KV monitor is writeable,
+ // encode them to the KV monitor and propose.
+ void ConfigMonitor::tick()
+ {
+   if (!(is_active() && mon.is_leader())) {
+     return;
+   }
+   dout(10) << __func__ << dendl;
+
+   if (pending_cleanup.empty()) {
+     return;   // nothing queued
+   }
+   if (!mon.kvmon()->is_writeable()) {
+     return;   // leave it for a later tick
+   }
+
+   paxos.plug();
+   encode_pending_to_kvmon();
+   mon.kvmon()->propose_pending();
+   paxos.unplug();
+   propose_pending();
+ }
+
+ void ConfigMonitor::on_active()
+ { // intentionally empty: this service performs no work in on_active()
+ }
+
+ // Rebuild config_map and current from the keys stored under
+ // KV_PREFIX/KEY_PREFIX.  Keys using a pre-pacific option name are loaded
+ // under the new name (and queued for rewriting once every monitor is at
+ // least pacific); keys that add_option() rejects with -EINVAL are queued
+ // for removal.  Finally the values that apply to this monitor itself are
+ // pushed into its own configuration.
+ void ConfigMonitor::load_config()
+ {
+   // old option name -> name it was renamed to
+   std::map<std::string,std::string> renamed_pacific = {
+     { "mon_osd_blacklist_default_expire", "mon_osd_blocklist_default_expire" },
+     { "mon_mds_blacklist_interval", "mon_mds_blocklist_interval" },
+     { "mon_mgr_blacklist_interval", "mon_mgr_blocklist_interval" },
+     { "rbd_blacklist_on_break_lock", "rbd_blocklist_on_break_lock" },
+     { "rbd_blacklist_expire_seconds", "rbd_blocklist_expire_seconds" },
+     { "mds_session_blacklist_on_timeout", "mds_session_blocklist_on_timeout" },
+     { "mds_session_blacklist_on_evict", "mds_session_blocklist_on_evict" },
+   };
+
+   config_map.clear();
+   current.clear();
+
+   // look an option up in the built-in schema first, then among the mgr
+   // module options
+   auto lookup_option = [&](const std::string& opt_name) {
+     const Option *opt = g_conf().find_option(opt_name);
+     if (!opt) {
+       opt = mon.mgrmon()->find_module_option(opt_name);
+     }
+     return opt;
+   };
+
+   unsigned num = 0;
+   KeyValueDB::Iterator it = mon.store->get_iterator(KV_PREFIX);
+   it->lower_bound(KEY_PREFIX);
+   while (it->valid() &&
+          it->key().compare(0, KEY_PREFIX.size(), KEY_PREFIX) == 0) {
+     string key = it->key().substr(KEY_PREFIX.size());
+     string value = it->value().to_str();
+
+     current[key] = it->value();
+
+     string name;
+     string who;
+     config_map.parse_key(key, &name, &who);
+
+     // has this option been renamed?
+     auto renamed = renamed_pacific.find(name);
+     if (renamed != renamed_pacific.end()) {
+       if (mon.monmap->min_mon_release >= ceph_release_t::pacific) {
+         // schedule a cleanup: drop the old key, store under the new name
+         pending_cleanup[key].reset();
+         pending_cleanup[who + "/" + renamed->second] = it->value();
+       }
+       // continue loading under the new name
+       name = renamed->second;
+     }
+
+     int r = config_map.add_option(
+       g_ceph_context, name, who, value, lookup_option);
+     if (r == -EINVAL) {
+       dout(10) << __func__ << " will clean up key " << key << dendl;
+       pending_cleanup[key].reset();
+     }
+
+     it->next();
+     ++num;
+   }
+   dout(10) << __func__ << " got " << num << " keys" << dendl;
+
+   // refresh our own config
+   const OSDMap& osdmap = mon.osdmon()->osdmap;
+   map<string,string> crush_location;
+   osdmap.crush->get_full_location(g_conf()->host, &crush_location);
+   auto own_config = config_map.generate_entity_map(
+     g_conf()->name,
+     crush_location,
+     osdmap.crush.get(),
+     string{}); // no device class
+   g_conf().set_mon_vals(g_ceph_context, own_config, nullptr);
+ }
+
+ // Fill *ch with the stored history record of version v.
+ //
+ // All keys starting with HISTORY_PREFIX + "<v>/" are visited.  The key that
+ // equals that prefix holds the encoded stamp and name.  Every other key has
+ // one operation character right after the prefix, followed by the config
+ // key: '-' entries go to diff[key].first, '+' entries to diff[key].second.
+ // A decode failure of the stamp/name record is logged and otherwise ignored.
+ void ConfigMonitor::load_changeset(version_t v, ConfigChangeSet *ch)
+ {
+   ch->version = v;
+   string prefix = HISTORY_PREFIX + stringify(v) + "/";
+   KeyValueDB::Iterator it = mon.store->get_iterator(KV_PREFIX);
+   for (it->lower_bound(prefix);
+        it->valid() && it->key().compare(0, prefix.size(), prefix) == 0;
+        it->next()) {
+     const string full_key = it->key();
+     if (full_key == prefix) {
+       // the changeset's own metadata
+       bufferlist bl = it->value();
+       auto p = bl.cbegin();
+       try {
+         decode(ch->stamp, p);
+         decode(ch->name, p);
+       }
+       catch (ceph::buffer::error& e) {
+         derr << __func__ << " failure decoding changeset " << v << dendl;
+       }
+       continue;
+     }
+
+     const char op = full_key[prefix.length()];
+     const string key = full_key.substr(prefix.length() + 1);
+     switch (op) {
+     case '-':
+       ch->diff[key].first = it->value().to_str();
+       break;
+     case '+':
+       ch->diff[key].second = it->value().to_str();
+       break;
+     default:
+       break;   // unknown markers are ignored
+     }
+   }
+ }
+
+ // Recompute the configuration that applies to session s and store it in
+ // s->last_config.
+ //
+ // The CRUSH location is taken from s->remote_host when that is set; for OSD
+ // sessions the location of the entity itself and its device class are
+ // looked up as well.  Returns false when the session already holds an
+ // identical config (and has received one before), true otherwise.
+ bool ConfigMonitor::refresh_config(MonSession *s)
+ {
+   const OSDMap& osdmap = mon.osdmon()->osdmap;
+
+   map<string,string> crush_location;
+   if (!s->remote_host.empty()) {
+     osdmap.crush->get_full_location(s->remote_host, &crush_location);
+     dout(10) << __func__ << " crush_location for remote_host " << s->remote_host
+              << " is " << crush_location << dendl;
+   }
+
+   string device_class;
+   if (s->name.is_osd()) {
+     osdmap.crush->get_full_location(s->entity_name.to_str(), &crush_location);
+     if (const char *cls = osdmap.crush->get_item_class(s->name.num())) {
+       device_class = cls;
+       dout(10) << __func__ << " device_class " << device_class << dendl;
+     }
+   }
+
+   dout(20) << __func__ << " " << s->entity_name << " crush " << crush_location
+            << " device_class " << device_class << dendl;
+   auto fresh = config_map.generate_entity_map(
+     s->entity_name,
+     crush_location,
+     osdmap.crush.get(),
+     device_class);
+
+   const bool unchanged = s->any_config && fresh == s->last_config;
+   if (unchanged) {
+     dout(20) << __func__ << " no change, " << fresh << dendl;
+     return false;
+   }
+   // removing this to hide sensitive data going into logs
+   // leaving this for debugging purposes
+   // dout(20) << __func__ << " " << out << dendl;
+   s->last_config = std::move(fresh);
+   s->any_config = true;
+   return true;
+ }
+
+ // Refresh the session's config and send it only if refresh_config()
+ // reported a change.  Returns that "changed" flag.
+ bool ConfigMonitor::maybe_send_config(MonSession *s)
+ {
+   const bool changed = refresh_config(s);
+   dout(10) << __func__ << " to " << s->name << " "
+            << (changed ? "(changed)" : "(unchanged)")
+            << dendl;
+   if (!changed) {
+     return false;
+   }
+   send_config(s);
+   return true;
+ }
+
+ // Send the session's cached last_config to it as an MConfig message.
+ void ConfigMonitor::send_config(MonSession *s)
+ {
+   dout(10) << __func__ << " to " << s->name << dendl;
+   s->con->send_message(new MConfig(s->last_config));
+ }
+
+ // Service the "config" subscription of a session, if it has one.
+ // Unauthenticated sessions are ignored.
+ void ConfigMonitor::check_sub(MonSession *s)
+ {
+   if (!s->authenticated) {
+     dout(20) << __func__ << " not authenticated " << s->entity_name << dendl;
+     return;
+   }
+   auto config_sub = s->sub_map.find("config");
+   if (config_sub == s->sub_map.end()) {
+     return;
+   }
+   check_sub(config_sub->second);
+ }
+
+ // Service one subscription: when the subscriber's `next` is not beyond our
+ // version, (maybe) send the config to its session.  One-time subscriptions
+ // are then removed from the session map; persistent ones are advanced to
+ // version + 1.
+ void ConfigMonitor::check_sub(Subscription *sub)
+ {
+   dout(10) << __func__
+            << " next " << sub->next
+            << " have " << version << dendl;
+   if (sub->next > version) {
+     return;   // subscriber is already ahead of us
+   }
+   maybe_send_config(sub->session);
+   if (!sub->onetime) {
+     sub->next = version + 1;
+     return;
+   }
+   mon.with_session_map([sub](MonSessionMap& session_map) {
+     session_map.remove_sub(sub);
+   });
+ }
+
+ // Walk every "config" subscription in the session map and send the config
+ // to each session whose effective config changed; logs how many of the
+ // visited subscriptions were updated.
+ void ConfigMonitor::check_all_subs()
+ {
+   dout(10) << __func__ << dendl;
+   auto subs = mon.session_map.subs.find("config");
+   if (subs == mon.session_map.subs.end()) {
+     return;
+   }
+   int updated = 0;
+   int total = 0;
+   for (auto p = subs->second->begin(); !p.end(); ) {
+     auto sub = *p;
+     ++p;   // step past this entry before handling it
+     ++total;
+     if (maybe_send_config(sub->session)) {
+       ++updated;
+     }
+   }
+   dout(10) << __func__ << " updated " << updated << " / " << total << dendl;
+ }
diff --git a/src/mon/ConfigMonitor.h b/src/mon/ConfigMonitor.h
new file mode 100644
index 000000000..263bdb7a4
--- /dev/null
+++ b/src/mon/ConfigMonitor.h
@@ -0,0 +1,58 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <optional>
+
+#include "ConfigMap.h"
+#include "mon/PaxosService.h"
+
+class MonSession;
+
+ class ConfigMonitor : public PaxosService // Paxos service behind the "config ..." commands and per-session config delivery
+ {
+ version_t version = 0; // latest config version known here; upper bound for "config reset" and compared with Subscription::next
+ ConfigMap config_map; // parsed options; cleared and rebuilt by load_config()
+ std::map<std::string,std::optional<ceph::buffer::list>> pending; // changes staged by prepare_command(): value = set, empty optional = remove
+ std::string pending_description; // set to "reset to <N>" by "config reset"
+ std::map<std::string,std::optional<ceph::buffer::list>> pending_cleanup; // fix-ups queued by load_config(); tick() proposes when non-empty
+
+ std::map<std::string,ceph::buffer::list> current; // stored values as loaded by load_config(), keyed without KEY_PREFIX
+
+ void encode_pending_to_kvmon();
+
+ public:
+ ConfigMonitor(Monitor &m, Paxos &p, const std::string& service_name);
+
+ void init() override;
+
+ void load_config(); // rebuild config_map/current from the store
+ void load_changeset(version_t v, ConfigChangeSet *ch); // read the history record of version v into *ch
+
+ bool preprocess_query(MonOpRequestRef op) override;
+ bool prepare_update(MonOpRequestRef op) override; // dispatches MSG_MON_COMMAND to prepare_command()
+
+ bool preprocess_command(MonOpRequestRef op); // read-only commands ("config get", "config dump", ...)
+ bool prepare_command(MonOpRequestRef op); // mutating commands ("config set", "config rm", "config reset", ...)
+
+ void handle_get_config(MonOpRequestRef op); // answers MGetConfig with an MConfig
+
+ void create_initial() override;
+ void update_from_paxos(bool *need_bootstrap) override;
+ void create_pending() override;
+ void encode_pending(MonitorDBStore::TransactionRef t) override;
+ version_t get_trim_to() const override;
+
+ void encode_full(MonitorDBStore::TransactionRef t) override { } // intentionally empty
+
+ void on_active() override;
+ void tick() override;
+
+ bool refresh_config(MonSession *s); // recompute s->last_config; true if it changed
+ bool maybe_send_config(MonSession *s); // refresh, then send only on change
+ void send_config(MonSession *s); // send s->last_config unconditionally
+ void check_sub(MonSession *s); // service the session's "config" subscription, if any
+ void check_sub(Subscription *sub);
+ void check_all_subs();
+ };
diff --git a/src/mon/ConnectionTracker.cc b/src/mon/ConnectionTracker.cc
new file mode 100644
index 000000000..c87d614f6
--- /dev/null
+++ b/src/mon/ConnectionTracker.cc
@@ -0,0 +1,377 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "ConnectionTracker.h"
+#include "common/Formatter.h"
+#include "common/dout.h"
+#include "include/ceph_assert.h"
+
+#define dout_subsys ceph_subsys_mon
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, rank, epoch, version)
+
+ // Emits the log-line prefix used by dout_prefix in this file.
+ static std::ostream& _prefix(std::ostream *_dout, int rank, epoch_t epoch, uint64_t version) {
+   std::ostream& out = *_dout;
+   out << "rank: " << rank
+       << " version: " << version
+       << " ConnectionTracker(" << epoch << ") ";
+   return out;
+ }
+
+ // Human-readable one-line rendering of a ConnectionReport.
+ std::ostream& operator<<(std::ostream&o, const ConnectionReport& c) {
+   o << "rank=" << c.rank;
+   o << ",epoch=" << c.epoch;
+   o << ",version=" << c.epoch_version;
+   o << ", current links: " << c.current;
+   o << ", history: " << c.history;
+   return o;
+ }
+
+ // Human-readable one-line rendering of a ConnectionTracker.
+ std::ostream& operator<<(std::ostream& o, const ConnectionTracker& c) {
+   o << "rank=" << c.rank;
+   o << ", epoch=" << c.epoch;
+   o << ", version=" << c.version;
+   o << ", half_life=" << c.half_life;
+   o << ", reports: " << c.peer_reports;
+   return o;
+ }
+
+ // Return the report stored for rank p, creating a default-constructed one
+ // if none exists yet.  Creating an entry for our own rank is asserted
+ // against.
+ ConnectionReport *ConnectionTracker::reports(int p)
+ {
+   auto found = peer_reports.find(p);
+   if (found != peer_reports.end()) {
+     return &found->second;
+   }
+   ceph_assert(p != rank);
+   auto inserted =
+     peer_reports.insert(std::pair<int,ConnectionReport>(p, ConnectionReport()));
+   return &inserted.first->second;
+ }
+
+ // Read-only lookup of the report stored for rank p; yields a null pointer
+ // when there is none (nothing is created).
+ const ConnectionReport *ConnectionTracker::reports(int p) const
+ {
+   auto found = peer_reports.find(p);
+   return found != peer_reports.end() ? &found->second : NULL;
+ }
+
+ // Merge the peer reports carried by another tracker into ours.  Entries for
+ // our own rank or for negative ranks are skipped; an incoming report
+ // replaces the stored one only if it has a higher epoch, or the same epoch
+ // and a higher epoch_version.  The cached encoding is dropped afterwards.
+ void ConnectionTracker::receive_peer_report(const ConnectionTracker& o)
+ {
+   ldout(cct, 30) << __func__ << dendl;
+   for (auto& entry : o.peer_reports) {
+     const int peer = entry.first;
+     if (peer == rank || peer < 0) {
+       continue;
+     }
+     const ConnectionReport& incoming = entry.second;
+     ConnectionReport& existing = *reports(peer);
+
+     const bool newer_epoch = incoming.epoch > existing.epoch;
+     const bool newer_version =
+       incoming.epoch == existing.epoch &&
+       incoming.epoch_version > existing.epoch_version;
+     if (!newer_epoch && !newer_version) {
+       continue;
+     }
+     ldout(cct, 30) << " new peer_report is more updated" << dendl;
+     ldout(cct, 30) << "existing: " << existing << dendl;
+     ldout(cct, 30) << "new: " << incoming << dendl;
+     existing = incoming;
+   }
+   encoding.clear();
+ }
+
+ // Move to election epoch e.  Succeeds only when e is newer than the current
+ // epoch and our rank is valid (>= 0); in that case the version restarts at
+ // 0, our own report is republished into peer_reports and the cached
+ // encoding is dropped.  Returns whether the epoch was advanced.
+ bool ConnectionTracker::increase_epoch(epoch_t e)
+ {
+   ldout(cct, 30) << __func__ << " to " << e << dendl;
+   if (e <= epoch || rank < 0) {
+     ldout(cct, 10) << "Either got a report from a rank -1 or our epoch is >= to "
+                    << e << " not increasing our epoch!" << dendl;
+     return false;
+   }
+   version = 0;
+   my_reports.epoch_version = version;
+   epoch = e;
+   my_reports.epoch = epoch;
+   peer_reports[rank] = my_reports;
+   encoding.clear();
+   return true;
+ }
+
+ // Bump our report version (only with a valid rank), republish our own
+ // report into peer_reports and, on every persist_interval-th version, ask
+ // the owner to persist the connectivity scores.
+ void ConnectionTracker::increase_version()
+ {
+   ldout(cct, 30) << __func__ << " to " << version+1 << dendl;
+   if (rank < 0) {
+     ldout(cct, 10) << "Got a report from a rank -1, not increasing our version!" << dendl;
+     return;
+   }
+   encoding.clear();
+   ++version;
+   my_reports.epoch_version = version;
+   peer_reports[rank] = my_reports;
+
+   const bool time_to_persist = (version % persist_interval) == 0;
+   if (!time_to_persist) {
+     return;
+   }
+   ldout(cct, 30) << version << " % " << persist_interval << " == 0" << dendl;
+   owner->persist_connectivity_scores();
+ }
+
+ // Record that the connection to peer_rank has been alive for units_alive.
+ //
+ // Reports about our own rank or about a negative rank are dropped.  The
+ // peer's score starts at 1.0 when first seen and is then moved towards 1
+ // by the fraction units_alive / (2 * half_life), capped at 1.0.  The peer is
+ // marked as currently connected and our report version is bumped.
+ void ConnectionTracker::report_live_connection(int peer_rank, double units_alive)
+ {
+   ldout(cct, 30) << __func__ << " peer_rank: " << peer_rank << " units_alive: " << units_alive << dendl;
+   ldout(cct, 30) << "my_reports before: " << my_reports << dendl;
+   if (peer_rank == rank) {
+     lderr(cct) << "Got a report from my own rank, hopefully this is startup weirdness, dropping" << dendl;
+     return;
+   }
+   if (peer_rank < 0) {
+     ldout(cct, 10) << "Got a report from a rank -1, not adding that to our report!" << dendl;
+     return;
+   }
+   // we need to "auto-initialize" to 1, do shenanigans
+   auto i = my_reports.history.find(peer_rank);
+   if (i == my_reports.history.end()) {
+     ldout(cct, 30) << "couldn't find: " << peer_rank
+                    << " in my_reports.history" << "... inserting: "
+                    << "(" << peer_rank << ", 1)" << dendl;
+     i = my_reports.history.insert(std::pair<int,double>(peer_rank, 1.0)).first;
+   }
+   double& pscore = i->second;
+   ldout(cct, 30) << "adding new pscore to my_reports" << dendl;
+   // weight of this observation relative to the accumulated history
+   const double weight = units_alive / (2 * half_life);
+   pscore = pscore * (1 - weight) + weight;
+   pscore = std::min(pscore, 1.0);
+   my_reports.current[peer_rank] = true;
+
+   increase_version();
+   ldout(cct, 30) << "my_reports after: " << my_reports << dendl;
+ }
+
+ // Record that the connection to peer_rank has been dead for units_dead.
+ //
+ // Reports about our own rank or about a negative rank are dropped.  The
+ // peer's score starts at 1.0 when first seen; it is then scaled by
+ // (1 - w) and reduced by w, with w = units_dead / (2 * half_life), and
+ // floored at 0.0.  The peer is marked as currently disconnected and our
+ // report version is bumped.
+ void ConnectionTracker::report_dead_connection(int peer_rank, double units_dead)
+ {
+   ldout(cct, 30) << __func__ << " peer_rank: " << peer_rank << " units_dead: " << units_dead << dendl;
+   ldout(cct, 30) << "my_reports before: " << my_reports << dendl;
+   if (peer_rank == rank) {
+     lderr(cct) << "Got a report from my own rank, hopefully this is startup weirdness, dropping" << dendl;
+     return;
+   }
+   if (peer_rank < 0) {
+     ldout(cct, 10) << "Got a report from a rank -1, not adding that to our report!" << dendl;
+     return;
+   }
+   // we need to "auto-initialize" to 1, do shenanigans
+   auto i = my_reports.history.find(peer_rank);
+   if (i == my_reports.history.end()) {
+     ldout(cct, 30) << "couldn't find: " << peer_rank
+                    << " in my_reports.history" << "... inserting: "
+                    << "(" << peer_rank << ", 1)" << dendl;
+     i = my_reports.history.insert(std::pair<int,double>(peer_rank, 1.0)).first;
+   }
+   double& pscore = i->second;
+   ldout(cct, 30) << "adding new pscore to my_reports" << dendl;
+   // weight of this observation relative to the accumulated history
+   const double weight = units_dead / (2 * half_life);
+   pscore = pscore * (1 - weight) - weight;
+   pscore = std::max(pscore, 0.0);
+   my_reports.current[peer_rank] = false;
+
+   increase_version();
+   ldout(cct, 30) << "my_reports after: " << my_reports << dendl;
+ }
+
+ // Sum up how the other ranks rate their connection to peer_rank.
+ //
+ // Every report except the one peer_rank holds about itself is consulted.  A
+ // report contributes only if it has both a history score and a "current"
+ // entry for peer_rank and that entry says the link is alive.
+ //
+ // @param peer_rank   rank being scored
+ // @param rating      out: sum of the contributing history scores
+ // @param live_count  out: number of contributing reports
+ void ConnectionTracker::get_total_connection_score(int peer_rank, double *rating,
+                                                     int *live_count) const
+ {
+   ldout(cct, 30) << __func__ << dendl;
+   double rate = 0;
+   int live = 0;
+
+   for (const auto& i : peer_reports) { // loop through all the scores
+     if (i.first == peer_rank) { // ... except the ones it has for itself, of course!
+       continue;
+     }
+     const auto& report = i.second;
+     auto score_i = report.history.find(peer_rank);
+     auto live_i = report.current.find(peer_rank);
+     // a report may carry a score without a matching "current" entry (or
+     // vice versa); never dereference an end() iterator
+     if (score_i == report.history.end() ||
+         live_i == report.current.end()) {
+       continue;
+     }
+     if (live_i->second) {
+       rate += score_i->second;
+       ++live;
+     }
+   }
+   *rating = rate;
+   *live_count = live;
+ }
+
+ // Adopt new_rank as our rank.  A no-op when the rank is unchanged; otherwise
+ // the peer_reports entries for both the old and the new rank are dropped,
+ // the cached encoding is cleared and the report version is bumped.
+ void ConnectionTracker::notify_rank_changed(int new_rank)
+ {
+   ldout(cct, 20) << __func__ << " to " << new_rank << dendl;
+   if (new_rank != rank) {
+     ldout(cct, 20) << "peer_reports before: " << peer_reports << dendl;
+     const int old_rank = rank;
+     peer_reports.erase(old_rank);
+     peer_reports.erase(new_rank);
+     rank = new_rank;
+     my_reports.rank = new_rank;
+     encoding.clear();
+     ldout(cct, 20) << "peer_reports after: " << peer_reports << dendl;
+
+     increase_version();
+   }
+ }
+
+ // React to rank_removed leaving the monitor set.
+ //
+ // In our own report, the entries for rank_removed are dropped and every
+ // entry for a higher rank is moved down by one.  The same renumbering is
+ // applied to the keys of peer_reports (the contents of the peers' reports
+ // are left alone).  Our own rank is decremented if it was above the removed
+ // one and must then equal new_rank.  Finally the report version is bumped.
+ void ConnectionTracker::notify_rank_removed(int rank_removed, int new_rank)
+ {
+   ldout(cct, 20) << __func__ << " " << rank_removed
+                  << " new_rank: " << new_rank << dendl;
+   ldout(cct, 20) << "my_reports before: " << my_reports << dendl;
+   ldout(cct, 20) << "peer_reports before: " << peer_reports << dendl;
+   ldout(cct, 20) << "my rank before: " << rank << dendl;
+
+   encoding.clear();
+   size_t starting_size_current = my_reports.current.size();
+   // Let's adjust everything in my report.
+   my_reports.current.erase(rank_removed);
+   my_reports.history.erase(rank_removed);
+   auto ci = my_reports.current.upper_bound(rank_removed);
+   auto hi = my_reports.history.upper_bound(rank_removed);
+   while (ci != my_reports.current.end()) {
+     // current and history are expected to carry the same ranks; check the
+     // history iterator before touching it so that a mismatch trips the
+     // assert instead of dereferencing end()
+     ceph_assert(hi != my_reports.history.end());
+     ceph_assert(ci->first == hi->first);
+     my_reports.current[ci->first - 1] = ci->second;
+     my_reports.history[hi->first - 1] = hi->second;
+     my_reports.current.erase(ci++);
+     my_reports.history.erase(hi++);
+   }
+   ceph_assert((my_reports.current.size() == starting_size_current) ||
+               (my_reports.current.size() + 1 == starting_size_current));
+
+   size_t starting_size = peer_reports.size();
+   auto pi = peer_reports.upper_bound(rank_removed);
+   // Remove the target rank and adjust everything that comes after.
+   // Note that we don't adjust current and history for our peer_reports
+   // because it is better to rely on our peers on that information.
+   peer_reports.erase(rank_removed);
+   while (pi != peer_reports.end()) {
+     peer_reports[pi->first - 1] = pi->second; // copy content of next rank to ourself.
+     peer_reports.erase(pi++); // destroy our next rank and move on.
+   }
+
+   ceph_assert((peer_reports.size() == starting_size) ||
+               (peer_reports.size() + 1 == starting_size));
+
+   if (rank_removed < rank) { // if the rank removed is lower than us, we need to adjust.
+     --rank;
+     my_reports.rank = rank; // also adjust my_reports.rank.
+   }
+
+   ldout(cct, 20) << "my rank after: " << rank << dendl;
+   ldout(cct, 20) << "peer_reports after: " << peer_reports << dendl;
+   ldout(cct, 20) << "my_reports after: " << my_reports << dendl;
+
+   // check if the new_rank from monmap is equal to our adjusted rank.
+   ceph_assert(rank == new_rank);
+
+   increase_version();
+ }
+
+ // Consistency check against the monmap: our rank and the rank recorded in
+ // our own report must both equal mon_rank, and the highest rank present in
+ // peer_reports must not exceed monmap_size - 1.
+ bool ConnectionTracker::is_clean(int mon_rank, int monmap_size)
+ {
+   ldout(cct, 30) << __func__ << dendl;
+   const bool rank_matches = (rank == mon_rank) && (my_reports.rank == mon_rank);
+   if (!rank_matches) {
+     return false;
+   }
+   if (peer_reports.empty()) {
+     return true;
+   }
+   // a report for a rank beyond the monmap means something is off
+   const auto highest_peer = peer_reports.rbegin()->first;
+   return !(highest_peer > monmap_size - 1);
+ }
+
+ void ConnectionTracker::encode(bufferlist &bl) const
+ { // serializes rank, epoch, version, half_life and peer_reports into bl
+ ENCODE_START(1, 1, bl); // NOTE(review): the (1, 1) arguments are presumably struct version / compat version — confirm
+ encode(rank, bl); // field order must mirror ConnectionTracker::decode()
+ encode(epoch, bl);
+ encode(version, bl);
+ encode(half_life, bl);
+ encode(peer_reports, bl); // my_reports is not written separately; decode() restores it from peer_reports[rank]
+ ENCODE_FINISH(bl);
+ }
+
+ void ConnectionTracker::decode(bufferlist::const_iterator& bl) {
+ clear_peer_reports(); // NOTE(review): helper defined elsewhere; presumably empties the stored reports — confirm
+ encoding.clear(); // the cached encoding no longer matches what we are about to load
+
+ DECODE_START(1, bl);
+ decode(rank, bl); // field order must mirror ConnectionTracker::encode()
+ decode(epoch, bl);
+ decode(version, bl);
+ decode(half_life, bl);
+ decode(peer_reports, bl);
+ DECODE_FINISH(bl);
+ if (rank >=0) // only with a valid rank: take our own report from the decoded set
+ my_reports = peer_reports[rank]; // operator[] creates a default report when none was decoded for our rank
+ }
+
+ // Return the cached encoding of this tracker, producing it first if the
+ // cache is currently empty.
+ const bufferlist& ConnectionTracker::get_encoded_bl()
+ {
+   const bool cache_empty = (encoding.length() == 0);
+   if (cache_empty) {
+     encode(encoding);
+   }
+   return encoding;
+ }
+
+ // Dump this report through a Formatter: rank, epoch, version and, under
+ // "peer_scores", one "peer" object per history entry with the peer's rank,
+ // score and liveness.  A peer that has a score but no entry in `current`
+ // is reported as not alive.
+ void ConnectionReport::dump(ceph::Formatter *f) const
+ {
+   f->dump_int("rank", rank);
+   f->dump_int("epoch", epoch);
+   f->dump_int("version", epoch_version);
+   f->open_object_section("peer_scores");
+   for (const auto& i : history) {
+     // history and current are separate maps; do not assume the peer is
+     // present in both
+     const auto alive = current.find(i.first);
+     f->open_object_section("peer");
+     f->dump_int("peer_rank", i.first);
+     f->dump_float("peer_score", i.second);
+     f->dump_bool("peer_alive", alive != current.end() && alive->second);
+     f->close_section();
+   }
+   f->close_section(); // peer scores
+ }
+
+ // Appends two heap-allocated instances to o: a default-constructed report
+ // and one with every field populated.
+ void ConnectionReport::generate_test_instances(std::list<ConnectionReport*>& o)
+ {
+   o.push_back(new ConnectionReport);
+
+   o.push_back(new ConnectionReport);
+   ConnectionReport& populated = *o.back();
+   populated.rank = 1;
+   populated.epoch = 2;
+   populated.epoch_version = 3;
+   populated.current[0] = true;
+   populated.history[0] = .4;
+ }
+
+ // Dump the tracker through a Formatter: its scalar fields followed by a
+ // "reports" section holding one "report" object per entry of peer_reports.
+ void ConnectionTracker::dump(ceph::Formatter *f) const
+ {
+   f->dump_int("rank", rank);
+   f->dump_int("epoch", epoch);
+   f->dump_int("version", version);
+   f->dump_float("half_life", half_life);
+   f->dump_int("persist_interval", persist_interval);
+
+   f->open_object_section("reports");
+   for (const auto& entry : peer_reports) {
+     const ConnectionReport& report = entry.second;
+     f->open_object_section("report");
+     report.dump(f);
+     f->close_section();
+   }
+   f->close_section(); // reports
+ }
+
+ // Appends two heap-allocated instances to o: a default-constructed tracker
+ // and one at rank 2 / epoch 3 / version 4 with (default) reports for ranks
+ // 0, 1 and 2.
+ void ConnectionTracker::generate_test_instances(std::list<ConnectionTracker*>& o)
+ {
+   o.push_back(new ConnectionTracker);
+
+   o.push_back(new ConnectionTracker);
+   ConnectionTracker& populated = *o.back();
+   populated.rank = 2;
+   populated.epoch = 3;
+   populated.version = 4;
+   // operator[] default-constructs the missing entries
+   populated.peer_reports[0];
+   populated.peer_reports[1];
+   populated.my_reports = populated.peer_reports[2];
+ }
diff --git a/src/mon/ConnectionTracker.h b/src/mon/ConnectionTracker.h
new file mode 100644
index 000000000..c1a32c08f
--- /dev/null
+++ b/src/mon/ConnectionTracker.h
@@ -0,0 +1,205 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+#include "include/types.h"
+
+// One monitor rank's view of its connections to other ranks, stamped with
+// the epoch and per-epoch version it was produced at.
+struct ConnectionReport {
+ int rank = -1; // mon rank this state belongs to
+ std::map<int, bool> current; // true if connected to the other mon
+ std::map<int, double> history; // [0-1]; the connection reliability
+ epoch_t epoch = 0; // the (local) election epoch the ConnectionReport came from
+ uint64_t epoch_version = 0; // version of the ConnectionReport within the epoch
+ // Serialize all five fields, in declaration order, inside a version-1
+ // envelope.
+ void encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(rank, bl);
+ encode(current, bl);
+ encode(history, bl);
+ encode(epoch, bl);
+ encode(epoch_version, bl);
+ ENCODE_FINISH(bl);
+ }
+ // Read the fields back in the same order encode() wrote them.
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(1, bl);
+ decode(rank, bl);
+ decode(current, bl);
+ decode(history, bl);
+ decode(epoch, bl);
+ decode(epoch_version, bl);
+ DECODE_FINISH(bl);
+ }
+ // Two reports are equal only when every field matches.
+ bool operator==(const ConnectionReport& o) const {
+ return o.rank == rank && o.current == current &&
+ o.history == history && o.epoch == epoch &&
+ o.epoch_version == epoch_version;
+ }
+ friend std::ostream& operator<<(std::ostream&o, const ConnectionReport& c);
+
+ // Write this report to a Formatter (defined in ConnectionTracker.cc).
+ void dump(ceph::Formatter *f) const;
+ // Append sample instances to `o`; the caller receives raw pointers.
+ static void generate_test_instances(std::list<ConnectionReport*>& o);
+};
+WRITE_CLASS_ENCODER(ConnectionReport);
+
+/**
+ * Interface implemented by whoever owns a ConnectionTracker: it supplies
+ * the daemon's rank and persists the tracker on request.
+ */
+class RankProvider {
+ public:
+  /**
+   * Get the rank of the running daemon.
+   * The value is -1 when the rank is unknown/invalid, and
+   * non-negative otherwise.
+   * Do not call get_total_connection_score() while the rank
+   * is unknown.
+   */
+  virtual int get_my_rank() const = 0;
+  /**
+   * Ask the owner to encode the tracker and write it to disk.
+   * Presently this happens on every tenth update.
+   */
+  virtual void persist_connectivity_scores() = 0;
+  virtual ~RankProvider() = default;
+};
+
+/**
+ * Tracks connection reports for this monitor and its peers, versioned by
+ * (epoch, version), and can serialize itself for persistence or for
+ * shipping to peers.
+ */
+class ConnectionTracker {
+ public:
+  /**
+   * Receive a report from a peer and update our internal state
+   * if the peer has newer data.
+   */
+  void receive_peer_report(const ConnectionTracker& o);
+  /**
+   * Bump up the epoch to the specified number.
+   * Validates that it is > current epoch and resets
+   * version to 0; returns false if not.
+   */
+  bool increase_epoch(epoch_t e);
+  /**
+   * Bump up the version within our epoch.
+   * If the new version is a multiple of ten, we also persist it.
+   */
+  void increase_version();
+
+  /**
+   * Report a connection to a peer rank has been considered alive for
+   * the given time duration. We assume the units_alive is <= the time
+   * since the previous reporting call.
+   * (Or, more precisely, we assume that the total amount of time
+   * passed in is less than or equal to the time which has actually
+   * passed -- you can report a 10-second death immediately followed
+   * by reporting 5 seconds of liveness if your metrics are delayed.)
+   */
+  void report_live_connection(int peer_rank, double units_alive);
+  /**
+   * Report a connection to a peer rank has been considered dead for
+   * the given time duration, analogous to that above.
+   */
+  void report_dead_connection(int peer_rank, double units_dead);
+  /**
+   * Set the half-life for dropping connection state
+   * out of the ongoing score.
+   * Whenever you add a new data point:
+   * new_score = old_score * ( 1 - units / (2d)) + (units/(2d))
+   * where units is the units reported alive (for dead, you subtract them).
+   */
+  void set_half_life(double d) {
+    half_life = d;
+  }
+  /**
+   * Get the total connection score of a rank across
+   * all peers, and the count of how many electors think it's alive.
+   * For this summation, if a rank reports a peer as down its score is zero.
+   */
+  void get_total_connection_score(int peer_rank, double *rating,
+                                  int *live_count) const;
+  /**
+   * Check if our ranks are clean and make
+   * sure there are no extra peer_report lingering.
+   * In the future we also want to check the reports
+   * current and history of each peer_report.
+   */
+  bool is_clean(int mon_rank, int monmap_size);
+  /**
+   * Encode this ConnectionTracker. Useful both for storing on disk
+   * and for sending off to peers for decoding and import
+   * with receive_peer_report() above.
+   */
+  void encode(bufferlist &bl) const;
+  void decode(bufferlist::const_iterator& bl);
+  /**
+   * Get a bufferlist containing the ConnectionTracker.
+   * This is like encode() but holds a copy so it
+   * doesn't re-encode on every invocation.
+   */
+  const bufferlist& get_encoded_bl();
+ private:
+  epoch_t epoch;
+  uint64_t version;
+  std::map<int,ConnectionReport> peer_reports;
+  ConnectionReport my_reports;
+  double half_life;
+  RankProvider *owner;
+  int rank;
+  int persist_interval;
+  bufferlist encoding;  // cached output of encode(); empty means "not built"
+  CephContext *cct;
+  int get_my_rank() const { return rank; }
+  ConnectionReport *reports(int p);
+  const ConnectionReport *reports(int p) const;
+
+  // Drop the cached encoding and all reports, then re-seed our own
+  // report with the current rank.
+  void clear_peer_reports() {
+    encoding.clear();
+    peer_reports.clear();
+    my_reports = ConnectionReport();
+    my_reports.rank = rank;
+  }
+
+ public:
+  // Default instance: no owner, unknown rank, 12h half-life. cct is
+  // explicitly nulled so it never holds an indeterminate pointer.
+  ConnectionTracker() : epoch(0), version(0), half_life(12*60*60),
+                        owner(nullptr), rank(-1), persist_interval(10),
+                        cct(nullptr) {
+  }
+  ConnectionTracker(RankProvider *o, int rank, double hl,
+                    int persist_i, CephContext *c) :
+    epoch(0), version(0),
+    half_life(hl), owner(o), rank(rank), persist_interval(persist_i), cct(c) {
+    my_reports.rank = rank;
+  }
+  // Build a tracker by decoding a previously encoded one.
+  ConnectionTracker(const bufferlist& bl, CephContext *c) :
+    epoch(0), version(0),
+    half_life(0), owner(nullptr), rank(-1), persist_interval(10), cct(c)
+  {
+    auto bi = bl.cbegin();
+    decode(bi);
+  }
+  // Copy everything except the cached encoding, which starts out empty
+  // in the new object.
+  ConnectionTracker(const ConnectionTracker& o) :
+    epoch(o.epoch), version(o.version),
+    half_life(o.half_life), owner(o.owner), rank(o.rank),
+    persist_interval(o.persist_interval), cct(o.cct)
+  {
+    peer_reports = o.peer_reports;
+    my_reports = o.my_reports;
+  }
+  void notify_reset() { clear_peer_reports(); }
+  // Update our rank and keep our own report's rank in sync with it.
+  void set_rank(int new_rank) {
+    rank = new_rank;
+    my_reports.rank = rank;
+  }
+
+  void notify_rank_changed(int new_rank);
+  void notify_rank_removed(int rank_removed, int new_rank);
+  friend std::ostream& operator<<(std::ostream& o, const ConnectionTracker& c);
+  friend ConnectionReport *get_connection_reports(ConnectionTracker& ct);
+  friend std::map<int,ConnectionReport> *get_peer_reports(ConnectionTracker& ct);
+  void dump(ceph::Formatter *f) const;
+  static void generate_test_instances(std::list<ConnectionTracker*>& o);
+};
+
+WRITE_CLASS_ENCODER(ConnectionTracker);
diff --git a/src/mon/CreatingPGs.h b/src/mon/CreatingPGs.h
new file mode 100644
index 000000000..808a22b15
--- /dev/null
+++ b/src/mon/CreatingPGs.h
@@ -0,0 +1,246 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <map>
+#include <set>
+#include <vector>
+
+#include "include/encoding.h"
+#include "include/utime.h"
+
+#include "osd/osd_types.h"
+
+/// Bookkeeping for placement groups that are in the process of being
+/// created: the PGs themselves, the per-pool creation queue, and the set
+/// of pools already seen.
+struct creating_pgs_t {
+  epoch_t last_scan_epoch = 0;
+
+  /// Creation record for a single PG.
+  struct pg_create_info {
+    epoch_t create_epoch;
+    utime_t create_stamp;
+
+    // NOTE: pre-octopus instances of this class will have a
+    // zeroed-out history
+    std::vector<int> up;
+    int up_primary = -1;
+    std::vector<int> acting;
+    int acting_primary = -1;
+    pg_history_t history;
+    PastIntervals past_intervals;
+
+    /// Encode for a peer with the given feature bits. Peers without
+    /// SERVER_OCTOPUS get only the legacy (epoch, stamp) pair with no
+    /// version envelope.
+    void encode(ceph::buffer::list& bl, uint64_t features) const {
+      using ceph::encode;
+      if (!HAVE_FEATURE(features, SERVER_OCTOPUS)) {
+        // was pair<epoch_t,utime_t> prior to octopus
+        encode(create_epoch, bl);
+        encode(create_stamp, bl);
+        return;
+      }
+      ENCODE_START(1, 1, bl);
+      encode(create_epoch, bl);
+      encode(create_stamp, bl);
+      encode(up, bl);
+      encode(up_primary, bl);
+      encode(acting, bl);
+      encode(acting_primary, bl);
+      encode(history, bl);
+      encode(past_intervals, bl);
+      ENCODE_FINISH(bl);
+    }
+    /// Decode the legacy envelope-less (epoch, stamp) form.
+    void decode_legacy(ceph::buffer::list::const_iterator& p) {
+      using ceph::decode;
+      decode(create_epoch, p);
+      decode(create_stamp, p);
+    }
+    /// Decode the versioned form written by encode() for octopus+ peers.
+    void decode(ceph::buffer::list::const_iterator& p) {
+      using ceph::decode;
+      DECODE_START(1, p);
+      decode(create_epoch, p);
+      decode(create_stamp, p);
+      decode(up, p);
+      decode(up_primary, p);
+      decode(acting, p);
+      decode(acting_primary, p);
+      decode(history, p);
+      decode(past_intervals, p);
+      DECODE_FINISH(p);
+    }
+    /// Write every field of this record to the formatter.
+    void dump(ceph::Formatter *f) const {
+      f->dump_unsigned("create_epoch", create_epoch);
+      f->dump_stream("create_stamp") << create_stamp;
+      f->open_array_section("up");
+      for (auto& i : up) {
+        f->dump_unsigned("osd", i);
+      }
+      f->close_section();
+      f->dump_int("up_primary", up_primary);
+      f->open_array_section("acting");
+      for (auto& i : acting) {
+        f->dump_unsigned("osd", i);
+      }
+      f->close_section();
+      // report the acting primary itself, not a second copy of up_primary
+      f->dump_int("acting_primary", acting_primary);
+      f->dump_object("pg_history", history);
+      f->dump_object("past_intervals", past_intervals);
+    }
+    static void generate_test_instances(std::list<pg_create_info*>& o) {
+      o.push_back(new pg_create_info);
+      o.back()->create_epoch = 10;
+      o.push_back(new pg_create_info);
+      o.back()->create_epoch = 1;
+      o.back()->create_stamp = utime_t(2, 3);
+      o.back()->up = {1, 2, 3};
+      o.back()->up_primary = 1;
+      o.back()->acting = {1, 2, 3};
+      o.back()->acting_primary = 1;
+    }
+
+    pg_create_info()
+      : create_epoch(0) {}
+    pg_create_info(epoch_t e, utime_t t)
+      : create_epoch(e),
+        create_stamp(t) {
+      // NOTE: we don't initialize the other fields here; see
+      // OSDMonitor::update_pending_pgs()
+    }
+  };
+
+  /// pgs we are currently creating
+  std::map<pg_t, pg_create_info> pgs;
+
+  /// Progress through one pool's PG range; finished once start >= end.
+  struct pool_create_info {
+    epoch_t created;
+    utime_t modified;
+    uint64_t start = 0;
+    uint64_t end = 0;
+    bool done() const {
+      return start >= end;
+    }
+    void encode(ceph::buffer::list& bl) const {
+      using ceph::encode;
+      encode(created, bl);
+      encode(modified, bl);
+      encode(start, bl);
+      encode(end, bl);
+    }
+    void decode(ceph::buffer::list::const_iterator& p) {
+      using ceph::decode;
+      decode(created, p);
+      decode(modified, p);
+      decode(start, p);
+      decode(end, p);
+    }
+  };
+
+  /// queue of pgs we still need to create (poolid -> <created, set of ps>)
+  std::map<int64_t,pool_create_info> queue;
+
+  /// pools that exist in the osdmap for which at least one pg has been created
+  std::set<int64_t> created_pools;
+
+  /// True while the pool still has PGs in `pgs` or an entry in `queue`.
+  bool still_creating_pool(int64_t poolid) {
+    for (auto& i : pgs) {
+      if (i.first.pool() == poolid) {
+        return true;
+      }
+    }
+    if (queue.count(poolid)) {
+      return true;
+    }
+    return false;
+  }
+  /// Queue a pool for creation with PG range [0, pg_num). The pool must
+  /// not already be in created_pools.
+  void create_pool(int64_t poolid, uint32_t pg_num,
+                   epoch_t created, utime_t modified) {
+    ceph_assert(created_pools.count(poolid) == 0);
+    auto& c = queue[poolid];
+    c.created = created;
+    c.modified = modified;
+    c.end = pg_num;
+    created_pools.insert(poolid);
+  }
+  /// Forget everything about a pool; returns how many entries were
+  /// removed from `pgs`.
+  unsigned remove_pool(int64_t removed_pool) {
+    const unsigned total = pgs.size();
+    auto first = pgs.lower_bound(pg_t{0, (uint64_t)removed_pool});
+    auto last = pgs.lower_bound(pg_t{0, (uint64_t)removed_pool + 1});
+    pgs.erase(first, last);
+    created_pools.erase(removed_pool);
+    queue.erase(removed_pool);
+    return total - pgs.size();
+  }
+  /// Encode as struct version 3, or version 2 for peers without
+  /// SERVER_OCTOPUS (whose pg entries use the legacy form).
+  void encode(ceph::buffer::list& bl, uint64_t features) const {
+    unsigned v = 3;
+    if (!HAVE_FEATURE(features, SERVER_OCTOPUS)) {
+      v = 2;
+    }
+    ENCODE_START(v, 1, bl);
+    encode(last_scan_epoch, bl);
+    encode(pgs, bl, features);
+    encode(created_pools, bl);
+    encode(queue, bl);
+    ENCODE_FINISH(bl);
+  }
+  /// Decode any struct version up to 3; versions below 3 carry the
+  /// legacy per-pg encoding and versions below 2 have no queue.
+  void decode(ceph::buffer::list::const_iterator& bl) {
+    DECODE_START(3, bl);
+    decode(last_scan_epoch, bl);
+    if (struct_v >= 3) {
+      decode(pgs, bl);
+    } else {
+      // legacy pg encoding
+      pgs.clear();
+      uint32_t num;
+      decode(num, bl);
+      while (num--) {
+        pg_t pgid;
+        decode(pgid, bl);
+        pgs[pgid].decode_legacy(bl);
+      }
+    }
+    decode(created_pools, bl);
+    if (struct_v >= 2)
+      decode(queue, bl);
+    DECODE_FINISH(bl);
+  }
+  /// Write the scan epoch, the creating PGs, the queue and the created
+  /// pools to the formatter.
+  void dump(ceph::Formatter *f) const {
+    f->dump_unsigned("last_scan_epoch", last_scan_epoch);
+    f->open_array_section("creating_pgs");
+    for (auto& pg : pgs) {
+      f->open_object_section("pg");
+      f->dump_stream("pgid") << pg.first;
+      f->dump_object("pg_create_info", pg.second);
+      f->close_section();
+    }
+    f->close_section();
+    f->open_array_section("queue");
+    for (auto& p : queue) {
+      f->open_object_section("pool");
+      f->dump_unsigned("pool", p.first);
+      f->dump_unsigned("created", p.second.created);
+      f->dump_stream("modified") << p.second.modified;
+      f->dump_unsigned("ps_start", p.second.start);
+      f->dump_unsigned("ps_end", p.second.end);
+      f->close_section();
+    }
+    f->close_section();
+    f->open_array_section("created_pools");
+    for (auto pool : created_pools) {
+      f->dump_unsigned("pool", pool);
+    }
+    f->close_section();
+  }
+  static void generate_test_instances(std::list<creating_pgs_t*>& o) {
+    auto c = new creating_pgs_t;
+    c->last_scan_epoch = 17;
+    c->pgs.emplace(pg_t{42, 2}, pg_create_info(31, utime_t{891, 113}));
+    c->pgs.emplace(pg_t{44, 2}, pg_create_info(31, utime_t{891, 113}));
+    c->created_pools = {0, 1};
+    o.push_back(c);
+    c = new creating_pgs_t;
+    c->last_scan_epoch = 18;
+    c->pgs.emplace(pg_t{42, 3}, pg_create_info(31, utime_t{891, 113}));
+    c->created_pools = {};
+    o.push_back(c);
+  }
+};
+WRITE_CLASS_ENCODER_FEATURES(creating_pgs_t::pg_create_info)
+WRITE_CLASS_ENCODER(creating_pgs_t::pool_create_info)
+WRITE_CLASS_ENCODER_FEATURES(creating_pgs_t)
diff --git a/src/mon/ElectionLogic.cc b/src/mon/ElectionLogic.cc
new file mode 100644
index 000000000..0c1b30c41
--- /dev/null
+++ b/src/mon/ElectionLogic.cc
@@ -0,0 +1,557 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "ElectionLogic.h"
+
+#include "include/ceph_assert.h"
+#include "common/dout.h"
+
+#define dout_subsys ceph_subsys_mon
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, epoch, elector)
+using std::cerr;
+using std::cout;
+using std::dec;
+using std::hex;
+using std::list;
+using std::map;
+using std::make_pair;
+using std::ostream;
+using std::ostringstream;
+using std::pair;
+using std::set;
+using std::setfill;
+using std::string;
+using std::stringstream;
+using std::to_string;
+using std::vector;
+using std::unique_ptr;
+
+using ceph::bufferlist;
+using ceph::decode;
+using ceph::encode;
+using ceph::Formatter;
+using ceph::JSONFormatter;
+using ceph::mono_clock;
+using ceph::mono_time;
+using ceph::timespan_str;
+// Log-line prefix used by the dout macros in this file: the owner's rank
+// followed by the current election epoch.
+static ostream& _prefix(std::ostream *_dout, epoch_t epoch, ElectionOwner* elector) {
+  std::ostream& out = *_dout;
+  out << "paxos." << elector->get_my_rank();
+  out << ").electionLogic(" << epoch << ") ";
+  return out;
+}
+// Load the persisted epoch and normalise it: a missing epoch becomes 1,
+// and an odd epoch is bumped to the next even value and persisted.
+void ElectionLogic::init()
+{
+  epoch = elector->read_persisted_epoch();
+
+  if (epoch == 0) {
+    ldout(cct, 1) << "init, first boot, initializing epoch at 1 " << dendl;
+    epoch = 1;
+    return;
+  }
+
+  const bool was_mid_election = (epoch % 2) != 0;
+  if (!was_mid_election) {
+    ldout(cct, 1) << "init, last seen epoch " << epoch << dendl;
+    return;
+  }
+
+  ldout(cct, 1) << "init, last seen epoch " << epoch
+                << ", mid-election, bumping" << dendl;
+  ++epoch;
+  elector->persist_epoch(epoch);
+}
+
+// Move the election epoch to `e`, which must not be lower than the current
+// epoch. The new value is handed to the peer tracker and to the owner for
+// persistence, our candidacy state is dropped, and the owner is notified.
+void ElectionLogic::bump_epoch(epoch_t e)
+{
+ ldout(cct, 10) << __func__ << " to " << e << dendl;
+ ceph_assert(epoch <= e);
+ epoch = e;
+ // NOTE(review): increase_epoch()'s bool result is ignored here -- confirm
+ // that is intended when e equals the tracker's current epoch.
+ peer_tracker->increase_epoch(e);
+ elector->persist_epoch(epoch);
+ // clear up some state
+ electing_me = false;
+ acked_me.clear();
+ elector->notify_bump_epoch();
+}
+
+// Win an election without any peers: only valid when the paxos set has a
+// single member and we are rank 0. Reloads the persisted epoch and bumps
+// it by one.
+void ElectionLogic::declare_standalone_victory()
+{
+  // ceph_assert, like every other check in this file, so the precondition
+  // is not compiled out when NDEBUG is defined
+  ceph_assert(elector->paxos_size() == 1 && elector->get_my_rank() == 0);
+  init();
+  bump_epoch(epoch+1);
+}
+
+// Forget who we deferred to and whether we were a candidate, refresh the
+// stable tracker snapshot, and drop the saved leader tracker.
+void ElectionLogic::clear_live_election_state()
+{
+ leader_acked = -1;
+ electing_me = false;
+ reset_stable_tracker();
+ leader_peer_tracker.reset();
+}
+
+// Replace stable_peer_tracker with a fresh copy of the current
+// peer_tracker.
+void ElectionLogic::reset_stable_tracker()
+{
+ stable_peer_tracker.reset(new ConnectionTracker(*peer_tracker));
+}
+
+// Connectivity-strategy epoch bump: move to the strictly newer `mepoch`,
+// re-snapshot the stable tracker, and withdraw our ack if our own score
+// now beats the score of the leader we had acked.
+void ElectionLogic::connectivity_bump_epoch_in_election(epoch_t mepoch)
+{
+ ldout(cct, 30) << __func__ << " to " << mepoch << dendl;
+ ceph_assert(mepoch > epoch);
+ bump_epoch(mepoch);
+ reset_stable_tracker();
+ double lscore, my_score;
+ my_score = connectivity_election_score(elector->get_my_rank());
+ // NOTE(review): leader_acked is scored without checking it is >= 0 --
+ // confirm connectivity_election_score() tolerates rank -1.
+ lscore = connectivity_election_score(leader_acked);
+ if (my_score > lscore) {
+ leader_acked = -1;
+ leader_peer_tracker.reset();
+ }
+}
+
+// Begin a new election round with ourselves as candidate. Does nothing
+// (beyond logging) when we are not participating.
+void ElectionLogic::start()
+{
+ if (!participating) {
+ ldout(cct, 0) << "not starting new election -- not participating" << dendl;
+ return;
+ }
+ ldout(cct, 5) << "start -- can i be leader?" << dendl;
+
+ acked_me.clear();
+ init();
+
+ // start by trying to elect me
+ if (epoch % 2 == 0) {
+ bump_epoch(epoch+1); // odd == election cycle
+ } else {
+ // already in an election epoch: no bump, just validate the store
+ elector->validate_store();
+ }
+ // we always ack ourselves
+ acked_me.insert(elector->get_my_rank());
+ clear_live_election_state();
+ // NOTE(review): clear_live_election_state() already calls
+ // reset_stable_tracker(); this second call looks redundant.
+ reset_stable_tracker();
+ electing_me = true;
+
+ // only the CONNECTIVITY strategy attaches the tracker to the proposal;
+ // other strategies send an empty bufferlist
+ bufferlist bl;
+ if (strategy == CONNECTIVITY) {
+ stable_peer_tracker->encode(bl);
+ }
+ elector->propose_to_peers(epoch, bl);
+ elector->_start();
+}
+
+// Acknowledge `who` as leader. Under CLASSIC the target must have a lower
+// rank than ours; under the other strategies it must not be a disallowed
+// leader. Any candidacy of our own is abandoned first.
+void ElectionLogic::defer(int who)
+{
+  if (strategy != CLASSIC) {
+    ldout(cct, 5) << "defer to " << who << ", disallowed_leaders=" << elector->get_disallowed_leaders() << dendl;
+    ceph_assert(!elector->get_disallowed_leaders().count(who));
+  } else {
+    ldout(cct, 5) << "defer to " << who << dendl;
+    ceph_assert(who < elector->get_my_rank());
+  }
+
+  // withdraw from the race if we were running
+  if (electing_me) {
+    acked_me.clear();
+    electing_me = false;
+  }
+
+  // record and send the ack
+  leader_acked = who;
+  elector->_defer_to(who);
+}
+
+// Called when the election period ends: claim victory if we were a
+// candidate and more than half the paxos set acked us; otherwise either
+// start a new election or ask the owner to reset.
+void ElectionLogic::end_election_period()
+{
+  ldout(cct, 5) << "election period ended" << dendl;
+
+  const bool i_won = electing_me &&
+    acked_me.size() > (elector->paxos_size() / 2);
+  if (i_won) {
+    declare_victory();
+    return;
+  }
+
+  // whoever i deferred to didn't declare victory quickly enough.
+  if (!elector->ever_participated()) {
+    elector->reset_election();
+    return;
+  }
+  start();
+}
+
+
+// Finish the election as the winner: record ourselves as winner and as
+// the rank we voted for, clear the live election state, move to the next
+// (even) epoch, and send the victory to the set of ranks that acked us.
+void ElectionLogic::declare_victory()
+{
+ ldout(cct, 5) << "I win! acked_me=" << acked_me << dendl;
+ last_election_winner = elector->get_my_rank();
+ last_voted_for = last_election_winner;
+ clear_live_election_state();
+
+ // take the ack set now: bump_epoch() below clears acked_me
+ set<int> new_quorum;
+ new_quorum.swap(acked_me);
+
+ ceph_assert(epoch % 2 == 1); // election
+ bump_epoch(epoch+1); // is over!
+
+ elector->message_victory(new_quorum);
+}
+
+// Shared epoch handling for the CLASSIC and DISALLOW handlers.
+// Returns true when the proposal is from an older epoch and has been fully
+// handled here (the caller should stop); false when the caller should go
+// on to compare candidates.
+bool ElectionLogic::propose_classic_prefix(int from, epoch_t mepoch)
+{
+  if (mepoch > epoch) {
+    bump_epoch(mepoch);
+    return false;
+  }
+  if (mepoch == epoch) {
+    return false;
+  }
+
+  // got an "old" propose
+  const bool restart_for_newcomer =
+    epoch % 2 == 0 &&                     // in a non-election cycle
+    !elector->is_current_member(from);    // from someone outside the quorum
+  if (restart_for_newcomer) {
+    // a mon just started up, call a new election so they can rejoin!
+    ldout(cct, 5) << " got propose from old epoch, "
+                  << from << " must have just started" << dendl;
+    // we may be active; make sure we reset things in the monitor appropriately.
+    elector->trigger_new_election();
+  } else {
+    ldout(cct, 5) << " ignoring old propose" << dendl;
+  }
+  return true;
+}
+
+// Entry point for an incoming PROPOSE: drop proposals that claim to come
+// from our own rank, otherwise hand off to the handler for the active
+// strategy. `ct` is only forwarded to the connectivity handler.
+void ElectionLogic::receive_propose(int from, epoch_t mepoch,
+                                    const ConnectionTracker *ct)
+{
+  ldout(cct, 20) << __func__ << " from " << from << dendl;
+
+  const bool from_myself = (from == elector->get_my_rank());
+  if (from_myself) {
+    lderr(cct) << "I got a propose from my own rank, hopefully this is startup weirdness,dropping" << dendl;
+    return;
+  }
+
+  switch (strategy) {
+  case CONNECTIVITY:
+    propose_connectivity_handler(from, mepoch, ct);
+    break;
+  case DISALLOW:
+    propose_disallow_handler(from, mepoch);
+    break;
+  case CLASSIC:
+    propose_classic_handler(from, mepoch);
+    break;
+  default:
+    ceph_assert(0 == "how did election strategy become an invalid value?");
+  }
+}
+
+// DISALLOW-strategy handling of a proposal from rank `from`: like the
+// classic lowest-rank-wins rule, except that ranks listed in the owner's
+// disallowed-leaders set can never win.
+void ElectionLogic::propose_disallow_handler(int from, epoch_t mepoch)
+{
+ // stale proposals are fully handled by the shared prefix
+ if (propose_classic_prefix(from, mepoch)) {
+ return;
+ }
+ const set<int>& disallowed_leaders = elector->get_disallowed_leaders();
+ int my_rank = elector->get_my_rank();
+ bool me_disallowed = disallowed_leaders.count(my_rank);
+ bool from_disallowed = disallowed_leaders.count(from);
+ bool my_win = !me_disallowed && // we are allowed to lead
+ (my_rank < from || from_disallowed); // we are a better choice than them
+ bool their_win = !from_disallowed && // they are allowed to lead
+ (my_rank > from || me_disallowed) && // they are a better choice than us
+ (leader_acked < 0 || leader_acked >= from); // they are a better choice than our previously-acked choice
+
+
+ if (my_win) {
+ // i would win over them.
+ if (leader_acked >= 0) { // we already acked someone
+ ceph_assert(leader_acked < from || from_disallowed); // and they still win, of course
+ ldout(cct, 5) << "no, we already acked " << leader_acked << dendl;
+ } else {
+ // wait, i should win!
+ if (!electing_me) {
+ elector->trigger_new_election();
+ }
+ }
+ } else {
+ // they would win over me
+ if (their_win) {
+ defer(from);
+ } else {
+ // ignore them!
+ ldout(cct, 5) << "no, we already acked " << leader_acked << dendl;
+ }
+ }
+}
+
+// CLASSIC-strategy handling of a proposal from rank `from`: the lowest
+// rank wins. We defer to `from` when it outranks us and is at least as
+// good as whoever we already acked; if we outrank it and have acked
+// nobody, we make sure an election with us as candidate is running.
+void ElectionLogic::propose_classic_handler(int from, epoch_t mepoch)
+{
+  if (propose_classic_prefix(from, mepoch)) {
+    return;
+  }
+
+  const bool i_outrank_them = elector->get_my_rank() < from;
+  if (i_outrank_them) {
+    if (leader_acked >= 0) {
+      // we already acked someone, and they still win, of course
+      ceph_assert(leader_acked < from);
+      ldout(cct, 5) << "no, we already acked " << leader_acked << dendl;
+    } else if (!electing_me) {
+      // wait, i should win!
+      elector->trigger_new_election();
+    }
+    return;
+  }
+
+  // they would win over me: defer unless we already acked someone better
+  const bool nobody_acked = leader_acked < 0;
+  const bool at_least_as_good_as_acked = leader_acked >= from;
+  if (nobody_acked || at_least_as_good_as_acked) {
+    defer(from);
+  } else {
+    // ignore them!
+    ldout(cct, 5) << "no, we already acked " << leader_acked << dendl;
+  }
+}
+
+// Score `rank` for the connectivity strategy. Disallowed leaders always
+// score -1; otherwise the total connection score comes from the stable
+// tracker snapshot when one exists, else from the live peer tracker.
+double ElectionLogic::connectivity_election_score(int rank)
+{
+  ldout(cct, 30) << __func__ << " of " << rank << dendl;
+
+  const bool disallowed = elector->get_disallowed_leaders().count(rank);
+  if (disallowed) {
+    return -1;
+  }
+
+  double total;
+  int live_count;  // filled in by the tracker but not used here
+  if (!stable_peer_tracker) {
+    ldout(cct, 30) << "stable_peer_tracker does not exists, using peer_tracker ..." << dendl;
+    peer_tracker->get_total_connection_score(rank, &total, &live_count);
+  } else {
+    ldout(cct, 30) << "stable_peer_tracker exists so using that ..." << dendl;
+    stable_peer_tracker->get_total_connection_score(rank, &total, &live_count);
+  }
+  return total;
+}
+
+// CONNECTIVITY-strategy handling of a proposal from rank `from`.
+// Steps, in order:
+//  1. outside an election, a non-leader may drop proposals from
+//     out-of-quorum peers that would not change its vote;
+//  2. reconcile epochs (bump on newer, handle-and-return on older);
+//  3. compare connectivity scores of ourselves, `from`, and any leader we
+//     already acked, then either stand, defer, restart, or ignore.
+// `ct` is the proposer's tracker; a copy is kept as leader_peer_tracker
+// whenever we defer to `from`.
+void ElectionLogic::propose_connectivity_handler(int from, epoch_t mepoch,
+ const ConnectionTracker *ct)
+{
+ ldout(cct, 10) << __func__ << " from " << from << " mepoch: "
+ << mepoch << " epoch: " << epoch << dendl;
+ ldout(cct, 30) << "last_election_winner: " << last_election_winner << dendl;
+ if ((epoch % 2 == 0) &&
+ last_election_winner != elector->get_my_rank() &&
+ !elector->is_current_member(from)) {
+ // To prevent election flapping, peons ignore proposals from out-of-quorum
+ // peers unless their vote would materially change from the last election
+ ldout(cct, 30) << "Lets see if this out-of-quorum peer is worth it " << dendl;
+ int best_scorer = 0;
+ double best_score = 0;
+ double last_voted_for_score = 0;
+ ldout(cct, 30) << "elector->paxos_size(): " << elector->paxos_size() << dendl;
+ // find the highest-scoring rank and the score of whoever we last voted for
+ for (unsigned i = 0; i < elector->paxos_size(); ++i) {
+ double score = connectivity_election_score(i);
+ if (score > best_score) {
+ best_scorer = i;
+ best_score = score;
+ }
+ if (last_voted_for >= 0 && i == static_cast<unsigned>(last_voted_for)) {
+ last_voted_for_score = score;
+ }
+ }
+ ldout(cct, 30) << "best_scorer: " << best_scorer << " best_score: " << best_score
+ << " last_voted_for: " << last_voted_for << " last_voted_for_score: "
+ << last_voted_for_score << dendl;
+ if (best_scorer == last_voted_for ||
+ (best_score - last_voted_for_score < ignore_propose_margin)) {
+ // drop this message; it won't change our vote so we defer to leader
+ ldout(cct, 30) << "drop this message; it won't change our vote so we defer to leader " << dendl;
+ return;
+ }
+ }
+ if (mepoch > epoch) {
+ ldout(cct, 20) << "mepoch > epoch" << dendl;
+ connectivity_bump_epoch_in_election(mepoch);
+ } else if (mepoch < epoch) {
+ // got an "old" propose,
+ if (epoch % 2 == 0 && // in a non-election cycle
+ !elector->is_current_member(from)) { // from someone outside the quorum
+ // a mon just started up, call a new election so they can rejoin!
+ ldout(cct, 5) << " got propose from old epoch, "
+ << from << " must have just started" << dendl;
+ ldout(cct, 10) << "triggering new election" << dendl;
+ // we may be active; make sure we reset things in the monitor appropriately.
+ elector->trigger_new_election();
+ } else {
+ ldout(cct, 5) << " ignoring old propose" << dendl;
+ }
+ return;
+ }
+
+ int my_rank = elector->get_my_rank();
+ double my_score = connectivity_election_score(my_rank);
+ double from_score = connectivity_election_score(from);
+ // stays -1 when we have not acked anyone
+ double leader_score = -1;
+ if (leader_acked >= 0) {
+ leader_score = connectivity_election_score(leader_acked);
+ }
+
+ ldout(cct, 20) << "propose from rank=" << from << ", tracker: "
+ << (stable_peer_tracker ? *stable_peer_tracker : *peer_tracker) << dendl;
+
+ ldout(cct, 10) << "propose from rank=" << from << ",from_score=" << from_score
+ << "; my score=" << my_score
+ << "; currently acked " << leader_acked
+ << ",leader_score=" << leader_score
+ << ",disallowed_leaders=" << elector->get_disallowed_leaders() << dendl;
+
+ bool my_win = (my_score >= 0) && // My score is non-negative (disallowed ranks score -1); I am allowed to lead
+ ((my_rank < from && my_score >= from_score) || // I have lower rank and at least their score, or
+ (my_score > from_score)); // my score is higher
+
+ bool their_win = (from_score >= 0) && // Their score is non-negative; they're allowed to lead, AND
+ ((from < my_rank && from_score >= my_score) || // Either they have lower rank and at least my score, or
+ (from_score > my_score)) && // their score is higher, AND
+ ((from <= leader_acked && from_score >= leader_score) || // same conditions compared to leader, or IS leader
+ (from_score > leader_score));
+
+ if (my_win) {
+ ldout(cct, 10) << " conditionally I win" << dendl;
+ // i would win over them.
+ if (leader_acked >= 0) { // we already acked someone
+ ceph_assert(leader_score >= from_score); // and they still win, of course
+ ldout(cct, 5) << "no, we already acked " << leader_acked << dendl;
+ } else {
+ // wait, i should win!
+ if (!electing_me) {
+ ldout(cct, 10) << " wait, i should win! triggering new election ..." << dendl;
+ elector->trigger_new_election();
+ }
+ }
+ } else {
+ ldout(cct, 10) << " conditionally they win" << dendl;
+ // they would win over me
+ if (their_win || from == leader_acked) {
+ if (leader_acked >= 0 && from != leader_acked) {
+ // we have to make sure our acked leader will ALSO defer to them, or else
+ // we can't, to maintain guarantees!
+ ldout(cct, 10) << " make sure acked leader defer to: " << from << dendl;
+ // score both candidates from the acked leader's point of view
+ // NOTE(review): assumes leader_peer_tracker is set whenever
+ // leader_acked >= 0 -- confirm, since defer() can also be reached
+ // without this handler storing a tracker.
+ double leader_from_score;
+ int leader_from_liveness;
+ leader_peer_tracker->
+ get_total_connection_score(from, &leader_from_score,
+ &leader_from_liveness);
+ double leader_leader_score;
+ int leader_leader_liveness;
+ leader_peer_tracker->
+ get_total_connection_score(leader_acked, &leader_leader_score,
+ &leader_leader_liveness);
+ if ((from < leader_acked && leader_from_score >= leader_leader_score) ||
+ (leader_from_score > leader_leader_score)) {
+ ldout(cct, 10) << "defering to " << from << dendl;
+ defer(from);
+ // NOTE(review): dereferences ct -- assumes callers always pass a
+ // tracker for CONNECTIVITY proposals; TODO confirm.
+ leader_peer_tracker.reset(new ConnectionTracker(*ct));
+ } else { // we can't defer to them *this* round even though they should win...
+ // re-check using our live (not snapshotted) tracker
+ double cur_leader_score, cur_from_score;
+ int cur_leader_live, cur_from_live;
+ peer_tracker->get_total_connection_score(leader_acked, &cur_leader_score, &cur_leader_live);
+ peer_tracker->get_total_connection_score(from, &cur_from_score, &cur_from_live);
+ if ((from < leader_acked && cur_from_score >= cur_leader_score) ||
+ (cur_from_score > cur_leader_score)) {
+ ldout(cct, 5) << "Bumping epoch and starting new election; acked "
+ << leader_acked << " should defer to " << from
+ << " but there is score disagreement!" << dendl;
+ bump_epoch(epoch+1);
+ start();
+ } else {
+ ldout(cct, 5) << "no, we already acked " << leader_acked
+ << " and it won't defer to " << from
+ << " despite better round scores" << dendl;
+ }
+ }
+ } else {
+ // nobody acked yet, or this is the leader we already acked
+ ldout(cct, 10) << "defering to " << from << dendl;
+ defer(from);
+ leader_peer_tracker.reset(new ConnectionTracker(*ct));
+ }
+ } else {
+ // ignore them!
+ ldout(cct, 5) << "no, we already acked " << leader_acked << " with score >=" << from_score << dendl;
+ }
+ }
+}
+
+// Handle an ACK from rank `from`. The sender must be in an election (odd)
+// epoch. A newer epoch forces a bump and restart; otherwise, while we are
+// a candidate, the ack is recorded and victory is declared as soon as
+// every member of the paxos set has acked.
+void ElectionLogic::receive_ack(int from, epoch_t from_epoch)
+{
+  ceph_assert(from_epoch % 2 == 1); // sender in an election epoch
+
+  if (from_epoch > epoch) {
+    ldout(cct, 5) << "woah, that's a newer epoch, i must have rebooted. bumping and re-starting!" << dendl;
+    bump_epoch(from_epoch);
+    start();
+    return;
+  }
+
+  if (!electing_me) {
+    // ignore, i'm deferring already.
+    ceph_assert(leader_acked >= 0);
+    return;
+  }
+
+  acked_me.insert(from);
+  // is that _everyone_? if yes, shortcut to election finish
+  const bool everyone_acked = (acked_me.size() == elector->paxos_size());
+  if (everyone_acked) {
+    declare_victory();
+  }
+}
+
+// Decide whether a victory claim from rank `from` is plausible under the
+// active strategy:
+//  - CLASSIC: the winner must have a lower rank than ours;
+//  - DISALLOW: as CLASSIC, or we are a disallowed leader ourselves;
+//  - CONNECTIVITY: the winner's score must be at least ours.
+bool ElectionLogic::victory_makes_sense(int from)
+{
+  switch (strategy) {
+  case CLASSIC:
+    return from < elector->get_my_rank();
+  case DISALLOW:
+    return (from < elector->get_my_rank()) ||
+      elector->get_disallowed_leaders().count(elector->get_my_rank());
+  case CONNECTIVITY: {
+    const double my_score = connectivity_election_score(elector->get_my_rank());
+    const double leader_score = connectivity_election_score(from);
+    ldout(cct, 5) << "victory from " << from << " makes sense? lscore:"
+                  << leader_score
+                  << "; my score:" << my_score << dendl;
+    return leader_score >= my_score;
+  }
+  default:
+    ceph_assert(0 == "how did you get a nonsense election strategy assigned?");
+  }
+  return false;
+}
+
+// Handle a victory claim from rank `from` at `from_epoch`.
+// Returns true when the claim is accepted (our epoch is moved to
+// from_epoch); returns false after bumping to from_epoch and starting a
+// new election, which happens when the claim is implausible or when
+// from_epoch is not exactly one past our epoch.
+bool ElectionLogic::receive_victory_claim(int from, epoch_t from_epoch)
+{
+ // judge the claim before the live election state is cleared below
+ bool election_okay = victory_makes_sense(from);
+
+ last_election_winner = from;
+ last_voted_for = leader_acked;
+ clear_live_election_state();
+
+ if (!election_okay) {
+ // only the connectivity strategy is expected to produce such a claim
+ ceph_assert(strategy == CONNECTIVITY);
+ ldout(cct, 1) << "I should have been elected over this leader; bumping and restarting!" << dendl;
+ bump_epoch(from_epoch);
+ start();
+ return false;
+ }
+
+ // i should have seen this election if i'm getting the victory.
+ if (from_epoch != epoch + 1) {
+ ldout(cct, 5) << "woah, that's a funny epoch, i must have rebooted. bumping and re-starting!" << dendl;
+ bump_epoch(from_epoch);
+ start();
+ return false;
+ }
+
+ bump_epoch(from_epoch);
+
+ // they win
+ return true;
+}
diff --git a/src/mon/ElectionLogic.h b/src/mon/ElectionLogic.h
new file mode 100644
index 000000000..e2f2db82a
--- /dev/null
+++ b/src/mon/ElectionLogic.h
@@ -0,0 +1,460 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef CEPH_ELECTIONLOGIC_H
+#define CEPH_ELECTIONLOGIC_H
+
+#include <map>
+#include <set>
+#include "include/types.h"
+#include "ConnectionTracker.h"
+
+class ElectionOwner { // callback interface ElectionLogic uses for persistence, messaging and cluster queries
+public:
+ /**
+ * Write down the given epoch in persistent storage, such that it
+ * can later be retrieved by read_persisted_epoch() even across process
+ * or machine restarts.
+ *
+ * @param e The epoch to write
+ */
+ virtual void persist_epoch(epoch_t e) = 0;
+ /**
+ * Retrieve the most recently persisted epoch.
+ *
+ * @returns The latest epoch passed to persist_epoch()
+ */
+ virtual epoch_t read_persisted_epoch() const = 0;
+ /**
+ * Validate that the persistent store is working by committing
+ * to it. (There is no interface for retrieving the value; this
+ * tests local functionality before doing things like triggering
+ * elections to try and join a quorum.)
+ */
+ virtual void validate_store() = 0;
+ /**
+ * Notify the ElectionOwner that ElectionLogic has increased its
+ * election epoch. This resets an election (either on local loss or victory,
+ * or when trying a new election round) and the ElectionOwner
+ * should reset any tracking of its own to match. (The ElectionLogic
+ * will further trigger sending election messages if that is
+ * appropriate.)
+ */
+ virtual void notify_bump_epoch() = 0;
+ /**
+ * Notify the ElectionOwner that we must start a new election.
+ */
+ virtual void trigger_new_election() = 0;
+ /**
+ * Retrieve this Paxos instance's rank.
+ *
+ * @returns The rank of this instance
+ */
+ virtual int get_my_rank() const = 0;
+ /**
+ * Send a PROPOSE message to all our peers. This happens when
+ * we have started a new election (which may mean attempting to
+ * override a current one).
+ *
+ * @param e The election epoch of our proposal.
+ * @param bl A bufferlist containing data the logic wishes to share
+ * with the peers receiving the proposal
+ */
+ virtual void propose_to_peers(epoch_t e, bufferlist& bl) = 0;
+ /**
+ * The election has failed and we aren't sure what the state of the
+ * quorum is, so reset the entire system as if from scratch.
+ */
+ virtual void reset_election() = 0;
+ /**
+ * Ask the ElectionOwner if we-the-Monitor have ever participated in the
+ * quorum (including across process restarts!).
+ *
+ * @returns true if we have participated, false otherwise
+ */
+ virtual bool ever_participated() const = 0;
+ /**
+ * Ask the ElectionOwner for the size of the Paxos set. This includes
+ * those monitors which may not be in the current quorum!
+ * The value returned by this function can change between elections,
+ * but not during them. (In practical terms, it can be updated
+ * by making a paxos commit, but not by injecting values while
+ * an election is ongoing.)
+ *
+ * @returns The number of members in the Paxos set
+ */
+ virtual unsigned paxos_size() const = 0;
+ /**
+ * Retrieve a set of ranks which are not allowed to become the leader.
+ * Like paxos_size(), this set can change between elections, but not
+ * during them.
+ *
+ * @returns The set of ranks barred from becoming leader
+ */
+ virtual const std::set<int>& get_disallowed_leaders() const = 0;
+ /**
+ * Tell the ElectionOwner we have started a new election.
+ *
+ * The ElectionOwner is responsible for timing out the election (by invoking
+ * end_election_period()) if it takes too long (as defined by the ElectionOwner).
+ * This function is the opportunity to set that up and to clean up any other
+ * external election state it may be maintaining.
+ */
+ virtual void _start() = 0;
+ /**
+ * Tell the ElectionOwner to defer to the identified peer. Tell that peer
+ * we have deferred to it.
+ *
+ * @param who The rank we are deferring to
+ * @post we sent an ack message to @p who
+ */
+ virtual void _defer_to(int who) = 0;
+ /**
+ * We have won an election, so have the ElectionOwner message that to
+ * our new quorum!
+ *
+ * @param quorum The ranks of our peers which deferred to us and
+ * must be told of our victory
+ */
+ virtual void message_victory(const std::set<int>& quorum) = 0;
+ /**
+ * Query the ElectionOwner about whether a given rank is in the
+ * currently active quorum.
+ * @param rank the Paxos rank whose status we are checking
+ * @returns true if the rank is in our current quorum, false otherwise.
+ */
+ virtual bool is_current_member(int rank) const = 0;
+ virtual ~ElectionOwner() {} // virtual so implementations can be destroyed through this interface
+};
+
+/**
+ * This class maintains local state for running an election
+ * between Paxos instances. It receives input requests
+ * and calls back out to its ElectionOwner to do persistence
+ * and message other entities.
+ */
+
+class ElectionLogic {
+ ElectionOwner *elector; // callbacks used for persistence, messaging and cluster queries
+ ConnectionTracker *peer_tracker; // tracker pointer handed in through the constructor
+
+ CephContext *cct;
+ /**
+ * Latest epoch we've seen.
+ *
+ * @remarks if its value is odd, we're electing; if it's even, then we're
+ * stable.
+ */
+ epoch_t epoch = 0;
+ /**
+ * The last rank which won an election we participated in
+ */
+ int last_election_winner = -1;
+ /**
+ * Only used in the connectivity handler.
+ * The rank we voted for in the last election we voted in.
+ */
+ int last_voted_for = -1;
+ double ignore_propose_margin = 0.0001; // replaced by the constructor's ipm argument
+ /**
+ * Only used in the connectivity handler.
+ * Points at a stable copy of the peer_tracker we use to keep scores
+ * throughout an election period.
+ */
+ std::unique_ptr<ConnectionTracker> stable_peer_tracker;
+ std::unique_ptr<ConnectionTracker> leader_peer_tracker;
+ /**
+ * Indicates who we have acked (the constructor initializes it to -1).
+ */
+ int leader_acked;
+
+public:
+ enum election_strategy {
+ // Keep in sync with MonMap.h!
+ CLASSIC = 1, // the original rank-based one
+ DISALLOW = 2, // disallow a set from being leader
+ CONNECTIVITY = 3 // includes DISALLOW, extends to prefer stronger connections
+ };
+ election_strategy strategy;
+
+ /**
+ * Indicates if we are participating in the quorum.
+ *
+ * @remarks By default, we are created as participating. We may stop
+ * participating if something explicitly sets our value
+ * false, though. If that happens, it will
+ * have to set participating=true and invoke start() for us to resume
+ * participating in the quorum.
+ */
+ bool participating;
+ /**
+ * Indicates if we are the ones being elected.
+ *
+ * We always attempt to be the one being elected if we are the ones starting
+ * the election. If we are not the ones that started it, we will only attempt
+ * to be elected if we think we might have a chance (i.e., we outrank the
+ * other proposer; see defer() for what "outranks" means).
+ */
+ bool electing_me;
+ /**
+ * Set containing all those that acked our proposal to become the Leader.
+ *
+ * If we are acked by ElectionOwner::paxos_size() peers, we will declare
+ * victory.
+ */
+ std::set<int> acked_me;
+
+ ElectionLogic(ElectionOwner *e, election_strategy es, ConnectionTracker *t, // owner callbacks, initial strategy, peer tracker
+ double ipm, // stored as ignore_propose_margin
+ CephContext *c) : elector(e), peer_tracker(t), cct(c),
+ last_election_winner(-1), last_voted_for(-1),
+ ignore_propose_margin(ipm),
+ stable_peer_tracker(),
+ leader_peer_tracker(),
+ leader_acked(-1),
+ strategy(es),
+ participating(true),
+ electing_me(false) {}
+ /**
+ * Set the election strategy to use. If this is not consistent across the
+ * electing cluster, you're going to have a bad time.
+ * The initial strategy is the one passed to the constructor.
+ * NOTE(review): this comment used to say "Defaults to CLASSIC", but no
+ * default is visible in this class -- confirm against the callers.
+ */
+ void set_election_strategy(election_strategy es) {
+ strategy = es;
+ }
+ /**
+ * If there are no other peers in this Paxos group, ElectionOwner
+ * can simply declare victory and we will make it so.
+ *
+ * @pre paxos_size() is 1
+ * @pre get_my_rank is 0
+ */
+ void declare_standalone_victory();
+ /**
+ * Start a new election by proposing ourselves as the new Leader.
+ *
+ * Basically, send propose messages to all the peers.
+ *
+ * @pre participating is true
+ * @post epoch is an odd value
+ * @post electing_me is true
+ * @post We have invoked propose_to_peers() on our ElectionOwner
+ * @post We have invoked _start() on our ElectionOwner
+ */
+ void start();
+ /**
+ * ElectionOwner has decided the election has taken too long and expired.
+ *
+ * This will happen when no one declared victory or started a new election
+ * during the allowed time span.
+ *
+ * When the election expires, we will check if we were the ones who won, and
+ * if so we will declare victory. If that is not the case, then we assume
+ * that the one we deferred to didn't declare victory quickly enough (in fact,
+ * as far as we know, it may even be dead); so, just propose ourselves as the
+ * Leader.
+ */
+ void end_election_period();
+ /**
+ * Handle a proposal from some other node asking to become
+ * the Leader.
+ *
+ * If the message appears to be old (i.e., its epoch is lower than our epoch),
+ * then we may take one of two actions:
+ *
+ * @li Ignore it because it's nothing more than an old proposal
+ * @li Start new elections if we verify that it was sent by a monitor from
+ * outside the quorum; given its old state, it's fair to assume it just
+ * started, so we should start new elections so it may rejoin. (Some
+ * handlers may choose to ignore even these, if they think it's flapping.)
+ *
+ * We pass the propose off to a propose_*_handler function based
+ * on the election strategy we're using.
+ * Only the Connectivity strategy cares about the ConnectionTracker; it should
+ * be NULL if other strategies are in use. We never take ownership of
+ * the pointer: it is taken as a pointer-to-const and stays the caller's
+ * to delete.
+ *
+ * @pre Message epoch is from the current or a newer epoch
+ * @param from The rank proposing itself as leader
+ * @param mepoch The epoch of the proposal
+ * @param ct Any incoming ConnectionTracker data sent with the message.
+ * Callers are responsible for deleting this -- we will copy it if we want
+ * to keep the data.
+ */
+ void receive_propose(int from, epoch_t mepoch, const ConnectionTracker *ct);
+ /**
+ * Handle a message from some other participant Acking us as the Leader.
+ *
+ * When we receive such a message, one of three things may be happening:
+ * @li We received a message with a newer epoch, which means we must have
+ * somehow lost track of what was going on (maybe we rebooted), thus we
+ * will start a new election
+ * @li We consider ourselves in the run for the Leader (i.e., @p electing_me
+ * is true), and we are actually being Acked by someone; thus simply add
+ * the one acking us to the @p acked_me set. If we do now have acks from
+ * all the participants, then we can declare victory
+ * @li We already deferred the election to somebody else, so we will just
+ * ignore this message
+ *
+ * @pre Message epoch is from the current or a newer epoch
+ * @post Election is on-going if we deferred to somebody else
+ * @post Election is on-going if we are still waiting for further Acks
+ * @post Election is not on-going if we are victorious
+ * @post Election is not on-going if we must start a new one
+ *
+ * @param from The rank which acked us
+ * @param from_epoch The election epoch the ack belongs to
+ */
+ void receive_ack(int from, epoch_t from_epoch);
+ /**
+ * Handle a message from some other participant declaring Victory.
+ *
+ * We just got a message from someone declaring themselves Victorious, thus
+ * the new Leader. We record the claimant as the last election winner and
+ * clear our live election state, and then:
+ *
+ * @li If victory_makes_sense() rejects the claim, we bump our epoch to
+ * @p from_epoch and invoke start() to run a new election.
+ * @li If the message's epoch happens to be different from our epoch+1,
+ * then it means we lost track of something; we likewise bump our epoch
+ * and invoke start().
+ * @li Otherwise we simply update our epoch to the one in the message and
+ * accept the claim.
+ *
+ * @pre from_epoch is the current or a newer epoch
+ * @post Election is not on-going if we accepted the claim
+ * @post Updated @p epoch
+ * @post We are a peon in a new quorum if we lost the election
+ *
+ * @param from The victory-claiming rank
+ * @param from_epoch The election epoch in which they claim victory
+ * @returns true if we accepted the claim, false if we restarted the
+ * election instead
+ */
+ bool receive_victory_claim(int from, epoch_t from_epoch);
+ /**
+ * Obtain our epoch
+ *
+ * @returns Our current epoch number
+ */
+ epoch_t get_epoch() const { return epoch; }
+ int get_election_winner() { return last_election_winner; } // rank stored in last_election_winner (-1 until one is recorded)
+
+private:
+ /**
+ * Initiate the ElectionLogic class.
+ *
+ * Basically, we will simply read whatever epoch value we have in our stable
+ * storage, or consider it to be 1 if none is read.
+ *
+ * @post @p epoch is set to 1 or higher.
+ */
+ void init();
+ /**
+ * Update our epoch.
+ *
+ * If we come across a higher epoch, we simply update ours, also making
+ * sure we are no longer being elected (even though we could have been,
+ * we no longer are since we no longer are on that old epoch).
+ *
+ * @pre Our epoch is not larger than @p e
+ * @post Our epoch equals @p e
+ *
+ * @param e Epoch to which we will update our epoch
+ */
+ void bump_epoch(epoch_t e);
+ /**
+ * If the incoming proposal is newer, bump our own epoch; if
+ * it comes from an out-of-quorum peer, trigger a new election.
+ * @returns true if you should drop this proposal, false otherwise.
+ */
+ bool propose_classic_prefix(int from, epoch_t mepoch);
+ /**
+ * Handle a proposal from another rank using the classic strategy.
+ * We will take one of the following actions:
+ *
+ * @li Ignore it because we already acked another node with higher rank
+ * @li Ignore it and start a new election because we outrank it
+ * @li Defer to it because it outranks us and the node we previously
+ * acked, if any
+ */
+ void propose_classic_handler(int from, epoch_t mepoch);
+ /**
+ * Handle a proposal from another rank using our disallow strategy.
+ * This is the same as the classic strategy except we also disallow
+ * certain ranks from becoming the leader.
+ */
+ void propose_disallow_handler(int from, epoch_t mepoch);
+ /**
+ * Handle a proposal from another rank using the connectivity strategy.
+ * We will choose to defer or not based on the ordered criteria:
+ *
+ * @li Whether the other monitor (or ourself) is on the disallow list
+ * @li Whether the other monitor or ourself has the most connectivity to peers
+ * @li Whether the other monitor or ourself has the lower rank
+ */
+ void propose_connectivity_handler(int from, epoch_t mepoch, const ConnectionTracker *ct);
+ /**
+ * Helper function for connectivity handler. Combines the disallowed list
+ * with ConnectionTracker scores.
+ *
+ * @param rank The rank to score
+ * @returns The election score for @p rank
+ */
+ double connectivity_election_score(int rank);
+ /**
+ * Defer the current election to some other monitor.
+ *
+ * This means that we will ack some other monitor and drop out from the run
+ * to become the Leader. We will only defer an election if the monitor we
+ * are deferring to outranks us.
+ *
+ * @pre @p who outranks us (i.e., who < our rank)
+ * @pre @p who outranks any other monitor we have deferred to in the past
+ * @post electing_me is false
+ * @post leader_acked equals @p who
+ * @post we triggered ElectionOwner's _defer_to() on @p who
+ *
+ * @param who Some other monitor's numeric identifier.
+ */
+ void defer(int who);
+ /**
+ * Declare Victory.
+ *
+ * We won. Or at least we believe we won, but for all intents and purposes
+ * that does not matter. What matters is that we Won.
+ *
+ * That said, we must now bump our epoch to reflect that the election is over
+ * and then we must let everybody in the quorum know we are their brand new
+ * Leader.
+ *
+ * Actually, the quorum will be now defined as the group of monitors that
+ * acked us during the election process.
+ *
+ * @pre Election is on-going
+ * @pre electing_me is true
+ * @post electing_me is false
+ * @post epoch is bumped up into an even value
+ * @post Election is not on-going
+ * @post We have a quorum, composed of the monitors that acked us
+ * @post We invoked message_victory() on the ElectionOwner
+ */
+ void declare_victory();
+ /**
+ * This is just a helper function to validate that the victory claim we
+ * get from another rank makes any sense.
+ *
+ * @param from The rank claiming victory
+ * @returns true if the claim is plausible under the current strategy
+ */
+ bool victory_makes_sense(int from);
+ /**
+ * Reset some data members which we only care about while we are in an election
+ * or need to be set consistently during stable states.
+ */
+ void clear_live_election_state();
+ void reset_stable_tracker();
+ /**
+ * Only for the connectivity handler: bump the epoch
+ * when we get a message from a newer one and clear
+ * out leader and stable tracker
+ * data so that we can switch our allegiance.
+ */
+ void connectivity_bump_epoch_in_election(epoch_t mepoch);
+};
+
+#endif
diff --git a/src/mon/Elector.cc b/src/mon/Elector.cc
new file mode 100644
index 000000000..5ad30ff22
--- /dev/null
+++ b/src/mon/Elector.cc
@@ -0,0 +1,807 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "Elector.h"
+#include "Monitor.h"
+
+#include "common/Timer.h"
+#include "MonitorDBStore.h"
+#include "messages/MMonElection.h"
+#include "messages/MMonPing.h"
+
+#include "common/config.h"
+#include "include/ceph_assert.h"
+
+#define dout_subsys ceph_subsys_mon
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, mon, get_epoch())
+using std::cerr;
+using std::cout;
+using std::dec;
+using std::hex;
+using std::list;
+using std::map;
+using std::make_pair;
+using std::ostream;
+using std::ostringstream;
+using std::pair;
+using std::set;
+using std::setfill;
+using std::string;
+using std::stringstream;
+using std::to_string;
+using std::vector;
+using std::unique_ptr;
+
+using ceph::bufferlist;
+using ceph::decode;
+using ceph::encode;
+using ceph::Formatter;
+using ceph::JSONFormatter;
+using ceph::mono_clock;
+using ceph::mono_time;
+using ceph::timespan_str;
+static ostream& _prefix(std::ostream *_dout, Monitor *mon, epoch_t epoch) {
+  // Log-line prefix: "mon.<name>@<rank>(<state>).elector(<epoch>) "
+  ostream& out = *_dout;
+  out << "mon." << mon->name << "@" << mon->rank;
+  out << "(" << mon->get_state_name();
+  out << ").elector(" << epoch << ") ";
+  return out;
+}
+
+Elector::Elector(Monitor *m, int strategy)
+  : logic(this,
+          static_cast<ElectionLogic::election_strategy>(strategy),
+          &peer_tracker,
+          m->cct->_conf.get_val<double>("mon_elector_ignore_propose_margin"),
+          m->cct),
+    peer_tracker(this, m->rank,
+                 m->cct->_conf.get_val<uint64_t>("mon_con_tracker_score_halflife"),
+                 m->cct->_conf.get_val<uint64_t>("mon_con_tracker_persist_interval"),
+                 m->cct),
+    ping_timeout(m->cct->_conf.get_val<double>("mon_elector_ping_timeout")),
+    PING_DIVISOR(m->cct->_conf.get_val<uint64_t>("mon_elector_ping_divisor")),
+    mon(m), elector(this)
+{
+  // Load whatever is stored under the "connectivity_scores" key (the key
+  // persist_epoch() and persist_connectivity_scores() write) into the
+  // peer tracker; an empty result leaves the tracker untouched.
+  bufferlist stored_scores;
+  mon->store->get(Monitor::MONITOR_NAME, "connectivity_scores", stored_scores);
+  if (stored_scores.length() == 0) {
+    return;
+  }
+  auto cursor = stored_scores.cbegin();
+  peer_tracker.decode(cursor);
+}
+
+
+void Elector::persist_epoch(epoch_t e)
+{
+  // Write the election epoch and the current encoded connectivity scores
+  // in a single store transaction.
+  auto txn = std::make_shared<MonitorDBStore::Transaction>();
+  txn->put(Monitor::MONITOR_NAME, "election_epoch", e);
+  txn->put(Monitor::MONITOR_NAME, "connectivity_scores",
+           peer_tracker.get_encoded_bl());
+  mon->store->apply_transaction(txn);
+}
+
+void Elector::persist_connectivity_scores()
+{
+  // Write only the encoded connectivity scores; the election epoch key
+  // is left alone.
+  dout(20) << __func__ << dendl;
+  auto txn = std::make_shared<MonitorDBStore::Transaction>();
+  txn->put(Monitor::MONITOR_NAME, "connectivity_scores",
+           peer_tracker.get_encoded_bl());
+  mon->store->apply_transaction(txn);
+}
+
+epoch_t Elector::read_persisted_epoch() const
+{
+  // Counterpart of persist_epoch(): read back the "election_epoch" key.
+  const epoch_t stored_epoch =
+    mon->store->get(Monitor::MONITOR_NAME, "election_epoch");
+  return stored_epoch;
+}
+
+void Elector::validate_store()
+{
+  // Prove the store accepts writes by committing a throwaway random value;
+  // a failed commit trips the assert.
+  auto probe = std::make_shared<MonitorDBStore::Transaction>();
+  probe->put(Monitor::MONITOR_NAME, "election_writeable_test", rand());
+  const int rc = mon->store->apply_transaction(probe);
+  ceph_assert(rc >= 0);
+}
+
+bool Elector::is_current_member(int rank) const
+{
+  // A rank is a current member when the monitor's quorum set contains it.
+  const bool in_quorum = mon->quorum.count(rank) > 0;
+  return in_quorum;
+}
+
+void Elector::trigger_new_election()
+{
+  // ElectionOwner hook: the logic wants a new election, so hand the
+  // request to the Monitor.
+  Monitor *const monitor = mon;
+  monitor->start_election();
+}
+
+int Elector::get_my_rank() const
+{
+  // Our rank is whatever the Monitor currently reports.
+  const int my_rank = mon->rank;
+  return my_rank;
+}
+
+void Elector::reset_election()
+{
+  // ElectionOwner hook: start over by re-running the Monitor's bootstrap.
+  Monitor *const monitor = mon;
+  monitor->bootstrap();
+}
+
+bool Elector::ever_participated() const
+{
+  // Reported straight from the Monitor's has_ever_joined flag.
+  const bool joined_before = mon->has_ever_joined;
+  return joined_before;
+}
+
+unsigned Elector::paxos_size() const
+{
+  // The Paxos set is every monitor in the monmap, in quorum or not.
+  const unsigned member_count = mon->monmap->size();
+  return member_count;
+}
+
+void Elector::shutdown()
+{
+  // Drop the pending election-expiry event, if any.
+  this->cancel_timer();
+}
+
+void Elector::notify_bump_epoch()
+{
+  // ElectionOwner hook: the election epoch went up, so have the Monitor
+  // join the election.
+  Monitor *const monitor = mon;
+  monitor->join_election();
+}
+
+void Elector::propose_to_peers(epoch_t e, bufferlist& logic_bl)
+{
+  // Send an OP_PROPOSE for epoch `e` to every rank in the monmap except
+  // our own, with `logic_bl` attached as the shared payload.
+  for (unsigned peer = 0; peer < mon->monmap->size(); ++peer) {
+    if (static_cast<int>(peer) != mon->rank) {
+      auto *proposal =
+        new MMonElection(MMonElection::OP_PROPOSE, e,
+                         peer_tracker.get_encoded_bl(),
+                         logic.strategy, mon->monmap);
+      proposal->sharing_bl = logic_bl;
+      proposal->mon_features = ceph::features::mon::get_supported();
+      proposal->mon_release = ceph_release();
+      mon->send_mon_message(proposal, peer);
+    }
+  }
+}
+
+void Elector::_start()
+{
+  // New election round: forget what peers reported last time, seed the
+  // table with our own entry, and re-arm the election timer.
+  peer_info.clear();
+
+  auto& self = peer_info[mon->rank];
+  self.cluster_features = CEPH_FEATURES_ALL;
+  self.mon_release = ceph_release();
+  self.mon_features = ceph::features::mon::get_supported();
+  mon->collect_metadata(&self.metadata);
+
+  reset_timer();
+}
+
+void Elector::_defer_to(int who)
+{
+  // Ack rank `who` for the current epoch, sending along our features,
+  // release and metadata.
+  auto *ack = new MMonElection(MMonElection::OP_ACK, get_epoch(),
+                               peer_tracker.get_encoded_bl(),
+                               logic.strategy, mon->monmap);
+  ack->mon_features = ceph::features::mon::get_supported();
+  ack->mon_release = ceph_release();
+  mon->collect_metadata(&ack->metadata);
+  mon->send_mon_message(ack, who);
+
+  // re-arm the timer, giving the leader some extra time to declare victory
+  reset_timer(1.0);
+}
+
+
+void Elector::reset_timer(double plus)
+{
+  // Replace any pending expiry event with a new one scheduled
+  // mon_election_timeout + `plus` from now.
+  cancel_timer();
+
+  // If this event fires, an election (started by us or by some other
+  // participant) took too long. The callback hands control to
+  // ElectionLogic::end_election_period().
+  auto *on_expire = new C_MonContext{mon, [this](int) {
+      logic.end_election_period();
+    }};
+  expire_event = mon->timer.add_event_after(
+    g_conf()->mon_election_timeout + plus,
+    on_expire);
+}
+
+
+void Elector::cancel_timer()
+{
+  // Cancel the pending election-expiry event; a no-op when none is armed.
+  if (!expire_event) {
+    return;
+  }
+  mon->timer.cancel_event(expire_event);
+  expire_event = nullptr;
+}
+
+void Elector::assimilate_connection_reports(const bufferlist& tbl)
+{
+  // Build a ConnectionTracker from the peer's encoded report and pass it
+  // to our own tracker.
+  dout(10) << __func__ << dendl;
+  ConnectionTracker peer_report(tbl, mon->cct);
+  peer_tracker.receive_peer_report(peer_report);
+}
+
+void Elector::message_victory(const std::set<int>& quorum)
+{
+  // We won: intersect the features reported by every quorum member, find
+  // the lowest release among them, send OP_VICTORY to the other quorum
+  // members, and finally tell the Monitor.
+  uint64_t common_features = CEPH_FEATURES_ALL;
+  mon_feature_t common_mon_features = ceph::features::mon::get_supported();
+  map<int,Metadata> quorum_metadata;
+  ceph_release_t lowest_release{ceph_release_t::unknown};
+
+  for (const int rank : quorum) {
+    // every quorum member must have an entry in peer_info
+    const auto found = peer_info.find(rank);
+    ceph_assert(found != peer_info.end());
+    auto& info = found->second;
+
+    common_features &= info.cluster_features;
+    common_mon_features &= info.mon_features;
+    quorum_metadata[rank] = info.metadata;
+
+    const bool release_unset = (lowest_release == ceph_release_t::unknown);
+    if (release_unset || info.mon_release < lowest_release) {
+      lowest_release = info.mon_release;
+    }
+  }
+
+  cancel_timer();
+
+  // tell everyone!
+  for (const int peer : quorum) {
+    if (peer == mon->rank) {
+      continue;
+    }
+    auto *victory = new MMonElection(MMonElection::OP_VICTORY, get_epoch(),
+                                     peer_tracker.get_encoded_bl(),
+                                     logic.strategy, mon->monmap);
+    victory->quorum = quorum;
+    victory->quorum_features = common_features;
+    victory->mon_features = common_mon_features;
+    victory->sharing_bl = mon->get_local_commands_bl(common_mon_features);
+    victory->mon_release = lowest_release;
+    mon->send_mon_message(victory, peer);
+  }
+
+  // tell monitor
+  mon->win_election(get_epoch(), quorum,
+                    common_features, common_mon_features, lowest_release,
+                    quorum_metadata);
+}
+
+
+void Elector::handle_propose(MonOpRequestRef op)
+{
+  // Handle an OP_PROPOSE from a peer. Peers that lack a required feature
+  // or run too old a release are sent a NAK and their proposal is dropped.
+  // Otherwise the proposal (and any ConnectionTracker data attached to it)
+  // is passed to the ElectionLogic.
+  op->mark_event("elector:handle_propose");
+  auto m = op->get_req<MMonElection>();
+  dout(5) << "handle_propose from " << m->get_source() << dendl;
+  int from = m->get_source().num();
+
+  ceph_assert(m->epoch % 2 == 1); // election
+  uint64_t required_features = mon->get_required_features();
+  mon_feature_t required_mon_features = mon->get_required_mon_features();
+
+  dout(10) << __func__ << " required features " << required_features
+           << " " << required_mon_features
+           << ", peer features " << m->get_connection()->get_features()
+           << " " << m->mon_features
+           << dendl;
+
+  if ((required_features ^ m->get_connection()->get_features()) &
+      required_features) {
+    dout(5) << " ignoring propose from mon" << from
+            << " without required features" << dendl;
+    nak_old_peer(op);
+    return;
+  } else if (mon->monmap->min_mon_release > m->mon_release) {
+    dout(5) << " ignoring propose from mon" << from
+            << " release " << (int)m->mon_release
+            << " < min_mon_release " << (int)mon->monmap->min_mon_release
+            << dendl;
+    nak_old_peer(op);
+    return;
+  } else if (!m->mon_features.contains_all(required_mon_features)) {
+    // all the features in 'required_mon_features' not in 'm->mon_features'
+    mon_feature_t missing = required_mon_features.diff(m->mon_features);
+    dout(5) << " ignoring propose from mon." << from
+            << " without required mon_features " << missing
+            << dendl;
+    nak_old_peer(op);
+    // Actually ignore the proposal, like the two branches above; without
+    // this return the rejected proposal still reached the election logic.
+    return;
+  }
+
+  // The logic only borrows the tracker (it copies what it wants to keep),
+  // so keep ownership here and let it be released on scope exit.
+  std::unique_ptr<ConnectionTracker> oct;
+  if (m->sharing_bl.length()) {
+    oct = std::make_unique<ConnectionTracker>(m->sharing_bl, mon->cct);
+  }
+  logic.receive_propose(from, m->epoch, oct.get());
+}
+
+void Elector::handle_ack(MonOpRequestRef op)
+{
+  // Handle an OP_ACK from a peer: drop it if the peer lacks required
+  // features. Otherwise record what the peer told us (only while we are
+  // the candidate) and hand the ack to the ElectionLogic.
+  op->mark_event("elector:handle_ack");
+  auto m = op->get_req<MMonElection>();
+  dout(5) << "handle_ack from " << m->get_source() << dendl;
+  const int from = m->get_source().num();
+
+  ceph_assert(m->epoch == get_epoch());
+
+  // any required bit the peer's connection does not offer disqualifies it
+  const uint64_t required_features = mon->get_required_features();
+  const uint64_t peer_features = m->get_connection()->get_features();
+  if (required_features & ~peer_features) {
+    dout(5) << " ignoring ack from mon" << from
+            << " without required features" << dendl;
+    return;
+  }
+
+  const mon_feature_t required_mon_features = mon->get_required_mon_features();
+  if (!m->mon_features.contains_all(required_mon_features)) {
+    mon_feature_t missing = required_mon_features.diff(m->mon_features);
+    dout(5) << " ignoring ack from mon." << from
+            << " without required mon_features " << missing
+            << dendl;
+    return;
+  }
+
+  if (logic.electing_me) {
+    // thanks
+    auto& info = peer_info[from];
+    info.cluster_features = peer_features;
+    info.mon_features = m->mon_features;
+    info.mon_release = m->mon_release;
+    info.metadata = m->metadata;
+
+    // Everything up to dendl belongs to one log statement; the loop streams
+    // into the _dout the dout() macro opened.
+    dout(5) << " so far i have {";
+    bool first = true;
+    for (const int acker : logic.acked_me) {
+      auto entry = peer_info.find(acker);
+      ceph_assert(entry != peer_info.end());
+      if (!first)
+        *_dout << ",";
+      first = false;
+      *_dout << " mon." << entry->first << ":"
+             << " features " << entry->second.cluster_features
+             << " " << entry->second.mon_features;
+    }
+    *_dout << " }" << dendl;
+  }
+
+  logic.receive_ack(from, m->epoch);
+}
+
+void Elector::handle_victory(MonOpRequestRef op)
+{
+  // Handle an OP_VICTORY from a peer. If the ElectionLogic accepts the
+  // claim, we lose the election to the sender and store the command set
+  // the leader sent along.
+  op->mark_event("elector:handle_victory");
+  auto m = op->get_req<MMonElection>();
+  dout(5) << "handle_victory from " << m->get_source()
+          << " quorum_features " << m->quorum_features
+          << " " << m->mon_features
+          << dendl;
+  const int from = m->get_source().num();
+
+  if (!logic.receive_victory_claim(from, m->epoch)) {
+    // claim rejected; nothing more to do here
+    return;
+  }
+
+  mon->lose_election(get_epoch(), m->quorum, from,
+                     m->quorum_features, m->mon_features, m->mon_release);
+
+  // cancel my timer
+  cancel_timer();
+
+  // stash leader's commands
+  ceph_assert(m->sharing_bl.length());
+  vector<MonCommand> leader_cmds;
+  auto cursor = m->sharing_bl.cbegin();
+  MonCommand::decode_vector(leader_cmds, cursor);
+  mon->set_leader_commands(leader_cmds);
+}
+
+void Elector::nak_old_peer(MonOpRequestRef op)
+{
+  // Reply to a peer that does not meet our requirements with an OP_NAK
+  // carrying the features and release we require.
+  op->mark_event("elector:nak_old_peer");
+  auto m = op->get_req<MMonElection>();
+
+  const uint64_t peer_supported = m->get_connection()->get_features();
+  const uint64_t required_features = mon->get_required_features();
+  const mon_feature_t required_mon_features = mon->get_required_mon_features();
+  dout(10) << "sending nak to peer " << m->get_source()
+           << " supports " << peer_supported << " " << m->mon_features
+           << ", required " << required_features << " " << required_mon_features
+           << ", release " << (int)m->mon_release
+           << " vs required " << (int)mon->monmap->min_mon_release
+           << dendl;
+
+  auto *nak = new MMonElection(MMonElection::OP_NAK, m->epoch,
+                               peer_tracker.get_encoded_bl(),
+                               logic.strategy, mon->monmap);
+  nak->quorum_features = required_features;
+  nak->mon_features = required_mon_features;
+  nak->mon_release = mon->monmap->min_mon_release;
+  mon->features.encode(nak->sharing_bl);
+  m->get_connection()->send_message(nak);
+}
+
+void Elector::handle_nak(MonOpRequestRef op)
+{
+  // A peer rejected us as too old or missing features: log the reason and
+  // terminate the process.
+  op->mark_event("elector:handle_nak");
+  auto m = op->get_req<MMonElection>();
+  dout(1) << "handle_nak from " << m->get_source()
+          << " quorum_features " << m->quorum_features
+          << " " << m->mon_features
+          << " min_mon_release " << (int)m->mon_release
+          << dendl;
+
+  const bool release_too_old = m->mon_release > ceph_release();
+  if (!release_too_old) {
+    // work out which compat features the peer requires that we lack
+    CompatSet required_by_peer;
+    auto cursor = m->sharing_bl.cbegin();
+    required_by_peer.decode(cursor);
+    CompatSet diff = Monitor::get_supported_features().unsupported(required_by_peer);
+
+    mon_feature_t mon_supported = ceph::features::mon::get_supported();
+    // all features in 'm->mon_features' not in 'mon_supported'
+    mon_feature_t mon_diff = m->mon_features.diff(mon_supported);
+
+    derr << "Shutting down because I lack required monitor features: { "
+         << diff << " } " << mon_diff << dendl;
+  } else {
+    derr << "Shutting down because I am release " << (int)ceph_release()
+         << " < min_mon_release " << (int)m->mon_release << dendl;
+  }
+
+  // NOTE(review): this exits with status 0 although it is an error path --
+  // confirm that is intended.
+  exit(0);
+  // the end!
+}
+
+void Elector::begin_peer_ping(int peer)
+{
+  // Start live-pinging `peer` unless we already are, or the quorum does
+  // not support pinging.
+  dout(20) << __func__ << " against " << peer << dendl;
+  if (live_pinging.count(peer)) {
+    dout(20) << peer << " already in live_pinging ... return " << dendl;
+    return;
+  }
+
+  const bool pinging_supported =
+    mon->get_quorum_mon_features().contains_all(
+      ceph::features::mon::FEATURE_PINGING);
+  if (!pinging_supported) {
+    return;
+  }
+
+  peer_tracker.report_live_connection(peer, 0); // init this peer as existing
+  live_pinging.insert(peer);
+  dead_pinging.erase(peer);
+  peer_acked_ping[peer] = ceph_clock_now();
+
+  // only schedule the follow-up check if the ping actually went out
+  if (send_peer_ping(peer)) {
+    mon->timer.add_event_after(ping_timeout / PING_DIVISOR,
+                               new C_MonContext{mon, [this, peer](int) {
+                                   ping_check(peer);
+                                 }});
+  }
+}
+
+bool Elector::send_peer_ping(int peer, const utime_t *n)
+{
+  // Send a PING stamped with *n (or the current time when n is null) to
+  // `peer`. Returns false, and stops live-pinging that peer, when the rank
+  // is not below the monmap's rank count; returns true once the ping is sent.
+  dout(10) << __func__ << " to peer " << peer << dendl;
+
+  const auto rank_count = ssize(mon->monmap->ranks);
+  if (peer >= rank_count) {
+    // Monitor no longer exists in the monmap,
+    // therefore, we shouldn't ping this monitor
+    // since we cannot lookup the address!
+    dout(5) << "peer: " << peer << " >= ranks_size: "
+            << rank_count << " ... dropping to prevent "
+            << "https://tracker.ceph.com/issues/50089" << dendl;
+    live_pinging.erase(peer);
+    return false;
+  }
+
+  utime_t now = (n != NULL) ? *n : ceph_clock_now();
+  MMonPing *ping = new MMonPing(MMonPing::PING, now, peer_tracker.get_encoded_bl());
+  mon->messenger->send_to_mon(ping, mon->monmap->get_addrs(peer));
+  peer_sent_ping[peer] = now;
+  return true;
+}
+
+void Elector::ping_check(int peer)
+{
+  // Periodic check on a pinged peer: if its last ack is older than
+  // ping_timeout, report the connection dead and switch to dead-pinging.
+  // Otherwise send another ping if the last one was acked, and reschedule
+  // this check.
+  dout(20) << __func__ << " to peer " << peer << dendl;
+
+  const bool still_tracked =
+    live_pinging.count(peer) || dead_pinging.count(peer);
+  if (!still_tracked) {
+    dout(20) << __func__ << peer << " is no longer marked for pinging" << dendl;
+    return;
+  }
+
+  utime_t now = ceph_clock_now();
+  utime_t& last_ack = peer_acked_ping[peer];
+  utime_t& last_sent = peer_sent_ping[peer];
+
+  if (!last_ack.is_zero() && last_ack < now - ping_timeout) {
+    peer_tracker.report_dead_connection(peer, now - last_ack);
+    last_ack = now;
+    begin_dead_ping(peer);
+    return;
+  }
+
+  // the previous ping was answered: send the next one, and stop here if
+  // that fails
+  if (last_ack == last_sent && !send_peer_ping(peer, &now)) {
+    return;
+  }
+
+  mon->timer.add_event_after(ping_timeout / PING_DIVISOR,
+                             new C_MonContext{mon, [this, peer](int) {
+                                 ping_check(peer);
+                               }});
+}
+
+/**
+ * Move @p peer from live_pinging to dead_pinging and schedule dead_ping()
+ * for it after ping_timeout. A peer that is already in dead_pinging is
+ * left untouched.
+ */
+void Elector::begin_dead_ping(int peer)
+{
+  dout(20) << __func__ << " to peer " << peer << dendl;
+  const bool already_dead = dead_pinging.find(peer) != dead_pinging.end();
+  if (!already_dead) {
+    live_pinging.erase(peer);
+    dead_pinging.insert(peer);
+    mon->timer.add_event_after(ping_timeout,
+                               new C_MonContext{mon, [this, peer](int) {
+                                 dead_ping(peer);
+                               }});
+  }
+}
+
+/**
+ * Periodic tick for a peer in dead_pinging; reschedules itself every
+ * ping_timeout.
+ *
+ * Each run reports the time since the last recorded ack as dead time to
+ * the connection tracker and resets that ack stamp to now. The chain ends
+ * once the peer has been removed from dead_pinging.
+ */
+void Elector::dead_ping(int peer)
+{
+  dout(20) << __func__ << " to peer " << peer << dendl;
+  if (!dead_pinging.count(peer)) {
+    // separate the function name from the rank so the log line is readable
+    dout(20) << __func__ << " " << peer << " is no longer marked for dead pinging" << dendl;
+    return;
+  }
+  // a peer must never be in both sets at once
+  ceph_assert(!live_pinging.count(peer));
+
+  utime_t now = ceph_clock_now();
+  utime_t& acked_ping = peer_acked_ping[peer];
+
+  peer_tracker.report_dead_connection(peer, now - acked_ping);
+  acked_ping = now;
+  mon->timer.add_event_after(ping_timeout,
+                             new C_MonContext{mon, [this, peer](int) {
+                               dead_ping(peer);
+                             }});
+}
+
+/**
+ * Process an MMonPing from another monitor.
+ *
+ * The sender is looked up by address, (re)registered for live pinging and
+ * its tracker payload is assimilated. A PING is answered with a PING_REPLY
+ * echoing the stamp; a PING_REPLY updates our ack bookkeeping and may
+ * trigger an immediate follow-up ping.
+ */
+void Elector::handle_ping(MonOpRequestRef op)
+{
+  MMonPing *m = static_cast<MMonPing*>(op->get_req());
+  int prank = mon->monmap->get_rank(m->get_source_addr());
+  dout(20) << __func__ << " from: " << prank << dendl;
+  begin_peer_ping(prank);
+  assimilate_connection_reports(m->tracker_bl);
+  switch(m->op) {
+  case MMonPing::PING:
+    {
+      // echo the sender's stamp back together with our own tracker state
+      MMonPing *reply = new MMonPing(MMonPing::PING_REPLY, m->stamp, peer_tracker.get_encoded_bl());
+      m->get_connection()->send_message(reply);
+    }
+    break;
+
+  case MMonPing::PING_REPLY:
+    {
+      const utime_t& last_acked = peer_acked_ping[prank];
+      const utime_t& last_sent = peer_sent_ping[prank];
+
+      // a reply cannot carry a stamp newer than anything we sent
+      if (m->stamp > last_sent && !last_sent.is_zero()) {
+        derr << "dropping PING_REPLY stamp " << m->stamp
+             << " as it is newer than newest sent " << last_sent << dendl;
+        return;
+      }
+
+      if (m->stamp > last_acked) {
+        dout(20) << "m->stamp > previous_acked" << dendl;
+        peer_tracker.report_live_connection(prank, m->stamp - last_acked);
+        peer_acked_ping[prank] = m->stamp;
+      } else{
+        dout(20) << "m->stamp <= previous_acked .. we don't report_live_connection" << dendl;
+      }
+      utime_t now = ceph_clock_now();
+      dout(30) << "now: " << now << " m->stamp: " << m->stamp << " ping_timeout: "
+               << ping_timeout << " PING_DIVISOR: " << PING_DIVISOR << dendl;
+      // the reply took longer than one check interval: ping again right away
+      const bool reply_was_slow = now - m->stamp > ping_timeout / PING_DIVISOR;
+      if (reply_was_slow) {
+        if (!send_peer_ping(prank, &now)) return;
+      }
+    }
+    break;
+  }
+}
+
+// Entry point for messages handed to the Elector.
+// MSG_MON_ELECTION messages are validated (participation, source rank,
+// fsid, monmap membership, monmap epoch, election strategy) and then
+// routed by em->op to handle_propose/ack/victory/nak. MSG_MON_PING goes
+// to handle_ping(). Any other message type aborts.
+void Elector::dispatch(MonOpRequestRef op)
+{
+ op->mark_event("elector:dispatch");
+ ceph_assert(op->is_type_election_or_ping());
+
+ switch (op->get_req()->get_type()) {
+
+ case MSG_MON_ELECTION:
+ {
+ // election messages are dropped while we are not participating
+ if (!logic.participating) {
+ return;
+ }
+ if (op->get_req()->get_source().num() >= mon->monmap->size()) {
+ dout(5) << " ignoring bogus election message with bad mon rank "
+ << op->get_req()->get_source() << dendl;
+ return;
+ }
+
+ auto em = op->get_req<MMonElection>();
+ dout(20) << __func__ << " from: " << mon->monmap->get_rank(em->get_source_addr()) << dendl;
+ // assume an old message encoding would have matched
+ if (em->fsid != mon->monmap->fsid) {
+ dout(0) << " ignoring election msg fsid "
+ << em->fsid << " != " << mon->monmap->fsid << dendl;
+ return;
+ }
+
+ if (!mon->monmap->contains(em->get_source_addr())) {
+ dout(1) << "discarding election message: " << em->get_source_addr()
+ << " not in my monmap " << *mon->monmap << dendl;
+ return;
+ }
+
+ MonMap peermap;
+ peermap.decode(em->monmap_bl);
+ // The peer has a newer monmap: adopt it, write it to the store and
+ // bootstrap. The election message itself is not processed further.
+ if (peermap.epoch > mon->monmap->epoch) {
+ dout(0) << em->get_source_inst() << " has newer monmap epoch " << peermap.epoch
+ << " > my epoch " << mon->monmap->epoch
+ << ", taking it"
+ << dendl;
+ mon->monmap->decode(em->monmap_bl);
+ auto t(std::make_shared<MonitorDBStore::Transaction>());
+ t->put("monmap", mon->monmap->epoch, em->monmap_bl);
+ t->put("monmap", "last_committed", mon->monmap->epoch);
+ mon->store->apply_transaction(t);
+ //mon->monmon()->paxos->stash_latest(mon->monmap->epoch, em->monmap_bl);
+ cancel_timer();
+ mon->notify_new_monmap(false);
+ mon->bootstrap();
+ return;
+ }
+ // an older peer monmap is only logged; processing continues
+ if (peermap.epoch < mon->monmap->epoch) {
+ dout(0) << em->get_source_inst() << " has older monmap epoch " << peermap.epoch
+ << " < my epoch " << mon->monmap->epoch
+ << dendl;
+ }
+
+ if (em->strategy != logic.strategy) {
+ dout(5) << __func__ << " somehow got an Election message with different strategy "
+ << em->strategy << " from local " << logic.strategy
+ << "; dropping for now to let race resolve" << dendl;
+ return;
+ }
+
+ if (em->scoring_bl.length()) {
+ assimilate_connection_reports(em->scoring_bl);
+ }
+
+ begin_peer_ping(mon->monmap->get_rank(em->get_source_addr()));
+ // proposals are handed over before the stale-epoch check below
+ switch (em->op) {
+ case MMonElection::OP_PROPOSE:
+ handle_propose(op);
+ return;
+ }
+
+ if (em->epoch < get_epoch()) {
+ dout(5) << "old epoch, dropping" << dendl;
+ break;
+ }
+
+ // remaining ops are only handled for the current or a newer epoch
+ switch (em->op) {
+ case MMonElection::OP_ACK:
+ handle_ack(op);
+ return;
+ case MMonElection::OP_VICTORY:
+ handle_victory(op);
+ return;
+ case MMonElection::OP_NAK:
+ handle_nak(op);
+ return;
+ default:
+ ceph_abort();
+ }
+ }
+ break;
+
+ case MSG_MON_PING:
+ handle_ping(op);
+ break;
+
+ default:
+ ceph_abort();
+ }
+}
+
+/// Mark this monitor as taking part in elections; only the flag is set.
+void Elector::start_participating()
+{
+  auto& taking_part = logic.participating;
+  taking_part = true;
+}
+
+/// Ask the connection tracker whether it is clean for our rank and the
+/// current paxos_size().
+bool Elector::peer_tracker_is_clean()
+{
+  const bool clean = peer_tracker.is_clean(mon->rank, paxos_size());
+  return clean;
+}
+
+/// Reset the connection tracker and re-apply our current rank to it,
+/// logging the tracker state before and after.
+void Elector::notify_clear_peer_state()
+{
+  dout(10) << __func__ << dendl;
+
+  dout(20) << " peer_tracker before: " << peer_tracker << dendl;
+  {
+    peer_tracker.notify_reset();
+    peer_tracker.set_rank(mon->rank);
+  }
+  dout(20) << " peer_tracker after: " << peer_tracker << dendl;
+}
+
+/// Our own rank became @p new_rank: tell the connection tracker and make
+/// sure that rank is in neither pinging set.
+void Elector::notify_rank_changed(int new_rank)
+{
+  dout(10) << __func__ << " to " << new_rank << dendl;
+  peer_tracker.notify_rank_changed(new_rank);
+  for (auto* pinging : {&live_pinging, &dead_pinging}) {
+    pinging->erase(new_rank);
+  }
+}
+
+// A monitor rank was removed from the map: forward the change to the
+// connection tracker and shift the live/dead pinging state of every rank
+// above the removed one down by one.
+void Elector::notify_rank_removed(unsigned rank_removed, unsigned new_rank)
+{
+ dout(10) << __func__ << ": " << rank_removed << dendl;
+ peer_tracker.notify_rank_removed(rank_removed, new_rank);
+ /* we have to clean up the pinging state, which is annoying
+ because it's not indexed anywhere (and adding indexing
+ would also be annoying).
+ In the case where we are removing any rank that is not the
+ highest, we start with the removed rank and examine the state
+ of the surrounding ranks.
+ Everybody who remains with larger rank gets a new rank one lower
+ than before, and we have to figure out the remaining scheduled
+ ping contexts. So, starting one past the removed rank, we:
+ * check if the current rank is alive or dead
+ * examine our new rank (one less than before, initially the removed
+ rank)
+ * * erase it if it's in the wrong set
+ * * start pinging it if we're not already
+ * check if the next rank is in the same pinging set, and delete
+ * ourselves if not.
+ In the case where we are removing the highest rank,
+ we erase the removed rank from all sets.
+ */
+ if (std::cmp_less(rank_removed, paxos_size())) {
+ for (unsigned i = rank_removed + 1; i <= paxos_size() ; ++i) {
+ // old rank i becomes rank i-1
+ if (live_pinging.count(i)) {
+ dead_pinging.erase(i-1);
+ if (!live_pinging.count(i-1)) {
+ begin_peer_ping(i-1);
+ }
+ // drop i unless the next rank will move into it
+ if (!live_pinging.count(i+1)) {
+ live_pinging.erase(i);
+ }
+ }
+ else if (dead_pinging.count(i)) {
+ live_pinging.erase(i-1);
+ if (!dead_pinging.count(i-1)) {
+ begin_dead_ping(i-1);
+ }
+ // drop i unless the next rank will move into it
+ if (!dead_pinging.count(i+1)) {
+ dead_pinging.erase(i);
+ }
+ } else {
+ // we aren't pinging rank i at all
+ if (i-1 == (unsigned)rank_removed) {
+ // so we special case to make sure we
+ // actually nuke the removed rank
+ dead_pinging.erase(rank_removed);
+ live_pinging.erase(rank_removed);
+ }
+ }
+ }
+ } else {
+ // removed rank is not below paxos_size(): nothing to shift
+ if (live_pinging.count(rank_removed)) {
+ live_pinging.erase(rank_removed);
+ }
+ if (dead_pinging.count(rank_removed)) {
+ dead_pinging.erase(rank_removed);
+ }
+ }
+}
+
+/// Forward a (possibly unchanged) election strategy, given as its integer
+/// value, to the ElectionLogic.
+void Elector::notify_strategy_maybe_changed(int strategy)
+{
+  const auto requested =
+    static_cast<ElectionLogic::election_strategy>(strategy);
+  logic.set_election_strategy(requested);
+}
diff --git a/src/mon/Elector.h b/src/mon/Elector.h
new file mode 100644
index 000000000..be2f91c0f
--- /dev/null
+++ b/src/mon/Elector.h
@@ -0,0 +1,406 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef CEPH_MON_ELECTOR_H
+#define CEPH_MON_ELECTOR_H
+
+#include <map>
+
+#include "include/types.h"
+#include "include/Context.h"
+#include "mon/MonOpRequest.h"
+#include "mon/mon_types.h"
+#include "mon/ElectionLogic.h"
+#include "mon/ConnectionTracker.h"
+
+class Monitor;
+
+
+/**
+ * This class is responsible for handling messages and maintaining
+ * an ElectionLogic which holds the local state when electing
+ * a new Leader. We may win or we may lose. If we win, it means we became the
+ * Leader; if we lose, it means we are a Peon.
+ */
+class Elector : public ElectionOwner, RankProvider {
+ /**
+ * @defgroup Elector_h_class Elector
+ * @{
+ */
+ ElectionLogic logic;
+ // connectivity validation and scoring
+ ConnectionTracker peer_tracker;
+ std::map<int, utime_t> peer_acked_ping; // rank -> last ping stamp they acked
+ std::map<int, utime_t> peer_sent_ping; // rank -> last ping stamp we sent
+ std::set<int> live_pinging; // ranks which we are currently pinging
+ std::set<int> dead_pinging; // ranks which didn't answer (degrading scores)
+ double ping_timeout; // the timeout after which we consider a ping to be dead
+ int PING_DIVISOR = 2; // ping checks are scheduled every ping_timeout / PING_DIVISOR
+
+ /**
+ * @defgroup Elector_h_internal_types Internal Types
+ * @{
+ */
+ /**
+ * This struct will hold the features from a given peer.
+ * Features may both be the cluster's (in the form of a uint64_t), or
+ * mon-specific features. Instead of keeping maps to hold them both, or
+ * a pair, which would be weird, a struct to keep them seems appropriate.
+ */
+ struct elector_info_t {
+ uint64_t cluster_features = 0;
+ mon_feature_t mon_features;
+ ceph_release_t mon_release{0};
+ std::map<std::string,std::string> metadata;
+ };
+
+ /**
+ * @}
+ */
+
+ /**
+ * The Monitor instance associated with this class.
+ */
+ Monitor *mon;
+
+ /**
+ * Event callback responsible for dealing with an expired election once a
+ * timer runs out and fires up.
+ */
+ Context *expire_event = nullptr;
+
+ /**
+ * Resets the expire_event timer, by cancelling any existing one and
+ * scheduling a new one.
+ *
+ * @remarks This function assumes as a default firing value the duration of
+ * the monitor's lease interval, and adds to it the value specified
+ * in @e plus
+ *
+ * @post expire_event is set
+ *
+ * @param plus The amount of time to be added to the default firing value.
+ */
+ void reset_timer(double plus=0.0);
+ /**
+ * Cancel the expire_event timer, if it is defined.
+ *
+ * @post expire_event is not set
+ */
+ void cancel_timer();
+
+ // electing me
+ /**
+ * @defgroup Elector_h_electing_me_vars We are being elected
+ * @{
+ */
+ /**
+ * Map containing info of all those that acked our proposal to become the Leader.
+ * Note each peer's info.
+ */
+ std::map<int, elector_info_t> peer_info;
+ /**
+ * @}
+ */
+
+ /**
+ * Handle a message from some other node proposing itself to become
+ * the Leader.
+ *
+ * We validate that the sending Monitor is allowed to participate based on
+ * its supported features, then pass the request to our ElectionLogic.
+ *
+ * @invariant The received message is an operation of type OP_PROPOSE
+ *
+ * @note dispatch() hands OP_PROPOSE over before its stale-epoch check,
+ * so the message epoch has not been validated at this point.
+ *
+ * @param op A message sent by another participant in the quorum.
+ */
+ void handle_propose(MonOpRequestRef op);
+ /**
+ * Handle a message from some other participant Acking us as the Leader.
+ *
+ * We validate that the sending Monitor is allowed to participate based on
+ * its supported features, add it to peer_info, and pass the ack to our
+ * ElectionLogic.
+ *
+ * @pre Message epoch is from the current or a newer epoch
+ *
+ * @param op A message with an operation type of OP_ACK
+ */
+ void handle_ack(MonOpRequestRef op);
+ /**
+ * Handle a message from some other participant declaring Victory.
+ *
+ * We just got a message from someone declaring themselves Victorious, thus
+ * the new Leader.
+ *
+ * We pass the Victory to our ElectionLogic, and if it confirms the
+ * victory we lose the election and start following this Leader. Otherwise,
+ * drop the message.
+ *
+ * @pre Message epoch is from the current or a newer epoch
+ * @post Election is not on-going
+ * @post Updated @p epoch
+ * @post We have a new quorum if we lost the election
+ *
+ * @param op A message with an operation type of OP_VICTORY
+ */
+ void handle_victory(MonOpRequestRef op);
+ /**
+ * Send a nak to a peer who's out of date, containing information about why.
+ *
+ * If we get a message from a peer who can't support the required quorum
+ * features, we have to ignore them. This function will at least send
+ * them a message about *why* they're being ignored -- if they're new
+ * enough to support such a message.
+ *
+ * @param op A message from a monitor not supporting required features. We
+ * take ownership of the reference.
+ */
+ void nak_old_peer(MonOpRequestRef op);
+ /**
+ * Handle a message from some other participant declaring
+ * we cannot join the quorum.
+ *
+ * Apparently the quorum requires some feature that we do not implement. Shut
+ * down gracefully.
+ *
+ * @pre Election is on-going.
+ * @post We've shut down.
+ *
+ * @param op A message with an operation type of OP_NAK
+ */
+ void handle_nak(MonOpRequestRef op);
+ /**
+ * Send a ping to the specified peer.
+ *
+ * @param peer rank of the monitor to ping
+ * @param n optional time that we will use instead of calling ceph_clock_now()
+ * @returns false if the peer's rank is not in the monmap ranks (the peer
+ * is then dropped from live_pinging), true if the ping was sent
+ */
+ bool send_peer_ping(int peer, const utime_t *n=NULL);
+ /**
+ * Check the state of pinging the specified peer. This is our
+ * "tick" for heartbeating; scheduled by itself and begin_peer_ping().
+ */
+ void ping_check(int peer);
+ /**
+ * Move the peer out of live_pinging into dead_pinging set
+ * and schedule dead_ping()ing on it.
+ */
+ void begin_dead_ping(int peer);
+ /**
+ * Checks that the peer is still marked for dead pinging,
+ * and then marks it as dead for the appropriate interval.
+ */
+ void dead_ping(int peer);
+ /**
+ * Handle a ping from another monitor and assimilate the data it contains.
+ */
+ void handle_ping(MonOpRequestRef op);
+ /**
+ * Update our view of everybody else's connectivity based on the provided
+ * tracker bufferlist
+ */
+ void assimilate_connection_reports(const bufferlist& bl);
+
+ public:
+ /**
+ * @defgroup Elector_h_ElectionOwner Functions from the ElectionOwner interface
+ * @{
+ */
+ /* Commit the given epoch to our MonStore.
+ * We also take the opportunity to persist our peer_tracker.
+ */
+ void persist_epoch(epoch_t e);
+ /* Read the epoch out of our MonStore */
+ epoch_t read_persisted_epoch() const;
+ /* Write a nonsense key "election_writeable_test" to our MonStore */
+ void validate_store();
+ /* Reset my tracking. Currently, just call Monitor::join_election() */
+ void notify_bump_epoch();
+ /* Call a new election: Invoke Monitor::start_election() */
+ void trigger_new_election();
+ /* Retrieve rank from the Monitor */
+ int get_my_rank() const;
+ /* Send MMonElection OP_PROPOSE to every monitor in the map. */
+ void propose_to_peers(epoch_t e, bufferlist &bl);
+ /* bootstrap() the Monitor */
+ void reset_election();
+ /* Retrieve the Monitor::has_ever_joined member */
+ bool ever_participated() const;
+ /* Retrieve monmap->size() */
+ unsigned paxos_size() const;
+ /* Right now we don't disallow anybody */
+ std::set<int> disallowed_leaders;
+ const std::set<int>& get_disallowed_leaders() const { return disallowed_leaders; }
+ /**
+ * Reset the expire_event timer so we can limit the amount of time we
+ * will be electing. Clean up our peer_info.
+ *
+ * @post we reset the expire_event timer
+ */
+ void _start();
+ /**
+ * Send an MMonElection message deferring to the identified monitor. We
+ * also increase the election timeout so the monitor we defer to
+ * has some time to gather deferrals and actually win. (FIXME: necessary to protocol?)
+ *
+ * @post we sent an ack message to @p who
+ * @post we reset the expire_event timer
+ *
+ * @param who Some other monitor's numeric identifier.
+ */
+ void _defer_to(int who);
+ /**
+ * Our ElectionLogic told us we won an election! Identify the quorum
+ * features, tell our new peons we've won, and invoke Monitor::win_election().
+ */
+ void message_victory(const std::set<int>& quorum);
+ /* Check if rank is in mon->quorum */
+ bool is_current_member(int rank) const;
+ /*
+ * @}
+ */
+ /**
+ * Persist our peer_tracker to disk.
+ */
+ void persist_connectivity_scores();
+
+ Elector *elector;
+
+ /**
+ * Create an Elector class
+ *
+ * @param m A Monitor instance
+ * @param strategy The election strategy to use, defined in MonMap/ElectionLogic
+ */
+ explicit Elector(Monitor *m, int strategy);
+ virtual ~Elector() {}
+
+ /**
+ * Inform this class it is supposed to shutdown.
+ *
+ * We will simply cancel the @p expire_event if any exists.
+ *
+ * @post @p expire_event is cancelled
+ */
+ void shutdown();
+
+ /**
+ * Obtain our epoch from ElectionLogic.
+ *
+ * @returns Our current epoch number
+ */
+ epoch_t get_epoch() { return logic.get_epoch(); }
+
+ /**
+ * If the Monitor knows there are no Paxos peers (so
+ * we are rank 0 and there are no others) we can declare victory.
+ */
+ void declare_standalone_victory() {
+ logic.declare_standalone_victory();
+ }
+ /**
+ * Tell the Elector to start pinging a given peer.
+ * Do this when you discover a peer and it has a rank assigned.
+ * We do it ourselves on receipt of pings and when receiving other messages.
+ */
+ void begin_peer_ping(int peer);
+ /**
+ * Handle received messages.
+ *
+ * Messages of type @p MSG_MON_ELECTION (i.e., @p MMonElection) are
+ * validated and dispatched to their operation-specific functions; they
+ * are dropped while we are not participating. Messages of type
+ * @p MSG_MON_PING are passed to handle_ping(). Any other message type
+ * aborts.
+ *
+ * @param op A received message
+ */
+ void dispatch(MonOpRequestRef op);
+
+ /**
+ * Call an election.
+ *
+ * This function simply calls ElectionLogic::start.
+ */
+ void call_election() {
+ logic.start();
+ }
+
+ /**
+ * Stop participating in subsequent Elections.
+ *
+ * @post @p participating is false
+ */
+ void stop_participating() { logic.participating = false; }
+ /**
+ * Start participating in Elections.
+ *
+ * If we are already participating (i.e., @p participating is true), then
+ * calling this function is moot.
+ *
+ * Otherwise @p participating is set to true. This function only sets
+ * the flag; it does not itself call for an Election.
+ *
+ * @post @p participating is true
+ */
+ void start_participating();
+ /**
+ * Check if our peer_tracker is self-consistent, not suffering from
+ * https://tracker.ceph.com/issues/58049
+ */
+ bool peer_tracker_is_clean();
+ /**
+ * Forget everything about our peers. :(
+ */
+ void notify_clear_peer_state();
+ /**
+ * Notify that our local rank has changed
+ * and we may need to update internal data structures.
+ */
+ void notify_rank_changed(int new_rank);
+ /**
+ * A peer has been removed so we should clean up state related to it.
+ * This is safe to call even if we haven't joined or are currently
+ * in a quorum.
+ */
+ void notify_rank_removed(unsigned rank_removed, unsigned new_rank);
+ /**
+ * Pass the given election strategy (as its integer value) on to our
+ * ElectionLogic.
+ */
+ void notify_strategy_maybe_changed(int strategy);
+ /**
+ * Set the disallowed leaders.
+ *
+ * If you call this and the new disallowed set
+ * contains your current leader, you are
+ * responsible for calling an election!
+ *
+ * @returns false if the set is unchanged,
+ * true if the set changed
+ */
+ bool set_disallowed_leaders(const std::set<int>& dl) {
+ if (dl == disallowed_leaders) return false;
+ disallowed_leaders = dl;
+ return true;
+ }
+ /**
+ * Dump our peer_tracker into @p f inside a "connection scores" section.
+ */
+ void dump_connection_scores(Formatter *f) {
+ f->open_object_section("connection scores");
+ peer_tracker.dump(f);
+ f->close_section();
+ }
+ /**
+ * @}
+ */
+};
+
+#endif
diff --git a/src/mon/FSCommands.cc b/src/mon/FSCommands.cc
new file mode 100644
index 000000000..2449df5d1
--- /dev/null
+++ b/src/mon/FSCommands.cc
@@ -0,0 +1,1972 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 Red Hat Ltd
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#include "OSDMonitor.h"
+
+#include "FSCommands.h"
+#include "MDSMonitor.h"
+#include "MgrStatMonitor.h"
+#include "mds/cephfs_features.h"
+
+using TOPNSPC::common::cmd_getval;
+
+using std::list;
+using std::make_pair;
+using std::pair;
+using std::set;
+using std::string;
+using std::string_view;
+using std::vector;
+using std::ostream;
+
+
+// File-local shorthand for the CephFS pool application name; passed to
+// do_application_enable() when a file system's pools are tagged.
+static const auto& APP_NAME_CEPHFS = pg_pool_t::APPLICATION_NAME_CEPHFS;
+
+/**
+ * Handler for "fs flag set": changes flags on the FSMap itself.
+ * Only the "enable_multiple" flag is recognised; anything else yields
+ * -EINVAL.
+ */
+class FlagSetHandler : public FileSystemCommandHandler
+{
+public:
+  FlagSetHandler()
+    : FileSystemCommandHandler("fs flag set")
+  {
+  }
+
+  int handle(
+      Monitor *mon,
+      FSMap& fsmap,
+      MonOpRequestRef op,
+      const cmdmap_t& cmdmap,
+      ostream &ss) override
+  {
+    string requested_flag;
+    cmd_getval(cmdmap, "flag_name", requested_flag);
+
+    string raw_value;
+    cmd_getval(cmdmap, "val", raw_value);
+
+    // read from the command map, but no flag handled here consults it
+    bool sure = false;
+    cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
+
+    if (requested_flag != "enable_multiple") {
+      ss << "Unknown flag '" << requested_flag << "'";
+      return -EINVAL;
+    }
+
+    bool enable = false;
+    if (const int r = parse_bool(raw_value, &enable, ss); r != 0) {
+      ss << "Invalid boolean value '" << raw_value << "'";
+      return r;
+    }
+
+    fsmap.set_enable_multiple(enable);
+    return 0;
+  }
+};
+
+/**
+ * Handler for "fs fail": marks a file system not joinable and fails every
+ * MDS listed in its MDS map.
+ *
+ * Returns -EAGAIN (after queueing a retry) while the OSD monitor is not
+ * writeable, -EINVAL for a missing name, -ENOENT for an unknown file
+ * system, and -EPERM when the MDS monitor reports trim/cache health
+ * warnings and the caller did not pass yes_i_really_mean_it.
+ */
+class FailHandler : public FileSystemCommandHandler
+{
+public:
+  FailHandler()
+    : FileSystemCommandHandler("fs fail")
+  {
+  }
+
+  int handle(
+      Monitor* mon,
+      FSMap& fsmap,
+      MonOpRequestRef op,
+      const cmdmap_t& cmdmap,
+      ostream& ss) override
+  {
+    if (!mon->osdmon()->is_writeable()) {
+      // not allowed to write yet, so retry when we can
+      mon->osdmon()->wait_for_writeable(op, new PaxosService::C_RetryMessage(mon->mdsmon(), op));
+      return -EAGAIN;
+    }
+
+    string fs_name;
+    const bool have_name = cmd_getval(cmdmap, "fs_name", fs_name);
+    if (!have_name || fs_name.empty()) {
+      ss << "Missing filesystem name";
+      return -EINVAL;
+    }
+
+    auto* fsp = fsmap.get_filesystem(fs_name);
+    if (!fsp) {
+      ss << "Not found: '" << fs_name << "'";
+      return -ENOENT;
+    }
+
+    bool confirm = false;
+    cmd_getval(cmdmap, "yes_i_really_mean_it", confirm);
+    if (!confirm) {
+      // health is only consulted when the caller did not insist
+      const bool unhealthy = mon->mdsmon()->has_health_warnings({
+        MDS_HEALTH_TRIM, MDS_HEALTH_CACHE_OVERSIZED});
+      if (unhealthy) {
+        ss << errmsg_for_unhealthy_mds;
+        return -EPERM;
+      }
+    }
+
+    fsmap.modify_filesystem(fsp->get_fscid(), [](auto&& fs) {
+      fs.get_mds_map().set_flag(CEPH_MDSMAP_NOT_JOINABLE);
+    });
+
+    // collect the gids first; failing them is done in a separate pass
+    vector<mds_gid_t> doomed;
+    for (const auto& entry : fsp->get_mds_map().get_mds_info()) {
+      doomed.push_back(entry.first);
+    }
+    for (const auto& gid : doomed) {
+      mon->mdsmon()->fail_mds_gid(fsmap, gid);
+    }
+    if (!doomed.empty()) {
+      mon->osdmon()->propose_pending();
+    }
+
+    ss << fs_name
+       << " marked not joinable; MDS cannot join the cluster. All MDS ranks marked failed.";
+
+    return 0;
+  }
+};
+
+/**
+ * Handler for "fs new": creates a file system on an existing metadata and
+ * data pool.
+ *
+ * The command is validated (pools exist, data pool id is non-zero, name is
+ * not empty and not taken, metadata pool is empty unless forced, multiple
+ * file systems are enabled, pools are not already used by another file
+ * system unless overlay is allowed, an explicit fscid requires --force and
+ * must be free, both pools pass _check_pool). The pools are then tagged
+ * and tuned through the OSD monitor, the file system is created in the
+ * FSMap and, unless "recover" was requested, a replacement MDS is
+ * promoted to rank 0 if one is found.
+ *
+ * Returns 0 on success (also for an identical, already existing file
+ * system), -EAGAIN when the OSD monitor is not writeable yet (a retry is
+ * queued), otherwise a negative errno with the reason written to @p ss.
+ */
+class FsNewHandler : public FileSystemCommandHandler
+{
+public:
+  explicit FsNewHandler(Paxos *paxos)
+    : FileSystemCommandHandler("fs new"), m_paxos(paxos)
+  {
+  }
+
+  int handle(
+      Monitor *mon,
+      FSMap& fsmap,
+      MonOpRequestRef op,
+      const cmdmap_t& cmdmap,
+      ostream &ss) override
+  {
+    ceph_assert(m_paxos->is_plugged());
+
+    string metadata_name;
+    cmd_getval(cmdmap, "metadata", metadata_name);
+    int64_t metadata = mon->osdmon()->osdmap.lookup_pg_pool_name(metadata_name);
+    if (metadata < 0) {
+      ss << "pool '" << metadata_name << "' does not exist";
+      return -ENOENT;
+    }
+
+    string data_name;
+    cmd_getval(cmdmap, "data", data_name);
+    int64_t data = mon->osdmon()->osdmap.lookup_pg_pool_name(data_name);
+    if (data < 0) {
+      ss << "pool '" << data_name << "' does not exist";
+      return -ENOENT;
+    }
+    if (data == 0) {
+      ss << "pool '" << data_name << "' has id 0, which CephFS does not allow. Use another pool or recreate it to get a non-zero pool id.";
+      return -EINVAL;
+    }
+
+    string fs_name;
+    cmd_getval(cmdmap, "fs_name", fs_name);
+    if (fs_name.empty()) {
+      // Ensure fs name is not empty so that we can implement
+      // commands that refer to FS by name in future.
+      ss << "Filesystem name may not be empty";
+      return -EINVAL;
+    }
+
+    if (auto* fsp = fsmap.get_filesystem(fs_name); fsp) {
+      if (*(fsp->get_mds_map().get_data_pools().begin()) == data
+          && fsp->get_mds_map().get_metadata_pool() == metadata) {
+        // Identical FS created already, this is a no-op
+        ss << "filesystem '" << fs_name << "' already exists";
+        return 0;
+      } else {
+        ss << "filesystem already exists with name '" << fs_name << "'";
+        return -EINVAL;
+      }
+    }
+
+    bool force = false;
+    cmd_getval(cmdmap, "force", force);
+
+    // pool statistics may be unavailable; the emptiness check is then skipped
+    const pool_stat_t *stat = mon->mgrstatmon()->get_pool_stat(metadata);
+    if (stat) {
+      int64_t metadata_num_objects = stat->stats.sum.num_objects;
+      if (!force && metadata_num_objects > 0) {
+        ss << "pool '" << metadata_name
+           << "' already contains some objects. Use an empty pool instead.";
+        return -EINVAL;
+      }
+    }
+
+    if (fsmap.filesystem_count() > 0
+        && !fsmap.get_enable_multiple()) {
+      ss << "Creation of multiple filesystems is disabled.  To enable "
+            "this experimental feature, use 'ceph fs flag set enable_multiple "
+            "true'";
+      return -EINVAL;
+    }
+
+    bool allow_overlay = false;
+    cmd_getval(cmdmap, "allow_dangerous_metadata_overlay", allow_overlay);
+
+    for (const auto& [fscid, fs] : std::as_const(fsmap)) {
+      const auto& existing_map = fs.get_mds_map();
+      const vector<int64_t> &data_pools = existing_map.get_data_pools();
+      if ((std::find(data_pools.begin(), data_pools.end(), data) != data_pools.end()
+           || existing_map.get_metadata_pool() == metadata)
+          && !allow_overlay) {
+        // name the existing file system that owns the pool, not the one
+        // we were asked to create
+        ss << "Filesystem '" << existing_map.get_fs_name()
+           << "' is already using one of the specified RADOS pools. This should ONLY be done in emergencies and after careful reading of the documentation. Pass --allow-dangerous-metadata-overlay to permit this.";
+        return -EINVAL;
+      }
+    }
+
+    int64_t fscid = FS_CLUSTER_ID_NONE;
+    if (cmd_getval(cmdmap, "fscid", fscid)) {
+      if (!force) {
+        ss << "Pass --force to create a file system with a specific ID";
+        return -EINVAL;
+      }
+      if (fsmap.filesystem_exists(fscid)) {
+        ss << "filesystem already exists with id '" << fscid << "'";
+        return -EINVAL;
+      }
+    }
+
+    pg_pool_t const *data_pool = mon->osdmon()->osdmap.get_pg_pool(data);
+    ceph_assert(data_pool != nullptr);  // Checked it existed above
+    pg_pool_t const *metadata_pool = mon->osdmon()->osdmap.get_pg_pool(metadata);
+    ceph_assert(metadata_pool != nullptr);  // Checked it existed above
+
+    int r = _check_pool(mon->osdmon()->osdmap, data, POOL_DATA_DEFAULT, force, &ss, allow_overlay);
+    if (r < 0) {
+      return r;
+    }
+
+    r = _check_pool(mon->osdmon()->osdmap, metadata, POOL_METADATA, force, &ss, allow_overlay);
+    if (r < 0) {
+      return r;
+    }
+
+    if (!mon->osdmon()->is_writeable()) {
+      // not allowed to write yet, so retry when we can
+      mon->osdmon()->wait_for_writeable(op, new PaxosService::C_RetryMessage(mon->mdsmon(), op));
+      return -EAGAIN;
+    }
+    // tag both pools for CephFS and tune the metadata pool
+    mon->osdmon()->do_application_enable(data, APP_NAME_CEPHFS, "data",
+                                         fs_name, true);
+    mon->osdmon()->do_application_enable(metadata, APP_NAME_CEPHFS,
+                                         "metadata", fs_name, true);
+    mon->osdmon()->do_set_pool_opt(metadata,
+                                   pool_opts_t::RECOVERY_PRIORITY,
+                                   static_cast<int64_t>(5));
+    mon->osdmon()->do_set_pool_opt(metadata,
+                                   pool_opts_t::PG_NUM_MIN,
+                                   static_cast<int64_t>(16));
+    mon->osdmon()->do_set_pool_opt(metadata,
+                                   pool_opts_t::PG_AUTOSCALE_BIAS,
+                                   static_cast<double>(4.0));
+    mon->osdmon()->propose_pending();
+
+    bool recover = false;
+    cmd_getval(cmdmap, "recover", recover);
+
+    // All checks passed, go ahead and create.
+    auto&& fs = fsmap.create_filesystem(fs_name, metadata, data,
+                                        mon->get_quorum_con_features(), fscid, recover);
+
+    ss << "new fs with metadata pool " << metadata << " and data pool " << data;
+
+    if (recover) {
+      return 0;
+    }
+
+    // assign a standby to rank 0 to avoid health warnings
+    auto info = fsmap.find_replacement_for({fs.get_fscid(), 0});
+
+    if (info) {
+      mon->clog->info() << info->human_name() << " assigned to filesystem "
+                        << fs_name << " as rank 0";
+      fsmap.promote(info->global_id, fs.get_fscid(), 0);
+    }
+
+    return 0;
+  }
+
+private:
+  Paxos *m_paxos;
+};
+
+class SetHandler : public FileSystemCommandHandler
+{
+public:
+ SetHandler()
+ : FileSystemCommandHandler("fs set")
+ {}
+
+ int handle(
+ Monitor *mon,
+ FSMap& fsmap,
+ MonOpRequestRef op,
+ const cmdmap_t& cmdmap,
+ ostream &ss) override
+ {
+ string fs_name;
+ if (!cmd_getval(cmdmap, "fs_name", fs_name) || fs_name.empty()) {
+ ss << "Missing filesystem name";
+ return -EINVAL;
+ }
+
+ auto* fsp = fsmap.get_filesystem(fs_name);
+ string var;
+ if (!cmd_getval(cmdmap, "var", var) || var.empty()) {
+ ss << "Invalid variable";
+ return -EINVAL;
+ }
+ string val;
+ string interr;
+ int64_t n = 0;
+ if (!cmd_getval(cmdmap, "val", val)) {
+ return -EINVAL;
+ }
+ // we got a string. see if it contains an int.
+ n = strict_strtoll(val.c_str(), 10, &interr);
+ if (var == "max_mds") {
+ // NOTE: see also "mds set_max_mds", which can modify the same field.
+ if (interr.length()) {
+ ss << interr;
+ return -EINVAL;
+ }
+
+ if (n <= 0) {
+ ss << "You must specify at least one MDS";
+ return -EINVAL;
+ }
+
+ if (n > 1 && n > fsp->get_mds_map().get_max_mds()) {
+ if (fsp->get_mds_map().was_snaps_ever_allowed() &&
+ !fsp->get_mds_map().allows_multimds_snaps()) {
+ ss << "multi-active MDS is not allowed while there are snapshots possibly created by pre-mimic MDS";
+ return -EINVAL;
+ }
+ }
+ if (n > MAX_MDS) {
+ ss << "may not have more than " << MAX_MDS << " MDS ranks";
+ return -EINVAL;
+ }
+
+ fsmap.modify_filesystem(
+ fsp->get_fscid(),
+ [n](auto&& fs)
+ {
+ fs.get_mds_map().clear_flag(CEPH_MDSMAP_NOT_JOINABLE);
+ fs.get_mds_map().set_max_mds(n);
+ });
+ } else if (var == "inline_data") {
+ bool enable_inline = false;
+ int r = parse_bool(val, &enable_inline, ss);
+ if (r != 0) {
+ return r;
+ }
+
+ if (enable_inline) {
+ bool confirm = false;
+ cmd_getval(cmdmap, "yes_i_really_really_mean_it", confirm);
+ if (!confirm) {
+ ss << "Inline data support is deprecated and will be removed in a future release. "
+ << "Add --yes-i-really-really-mean-it if you are certain you want this enabled.";
+ return -EPERM;
+ }
+ ss << "inline data enabled";
+
+ fsmap.modify_filesystem(
+ fsp->get_fscid(),
+ [](auto&& fs)
+ {
+ fs.get_mds_map().set_inline_data_enabled(true);
+ });
+ } else {
+ ss << "inline data disabled";
+ fsmap.modify_filesystem(
+ fsp->get_fscid(),
+ [](auto&& fs)
+ {
+ fs.get_mds_map().set_inline_data_enabled(false);
+ });
+ }
+ } else if (var == "balancer") {
+ if (val.empty()) {
+ ss << "unsetting the metadata load balancer";
+ } else {
+ ss << "setting the metadata load balancer to " << val;
+ }
+ fsmap.modify_filesystem(
+ fsp->get_fscid(),
+ [val](auto&& fs)
+ {
+ fs.get_mds_map().set_balancer(val);
+ });
+ return true;
+ } else if (var == "bal_rank_mask") {
+ if (val.empty()) {
+ ss << "bal_rank_mask may not be empty";
+ return -EINVAL;
+ }
+
+ if (fsp->get_mds_map().check_special_bal_rank_mask(val, MDSMap::BAL_RANK_MASK_TYPE_ANY) == false) {
+ string bin_string;
+ int r = fsp->get_mds_map().hex2bin(val, bin_string, MAX_MDS, ss);
+ if (r != 0) {
+ return r;
+ }
+ }
+ ss << "setting the metadata balancer rank mask to " << val;
+
+ fsmap.modify_filesystem(
+ fsp->get_fscid(),
+ [val](auto&& fs)
+ {
+ fs.get_mds_map().set_bal_rank_mask(val);
+ });
+ return true;
+ } else if (var == "max_file_size") {
+ if (interr.length()) {
+ ss << var << " requires an integer value";
+ return -EINVAL;
+ }
+ if (n < CEPH_MIN_STRIPE_UNIT) {
+ ss << var << " must at least " << CEPH_MIN_STRIPE_UNIT;
+ return -ERANGE;
+ }
+ fsmap.modify_filesystem(
+ fsp->get_fscid(),
+ [n](auto&& fs)
+ {
+ fs.get_mds_map().set_max_filesize(n);
+ });
+ } else if (var == "max_xattr_size") {
+ if (interr.length()) {
+ ss << var << " requires an integer value";
+ return -EINVAL;
+ }
+ fsmap.modify_filesystem(
+ fsp->get_fscid(),
+ [n](auto&& fs)
+ {
+ fs.get_mds_map().set_max_xattr_size(n);
+ });
+ } else if (var == "allow_new_snaps") {
+ bool enable_snaps = false;
+ int r = parse_bool(val, &enable_snaps, ss);
+ if (r != 0) {
+ return r;
+ }
+
+ if (!enable_snaps) {
+ fsmap.modify_filesystem(
+ fsp->get_fscid(),
+ [](auto&& fs)
+ {
+ fs.get_mds_map().clear_snaps_allowed();
+ });
+ ss << "disabled new snapshots";
+ } else {
+ fsmap.modify_filesystem(
+ fsp->get_fscid(),
+ [](auto&& fs)
+ {
+ fs.get_mds_map().set_snaps_allowed();
+ });
+ ss << "enabled new snapshots";
+ }
+ } else if (var == "allow_multimds") {
+ ss << "Multiple MDS is always enabled. Use the max_mds"
+ << " parameter to control the number of active MDSs"
+ << " allowed. This command is DEPRECATED and will be"
+ << " REMOVED from future releases.";
+ } else if (var == "allow_multimds_snaps") {
+ bool enable = false;
+ int r = parse_bool(val, &enable, ss);
+ if (r != 0) {
+ return r;
+ }
+
+ string confirm;
+ if (!cmd_getval(cmdmap, "confirm", confirm) ||
+ confirm != "--yes-i-am-really-a-mds") {
+ ss << "Warning! This command is for MDS only. Do not run it manually";
+ return -EPERM;
+ }
+
+ if (enable) {
+ ss << "enabled multimds with snapshot";
+ fsmap.modify_filesystem(
+ fsp->get_fscid(),
+ [](auto&& fs)
+ {
+ fs.get_mds_map().set_multimds_snaps_allowed();
+ });
+ } else {
+ ss << "disabled multimds with snapshot";
+ fsmap.modify_filesystem(
+ fsp->get_fscid(),
+ [](auto&& fs)
+ {
+ fs.get_mds_map().clear_multimds_snaps_allowed();
+ });
+ }
+ } else if (var == "allow_dirfrags") {
+ ss << "Directory fragmentation is now permanently enabled."
+ << " This command is DEPRECATED and will be REMOVED from future releases.";
+ } else if (var == "down") {
+ bool is_down = false;
+ int r = parse_bool(val, &is_down, ss);
+ if (r != 0) {
+ return r;
+ }
+
+ ss << fsp->get_mds_map().get_fs_name();
+
+ fsmap.modify_filesystem(
+ fsp->get_fscid(),
+ [is_down](auto&& fs)
+ {
+ if (is_down) {
+ if (fs.get_mds_map().get_max_mds() > 0) {
+ fs.get_mds_map().set_old_max_mds();
+ fs.get_mds_map().set_max_mds(0);
+ } /* else already down! */
+ } else {
+ mds_rank_t oldmax = fs.get_mds_map().get_old_max_mds();
+ fs.get_mds_map().set_max_mds(oldmax ? oldmax : 1);
+ }
+ });
+
+ if (is_down) {
+ ss << " marked down. ";
+ } else {
+ ss << " marked up, max_mds = " << fsp->get_mds_map().get_max_mds();
+ }
+ } else if (var == "cluster_down" || var == "joinable") {
+ bool joinable = true;
+ int r = parse_bool(val, &joinable, ss);
+ if (r != 0) {
+ return r;
+ }
+ if (var == "cluster_down") {
+ joinable = !joinable;
+ }
+
+ ss << fsp->get_mds_map().get_fs_name();
+
+ fsmap.modify_filesystem(
+ fsp->get_fscid(),
+ [joinable](auto&& fs)
+ {
+ if (joinable) {
+ fs.get_mds_map().clear_flag(CEPH_MDSMAP_NOT_JOINABLE);
+ } else {
+ fs.get_mds_map().set_flag(CEPH_MDSMAP_NOT_JOINABLE);
+ }
+ });
+
+ if (joinable) {
+ ss << " marked joinable; MDS may join as newly active.";
+ } else {
+ ss << " marked not joinable; MDS cannot join as newly active.";
+ }
+
+ if (var == "cluster_down") {
+ ss << " WARNING: cluster_down flag is deprecated and will be"
+ << " removed in a future version. Please use \"joinable\".";
+ }
+ } else if (var == "standby_count_wanted") {
+ if (interr.length()) {
+ ss << var << " requires an integer value";
+ return -EINVAL;
+ }
+ if (n < 0) {
+ ss << var << " must be non-negative";
+ return -ERANGE;
+ }
+ fsmap.modify_filesystem(
+ fsp->get_fscid(),
+ [n](auto&& fs)
+ {
+ fs.get_mds_map().set_standby_count_wanted(n);
+ });
+ } else if (var == "session_timeout") {
+ if (interr.length()) {
+ ss << var << " requires an integer value";
+ return -EINVAL;
+ }
+ if (n < 30) {
+ ss << var << " must be at least 30s";
+ return -ERANGE;
+ }
+ fsmap.modify_filesystem(
+ fsp->get_fscid(),
+ [n](auto&& fs)
+ {
+ fs.get_mds_map().set_session_timeout((uint32_t)n);
+ });
+ } else if (var == "session_autoclose") {
+ if (interr.length()) {
+ ss << var << " requires an integer value";
+ return -EINVAL;
+ }
+ if (n < 30) {
+ ss << var << " must be at least 30s";
+ return -ERANGE;
+ }
+ fsmap.modify_filesystem(
+ fsp->get_fscid(),
+ [n](auto&& fs)
+ {
+ fs.get_mds_map().set_session_autoclose((uint32_t)n);
+ });
+ } else if (var == "allow_standby_replay") {
+ bool allow = false;
+ int r = parse_bool(val, &allow, ss);
+ if (r != 0) {
+ return r;
+ }
+
+ if (!allow) {
+ if (!mon->osdmon()->is_writeable()) {
+ // not allowed to write yet, so retry when we can
+ mon->osdmon()->wait_for_writeable(op, new PaxosService::C_RetryMessage(mon->mdsmon(), op));
+ return -EAGAIN;
+ }
+ vector<mds_gid_t> to_fail;
+ for (const auto& [gid, info]: fsp->get_mds_map().get_mds_info()) {
+ if (info.state == MDSMap::STATE_STANDBY_REPLAY) {
+ to_fail.push_back(gid);
+ }
+ }
+
+ for (const auto& gid : to_fail) {
+ mon->mdsmon()->fail_mds_gid(fsmap, gid);
+ }
+ if (!to_fail.empty()) {
+ mon->osdmon()->propose_pending();
+ }
+ }
+
+ auto f = [allow](auto&& fs) {
+ if (allow) {
+ fs.get_mds_map().set_standby_replay_allowed();
+ } else {
+ fs.get_mds_map().clear_standby_replay_allowed();
+ }
+ };
+ fsmap.modify_filesystem(fsp->get_fscid(), std::move(f));
+ } else if (var == "balance_automate") {
+ bool allow = false;
+ int r = parse_bool(val, &allow, ss);
+ if (r != 0) {
+ return r;
+ }
+
+ auto f = [allow](auto&& fs) {
+ if (allow) {
+ fs.get_mds_map().set_balance_automate();
+ } else {
+ fs.get_mds_map().clear_balance_automate();
+ }
+ };
+ fsmap.modify_filesystem(fsp->get_fscid(), std::move(f));
+ } else if (var == "min_compat_client") {
+ auto vno = ceph_release_from_name(val.c_str());
+ if (!vno) {
+ ss << "version " << val << " is not recognized";
+ return -EINVAL;
+ }
+ ss << "WARNING: setting min_compat_client is deprecated"
+ " and may not do what you want.\n"
+ "The oldest release to set is octopus.\n"
+ "Please migrate to `ceph fs required_client_features ...`.";
+ auto f = [vno](auto&& fs) {
+ fs.get_mds_map().set_min_compat_client(vno);
+ };
+ fsmap.modify_filesystem(fsp->get_fscid(), std::move(f));
+ } else if (var == "refuse_client_session") {
+ bool refuse_session = false;
+ int r = parse_bool(val, &refuse_session, ss);
+ if (r != 0) {
+ return r;
+ }
+
+ if (refuse_session) {
+ if (!(fsp->get_mds_map().test_flag(CEPH_MDSMAP_REFUSE_CLIENT_SESSION))) {
+ fsmap.modify_filesystem(
+ fsp->get_fscid(),
+ [](auto&& fs)
+ {
+ fs.get_mds_map().set_flag(CEPH_MDSMAP_REFUSE_CLIENT_SESSION);
+ });
+ ss << "client(s) blocked from establishing new session(s)";
+ } else {
+ ss << "client(s) already blocked from establishing new session(s)";
+ }
+ } else {
+ if (fsp->get_mds_map().test_flag(CEPH_MDSMAP_REFUSE_CLIENT_SESSION)) {
+ fsmap.modify_filesystem(
+ fsp->get_fscid(),
+ [](auto&& fs)
+ {
+ fs.get_mds_map().clear_flag(CEPH_MDSMAP_REFUSE_CLIENT_SESSION);
+ });
+ ss << "client(s) allowed to establish new session(s)";
+ } else {
+ ss << "client(s) already allowed to establish new session(s)";
+ }
+ }
+ } else if (var == "refuse_standby_for_another_fs") {
+ bool refuse_standby_for_another_fs = false;
+ int r = parse_bool(val, &refuse_standby_for_another_fs, ss);
+ if (r != 0) {
+ return r;
+ }
+
+ if (refuse_standby_for_another_fs) {
+ if (!(fsp->get_mds_map().test_flag(CEPH_MDSMAP_REFUSE_STANDBY_FOR_ANOTHER_FS))) {
+ fsmap.modify_filesystem(
+ fsp->get_fscid(),
+ [](auto&& fs)
+ {
+ fs.get_mds_map().set_flag(CEPH_MDSMAP_REFUSE_STANDBY_FOR_ANOTHER_FS);
+ });
+ ss << "set to refuse standby for another fs";
+ } else {
+ ss << "to refuse standby for another fs is already set";
+ }
+ } else {
+ if (fsp->get_mds_map().test_flag(CEPH_MDSMAP_REFUSE_STANDBY_FOR_ANOTHER_FS)) {
+ fsmap.modify_filesystem(
+ fsp->get_fscid(),
+ [](auto&& fs)
+ {
+ fs.get_mds_map().clear_flag(CEPH_MDSMAP_REFUSE_STANDBY_FOR_ANOTHER_FS);
+ });
+ ss << "allowed to use standby for another fs";
+ } else {
+ ss << "to use standby for another fs is already allowed";
+ }
+ }
+ } else {
+ ss << "unknown variable " << var;
+ return -EINVAL;
+ }
+
+ return 0;
+ }
+};
+
+ /**
+  * Handler for "fs compat": adds or removes a single compat or incompat
+  * feature in the CompatSet held by a named file system's MDSMap.
+  */
+ class CompatSetHandler : public FileSystemCommandHandler
+ {
+ public:
+   CompatSetHandler()
+     : FileSystemCommandHandler("fs compat")
+   {
+   }
+
+   /**
+    * Validate the request, apply it to a copy of the file system's
+    * CompatSet and write that copy back via fsmap.modify_filesystem().
+    *
+    * @return 0 on success (including requests that change nothing),
+    *         -EINVAL for a missing/unknown argument, -ENOENT when the file
+    *         system is not found, -EBUSY while any MDS is up, -EEXIST when
+    *         the feature id already exists under a different name.
+    */
+   int handle(
+     Monitor *mon,
+     FSMap &fsmap,
+     MonOpRequestRef op,
+     const cmdmap_t& cmdmap,
+     ostream &ss) override
+   {
+     static const set<string> known_subops = {"rm_incompat", "rm_compat", "add_incompat", "add_compat"};
+
+     string fs_name;
+     const bool have_fs_name = cmd_getval(cmdmap, "fs_name", fs_name);
+     if (!have_fs_name || fs_name.empty()) {
+       ss << "Missing filesystem name";
+       return -EINVAL;
+     }
+
+     auto* fsp = fsmap.get_filesystem(fs_name);
+     if (!fsp) {
+       ss << "Not found: '" << fs_name << "'";
+       return -ENOENT;
+     }
+
+     string subop;
+     const bool have_subop = cmd_getval(cmdmap, "subop", subop);
+     if (!have_subop || known_subops.find(subop) == known_subops.end()) {
+       ss << "subop `" << subop << "' not recognized. Must be one of: " << known_subops;
+       return -EINVAL;
+     }
+
+     int64_t feature;
+     const bool have_feature = cmd_getval(cmdmap, "feature", feature);
+     if (!have_feature || feature < 1) {
+       ss << "Invalid feature";
+       return -EINVAL;
+     }
+
+     // The compat set may only be edited while no MDS is up.
+     if (fsp->get_mds_map().get_num_up_mds() > 0) {
+       ss << "file system must be failed or down; use `ceph fs fail` to bring down";
+       return -EBUSY;
+     }
+
+     // Work on a private copy; it is only stored back at the very end.
+     CompatSet updated = fsp->get_mds_map().compat;
+
+     // Remove `feature` from the given feature set, reporting the outcome.
+     auto drop_feature = [&ss, feature](auto& fset,
+                                        const char* removed_msg,
+                                        const char* absent_msg) {
+       if (!fset.contains(feature)) {
+         ss << absent_msg << feature;
+         return;
+       }
+       ss << removed_msg << feature;
+       fset.remove(feature);
+     };
+
+     // Insert `f` into the given feature set. Returns 0 when the set ends
+     // up holding the requested feature, -EEXIST on a name clash.
+     auto add_feature = [&ss, feature](auto& fset,
+                                       const auto& f,
+                                       const string& wanted_name,
+                                       const char* added_msg) -> int {
+       if (!fset.contains(feature)) {
+         fset.insert(f);
+         ss << added_msg << f;
+         return 0;
+       }
+       auto existing_name = fset.get_name(feature);
+       if (existing_name == wanted_name) {
+         ss << "feature already exists";
+         return 0;
+       }
+       ss << "feature with differing name `" << existing_name << "' exists";
+       return -EEXIST;
+     };
+
+     if (subop == "rm_compat") {
+       drop_feature(updated.compat,
+                    "removed compat feature ",
+                    "already removed compat feature ");
+     } else if (subop == "rm_incompat") {
+       drop_feature(updated.incompat,
+                    "removed incompat feature ",
+                    "already removed incompat feature ");
+     } else if (subop == "add_compat" || subop == "add_incompat") {
+       string feature_str;
+       const bool have_str = cmd_getval(cmdmap, "feature_str", feature_str);
+       if (!have_str || feature_str.empty()) {
+         ss << "adding a feature requires a feature string";
+         return -EINVAL;
+       }
+       auto f = CompatSet::Feature(feature, feature_str);
+       int r;
+       if (subop == "add_compat") {
+         r = add_feature(updated.compat, f, feature_str, "added compat feature ");
+       } else {
+         r = add_feature(updated.incompat, f, feature_str, "added incompat feature ");
+       }
+       if (r != 0) {
+         return r;
+       }
+     } else ceph_assert(0);
+
+     fsmap.modify_filesystem(
+       fsp->get_fscid(),
+       [cs = std::move(updated)](auto&& fs) {
+         fs.get_mds_map().compat = cs;
+       });
+     return 0;
+   }
+ };
+
+ /**
+  * Handler for "fs required_client_features": adds a feature to, or removes
+  * a feature from, the required client features of a file system's MDSMap.
+  */
+ class RequiredClientFeaturesHandler : public FileSystemCommandHandler
+ {
+ public:
+   RequiredClientFeaturesHandler()
+     : FileSystemCommandHandler("fs required_client_features")
+   {
+   }
+
+   /**
+    * The "val" argument is first resolved with cephfs_feature_from_name();
+    * if that yields a negative value it is parsed as a decimal feature id.
+    *
+    * @return 0 on success (also when the feature was already in the
+    *         requested state), -EINVAL for bad arguments, -ENOENT when the
+    *         file system is not found.
+    */
+   int handle(
+     Monitor *mon,
+     FSMap &fsmap,
+     MonOpRequestRef op,
+     const cmdmap_t& cmdmap,
+     ostream &ss) override
+   {
+     string fs_name;
+     const bool have_fs_name = cmd_getval(cmdmap, "fs_name", fs_name);
+     if (!have_fs_name || fs_name.empty()) {
+       ss << "Missing filesystem name";
+       return -EINVAL;
+     }
+
+     auto* fsp = fsmap.get_filesystem(fs_name);
+     if (!fsp) {
+       ss << "Not found: '" << fs_name << "'";
+       return -ENOENT;
+     }
+
+     string subop;
+     const bool have_subop = cmd_getval(cmdmap, "subop", subop);
+     if (!have_subop || !(subop == "add" || subop == "rm")) {
+       ss << "Must either add or rm a feature; " << subop << " is not recognized";
+       return -EINVAL;
+     }
+
+     string val;
+     const bool have_val = cmd_getval(cmdmap, "val", val);
+     if (!have_val || val.empty()) {
+       ss << "Missing feature id/name";
+       return -EINVAL;
+     }
+
+     int feature = cephfs_feature_from_name(val);
+     if (feature < 0) {
+       // Name lookup gave a negative result: fall back to a numeric id.
+       string parse_err;
+       feature = strict_strtol(val.c_str(), 10, &parse_err);
+       if (!parse_err.empty()) {
+         ss << "Invalid feature name: " << val;
+         return -EINVAL;
+       }
+       if (feature < 0 || feature > CEPHFS_FEATURE_MAX) {
+         ss << "Invalid feature id: " << feature;
+         return -EINVAL;
+       }
+     }
+
+     const bool adding = (subop == "add");
+     bool changed = false;
+     fsmap.modify_filesystem(
+       fsp->get_fscid(),
+       [feature, adding, &changed](auto&& fs) {
+         const bool present =
+           fs.get_mds_map().get_required_client_features().test(feature);
+         if (present == adding) {
+           return;  // already in the requested state
+         }
+         if (adding) {
+           fs.get_mds_map().add_required_client_feature(feature);
+         } else {
+           fs.get_mds_map().remove_required_client_feature(feature);
+         }
+         changed = true;
+       });
+
+     if (adding) {
+       if (changed) {
+         ss << "added feature '" << cephfs_feature_name(feature) << "' to required_client_features";
+       } else {
+         ss << "feature '" << cephfs_feature_name(feature) << "' is already set";
+       }
+     } else {
+       if (changed) {
+         ss << "removed feature '" << cephfs_feature_name(feature) << "' from required_client_features";
+       } else {
+         ss << "feature '" << cephfs_feature_name(feature) << "' is already unset";
+       }
+     }
+     return 0;
+   }
+ };
+
+
+ /**
+  * Handler for "fs add_data_pool": attaches an additional data pool to a
+  * file system.
+  */
+ class AddDataPoolHandler : public FileSystemCommandHandler
+ {
+ public:
+   explicit AddDataPoolHandler(Paxos *paxos)
+     : FileSystemCommandHandler("fs add_data_pool"), m_paxos(paxos)
+   {}
+
+   /**
+    * The "pool" argument is looked up as a pool name first and, failing
+    * that, parsed as a decimal pool id.
+    *
+    * @return 0 on success or when the pool is already a data pool of the
+    *         file system, -EINVAL when the file system name is missing,
+    *         -ENOENT for an unknown pool or file system, -EAGAIN when the
+    *         OSD monitor is not writeable yet (the op is queued for retry),
+    *         or whatever non-zero value _check_pool() reports.
+    */
+   int handle(
+     Monitor *mon,
+     FSMap& fsmap,
+     MonOpRequestRef op,
+     const cmdmap_t& cmdmap,
+     ostream &ss) override
+   {
+     ceph_assert(m_paxos->is_plugged());
+
+     string pool_arg;
+     cmd_getval(cmdmap, "pool", pool_arg);
+
+     string fs_name;
+     const bool have_fs_name = cmd_getval(cmdmap, "fs_name", fs_name);
+     if (!have_fs_name || fs_name.empty()) {
+       ss << "Missing filesystem name";
+       return -EINVAL;
+     }
+
+     int64_t pool_id = mon->osdmon()->osdmap.lookup_pg_pool_name(pool_arg);
+     if (pool_id < 0) {
+       // Not resolved by name; accept a numeric pool id instead.
+       string parse_err;
+       pool_id = strict_strtol(pool_arg.c_str(), 10, &parse_err);
+       if (!parse_err.empty()) {
+         ss << "pool '" << pool_arg << "' does not exist";
+         return -ENOENT;
+       }
+     }
+
+     if (const int r = _check_pool(mon->osdmon()->osdmap, pool_id, POOL_DATA_EXTRA, false, &ss);
+         r != 0) {
+       return r;
+     }
+
+     auto* fsp = fsmap.get_filesystem(fs_name);
+     if (!fsp) {
+       ss << "filesystem '" << fs_name << "' does not exist";
+       return -ENOENT;
+     }
+
+     // Adding a pool that is already attached is a no-op.
+     if (fsp->get_mds_map().is_data_pool(pool_id)) {
+       ss << "data pool " << pool_id << " is already on fs " << fs_name;
+       return 0;
+     }
+
+     if (!mon->osdmon()->is_writeable()) {
+       // The OSD monitor cannot take writes yet: retry this op once it can.
+       mon->osdmon()->wait_for_writeable(op, new PaxosService::C_RetryMessage(mon->mdsmon(), op));
+       return -EAGAIN;
+     }
+
+     mon->osdmon()->do_application_enable(pool_id, APP_NAME_CEPHFS, "data",
+                                          fs_name, true);
+     mon->osdmon()->propose_pending();
+
+     fsmap.modify_filesystem(
+       fsp->get_fscid(),
+       [pool_id](auto&& fs) {
+         fs.get_mds_map().add_data_pool(pool_id);
+       });
+
+     ss << "added data pool " << pool_id << " to fsmap";
+     return 0;
+   }
+
+ private:
+   Paxos *m_paxos;
+ };
+
+ /**
+  * Handler for "fs set-default": records the named file system's FSCID as
+  * the FSMap's legacy client fscid.
+  */
+ class SetDefaultHandler : public FileSystemCommandHandler
+ {
+ public:
+   SetDefaultHandler()
+     : FileSystemCommandHandler("fs set-default")
+   {}
+
+   /**
+    * @return 0 on success, -ENOENT when no file system has the given name.
+    */
+   int handle(
+     Monitor *mon,
+     FSMap& fsmap,
+     MonOpRequestRef op,
+     const cmdmap_t& cmdmap,
+     ostream &ss) override
+   {
+     string fs_name;
+     cmd_getval(cmdmap, "fs_name", fs_name);
+
+     if (auto* fsp = fsmap.get_filesystem(fs_name); fsp != nullptr) {
+       fsmap.set_legacy_client_fscid(fsp->get_fscid());
+       return 0;
+     }
+
+     ss << "filesystem '" << fs_name << "' does not exist";
+     return -ENOENT;
+   }
+ };
+
+ /**
+  * Handler for "fs rm": erases a file system from the FSMap.
+  */
+ class RemoveFilesystemHandler : public FileSystemCommandHandler
+ {
+ public:
+   RemoveFilesystemHandler()
+     : FileSystemCommandHandler("fs rm")
+   {}
+
+   /**
+    * @return 0 on success or when the file system does not exist (removal
+    *         is idempotent), -EAGAIN when the OSD monitor is not writeable
+    *         yet (the op is queued for retry), -EINVAL while any MDS is up,
+    *         -EPERM without the yes_i_really_mean_it confirmation.
+    */
+   int handle(
+     Monitor *mon,
+     FSMap& fsmap,
+     MonOpRequestRef op,
+     const cmdmap_t& cmdmap,
+     ostream &ss) override
+   {
+     /* Ranks may have to be blocklisted, which needs a writeable osdmon. */
+     if (!mon->osdmon()->is_writeable()) {
+       // Queue the op so it is retried once writes are possible.
+       mon->osdmon()->wait_for_writeable(op, new PaxosService::C_RetryMessage(mon->mdsmon(), op));
+       return -EAGAIN;
+     }
+
+     // The caller has to name the file system explicitly, even when only
+     // one exists, so the syntax also works with several file systems.
+     string fs_name;
+     cmd_getval(cmdmap, "fs_name", fs_name);
+     auto* fsp = fsmap.get_filesystem(fs_name);
+     if (!fsp) {
+       // A missing file system counts as success: deletes are idempotent.
+       ss << "filesystem '" << fs_name << "' does not exist";
+       return 0;
+     }
+
+     // Refuse while any MDS daemon is still up.
+     if (fsp->get_mds_map().get_num_up_mds() > 0) {
+       ss << "all MDS daemons must be inactive/failed before removing filesystem. See `ceph fs fail`.";
+       return -EINVAL;
+     }
+
+     // Require explicit confirmation.
+     bool confirmed = false;
+     cmd_getval(cmdmap, "yes_i_really_mean_it", confirmed);
+     if (!confirmed) {
+       ss << "this is a DESTRUCTIVE operation and will make data in your filesystem permanently"
+             " inaccessible. Add --yes-i-really-mean-it if you are sure you wish to continue.";
+       return -EPERM;
+     }
+
+     if (fsmap.get_legacy_client_fscid() == fsp->get_fscid()) {
+       fsmap.set_legacy_client_fscid(FS_CLUSTER_ID_NONE);
+     }
+
+     // Any daemon still listed must be a standby_replay one. Collect the
+     // gids first and fail them in a second pass.
+     vector<mds_gid_t> standby_replay_gids;
+     for (const auto& [gid, info] : fsp->get_mds_map().get_mds_info()) {
+       ceph_assert(info.state == MDSMap::STATE_STANDBY_REPLAY);
+       standby_replay_gids.push_back(gid);
+     }
+
+     for (const auto& gid : standby_replay_gids) {
+       // Standby replays don't write, so there is no need to wait for an
+       // osdmap proposal here; the return value is deliberately ignored.
+       mon->mdsmon()->fail_mds_gid(fsmap, gid);
+     }
+     if (!standby_replay_gids.empty()) {
+       mon->osdmon()->propose_pending(); /* maybe new blocklists */
+     }
+
+     fsmap.erase_filesystem(fsp->get_fscid());
+
+     return 0;
+   }
+ };
+
+ /**
+  * Handler for "fs reset": calls FSMap::reset_filesystem() for the named
+  * file system.
+  */
+ class ResetFilesystemHandler : public FileSystemCommandHandler
+ {
+ public:
+   ResetFilesystemHandler()
+     : FileSystemCommandHandler("fs reset")
+   {}
+
+   /**
+    * @return 0 on success, -ENOENT when the file system does not exist,
+    *         -EINVAL while any MDS is up, -EPERM without the
+    *         yes_i_really_mean_it confirmation.
+    */
+   int handle(
+     Monitor *mon,
+     FSMap& fsmap,
+     MonOpRequestRef op,
+     const cmdmap_t& cmdmap,
+     ostream &ss) override
+   {
+     string fs_name;
+     cmd_getval(cmdmap, "fs_name", fs_name);
+
+     auto* fsp = fsmap.get_filesystem(fs_name);
+     if (!fsp) {
+       // In contrast to "fs rm", a missing file system is an error here.
+       ss << "filesystem '" << fs_name << "' does not exist";
+       return -ENOENT;
+     }
+
+     // Refuse while any MDS daemon is still up.
+     if (fsp->get_mds_map().get_num_up_mds() > 0) {
+       ss << "all MDS daemons must be inactive before resetting filesystem: set the cluster_down flag"
+             " and use `ceph mds fail` to make this so";
+       return -EINVAL;
+     }
+
+     // Require explicit confirmation.
+     bool confirmed = false;
+     cmd_getval(cmdmap, "yes_i_really_mean_it", confirmed);
+     if (!confirmed) {
+       ss << "this is a potentially destructive operation, only for use by experts in disaster recovery. "
+             "Add --yes-i-really-mean-it if you are sure you wish to continue.";
+       return -EPERM;
+     }
+
+     fsmap.reset_filesystem(fsp->get_fscid());
+     return 0;
+   }
+ };
+
+ /**
+  * Handler for "fs rename": gives a file system a new name and re-registers
+  * its data and metadata pools under that name via do_application_enable().
+  */
+ class RenameFilesystemHandler : public FileSystemCommandHandler
+ {
+ public:
+   explicit RenameFilesystemHandler(Paxos *paxos)
+     : FileSystemCommandHandler("fs rename"), m_paxos(paxos)
+   {
+   }
+
+   /**
+    * @return 0 on success, or when the old name is gone but the new one
+    *         exists (treated as an already completed rename); -ENOENT when
+    *         neither name exists; -EINVAL when the new name is taken;
+    *         -EPERM when mirroring is enabled, confirmation is missing, an
+    *         MDS is up or refuse_client_session is not set; -EAGAIN when
+    *         the OSD monitor is not writeable yet (op queued for retry).
+    */
+   int handle(
+     Monitor *mon,
+     FSMap& fsmap,
+     MonOpRequestRef op,
+     const cmdmap_t& cmdmap,
+     ostream &ss) override
+   {
+     ceph_assert(m_paxos->is_plugged());
+
+     string fs_name;
+     string new_fs_name;
+     cmd_getval(cmdmap, "fs_name", fs_name);
+     cmd_getval(cmdmap, "new_fs_name", new_fs_name);
+
+     auto* fsp = fsmap.get_filesystem(fs_name);
+     auto* new_fsp = fsmap.get_filesystem(new_fs_name);
+
+     if (!fsp && new_fsp) {
+       // Keeps 'fs rename' idempotent.
+       ss << "File system may already have been renamed. Desired file system '"
+          << new_fs_name << "' exists.";
+       return 0;
+     }
+     if (!fsp) {
+       ss << "File system '" << fs_name << "' does not exist";
+       return -ENOENT;
+     }
+     if (new_fsp) {
+       ss << "Desired file system name '" << new_fs_name << "' already in use";
+       return -EINVAL;
+     }
+
+     if (fsp->get_mirror_info().mirrored) {
+       ss << "Mirroring is enabled on file system '"<< fs_name << "'. Disable mirroring on the "
+             "file system after ensuring it's OK to do so, and then retry to rename.";
+       return -EPERM;
+     }
+
+     // Require explicit confirmation.
+     bool confirmed = false;
+     cmd_getval(cmdmap, "yes_i_really_mean_it", confirmed);
+     if (!confirmed) {
+       ss << "this is a potentially disruptive operation, clients' cephx credentials need reauthorized "
+             "to access the file system and its pools with the new name. "
+             "Add --yes-i-really-mean-it if you are sure you wish to continue.";
+       return -EPERM;
+     }
+
+     if (!mon->osdmon()->is_writeable()) {
+       // The OSD monitor cannot take writes yet: retry this op once it can.
+       mon->osdmon()->wait_for_writeable(op, new PaxosService::C_RetryMessage(mon->mdsmon(), op));
+       return -EAGAIN;
+     }
+
+     // No MDS daemon may be up for this CephFS.
+     if (fsp->get_mds_map().get_num_up_mds() > 0) {
+       ss << "CephFS '" << fs_name << "' is not offline. Before renaming "
+          << "a CephFS, it must be marked as down. See `ceph fs fail`.";
+       return -EPERM;
+     }
+
+     // The refuse_client_session flag has to be set.
+     if (!fsp->get_mds_map().test_flag(CEPH_MDSMAP_REFUSE_CLIENT_SESSION)) {
+       ss << "CephFS '" << fs_name << "' doesn't refuse clients. Before "
+          << "renaming a CephFS, flag 'refuse_client_session' must be set. "
+          << "See `ceph fs set`.";
+       return -EPERM;
+     }
+
+     // Re-register every data pool, then the metadata pool, under the new
+     // name, and propose the pending OSD monitor change.
+     for (const auto pool : fsp->get_mds_map().get_data_pools()) {
+       mon->osdmon()->do_application_enable(pool, APP_NAME_CEPHFS, "data",
+                                            new_fs_name, true);
+     }
+     mon->osdmon()->do_application_enable(
+       fsp->get_mds_map().get_metadata_pool(), APP_NAME_CEPHFS, "metadata",
+       new_fs_name, true);
+     mon->osdmon()->propose_pending();
+
+     fsmap.modify_filesystem(
+       fsp->get_fscid(),
+       [new_fs_name](auto&& fs) {
+         fs.get_mds_map().set_fs_name(new_fs_name);
+       });
+
+     ss << "File system is renamed. cephx credentials authorized to "
+           "old file system name need to be reauthorized to new file "
+           "system name.";
+
+     return 0;
+   }
+
+ private:
+   Paxos *m_paxos;
+ };
+
+ /**
+  * Handler for "fs swap": exchanges the names of two file systems and, when
+  * the "swap_fscids" argument equals "yes", their FSCIDs as well.
+  */
+ class SwapFilesystemHandler : public FileSystemCommandHandler
+ {
+ public:
+   explicit SwapFilesystemHandler(Paxos *paxos)
+     : FileSystemCommandHandler("fs swap"), m_paxos(paxos)
+   {
+   }
+
+   /**
+    * Checks run in this order: confirmation flag, existence of both file
+    * systems, FSCIDs matching the names, mirroring disabled, OSD monitor
+    * writeable, no MDS up, refuse_client_session set on both.
+    *
+    * @return 0 on success, or when the supplied FSCIDs are exactly the
+    *         other file system's (taken as a swap that already happened);
+    *         -EPERM when confirmation is missing, mirroring is enabled, an
+    *         MDS is up or refuse_client_session is not set; -ENOENT when a
+    *         file system is missing; -EINVAL for wrong FSCIDs; -EAGAIN when
+    *         the OSD monitor is not writeable yet (op queued for retry).
+    */
+   int handle(Monitor *mon, FSMap& fsmap, MonOpRequestRef op,
+              const cmdmap_t& cmdmap, std::ostream &ss) override
+   {
+     ceph_assert(m_paxos->is_plugged());
+
+     // Check for confirmation flag
+     bool confirmation_flag = false;
+     cmd_getval(cmdmap, "yes_i_really_mean_it", confirmation_flag);
+     if (!confirmation_flag) {
+       ss << "This is a potentially disruptive operation, client\'s cephx "
+             "credentials may need to be reauthorized to access the file systems "
+             "and its pools. Add --yes-i-really-mean-it if you are sure you wish "
+             "to continue.";
+       return -EPERM;
+     }
+
+     string fs1_name, fs2_name;
+     int64_t fs1_id = FS_CLUSTER_ID_NONE;
+     int64_t fs2_id = FS_CLUSTER_ID_NONE;
+     string swap_fscids_flag;
+     cmd_getval(cmdmap, "fs1_name", fs1_name);
+     cmd_getval(cmdmap, "fs2_name", fs2_name);
+     cmd_getval(cmdmap, "fs1_id", fs1_id);
+     cmd_getval(cmdmap, "fs2_id", fs2_id);
+     cmd_getval(cmdmap, "swap_fscids", swap_fscids_flag);
+     const bool swap_ids = (swap_fscids_flag == "yes");
+
+     auto fs1p = fsmap.get_filesystem(fs1_name);
+     auto fs2p = fsmap.get_filesystem(fs2_name);
+
+     // Check that CephFSs exists for both given names.
+     const bool fs1_missing = (fs1p == nullptr);
+     const bool fs2_missing = (fs2p == nullptr);
+     if (fs1_missing && !fs2_missing) {
+       ss << "File system '" << fs1_name << "' doesn\'t exist on this "
+             "Ceph cluster.";
+       return -ENOENT;
+     } else if (!fs1_missing && fs2_missing) {
+       ss << "File system '" << fs2_name << "' doesn\'t exist on this "
+             "Ceph cluster.";
+       return -ENOENT;
+     } else if (fs1_missing && fs2_missing) {
+       ss << "Neither file system '" << fs1_name << "' nor file "
+             "system '" << fs2_name << "' exists on this Ceph cluster.";
+       return -ENOENT;
+     }
+
+     // Check that FSCID provided for both CephFSs is correct.
+     const bool fs1_id_ok = (fs1_id == fs1p->get_fscid());
+     const bool fs2_id_ok = (fs2_id == fs2p->get_fscid());
+     if (!fs1_id_ok && fs2_id_ok) {
+       ss << "FSCID provided for '" << fs1_name << "' is incorrect.";
+       return -EINVAL;
+     } else if (fs1_id_ok && !fs2_id_ok) {
+       ss << "FSCID provided for '" << fs2_name << "' is incorrect.";
+       return -EINVAL;
+     } else if (!fs1_id_ok && !fs2_id_ok) {
+       // Both ids are wrong. If each one is the *other* file system's id,
+       // report success so that repeating the command is harmless.
+       if (fs1_id == fs2p->get_fscid() && fs2_id == fs1p->get_fscid()) {
+         ss << "FSCIDs provided in command arguments are swapped; perhaps "
+            << "`ceph fs swap` has been run before.";
+         return 0;
+       }
+       ss << "FSCIDs provided for both the CephFSs is incorrect.";
+       return -EINVAL;
+     }
+
+     // Check that CephFS mirroring for both CephFSs is disabled.
+     const bool fs1_mirrored = fs1p->get_mirror_info().mirrored;
+     const bool fs2_mirrored = fs2p->get_mirror_info().mirrored;
+     if (fs1_mirrored && !fs2_mirrored) {
+       ss << "Mirroring is enabled on file system '"<< fs1_name << "'. "
+          << "Disable mirroring on the file system after ensuring it's OK "
+          << "to do so, and then re-try swapping.";
+       return -EPERM;
+     } else if (!fs1_mirrored && fs2_mirrored) {
+       ss << "Mirroring is enabled on file system '"<< fs2_name << "'. "
+          << "Disable mirroring on the file system after ensuring it's OK "
+          << "to do so, and then re-try swapping.";
+       return -EPERM;
+     } else if (fs1_mirrored && fs2_mirrored) {
+       ss << "Mirroring is enabled on file systems '" << fs1_name << "' "
+          << "and '" << fs2_name << "'. Disable mirroring on both the "
+          << "file systems after ensuring it's OK to do so, and then re-try "
+          << "swapping.";
+       return -EPERM;
+     }
+
+     if (!mon->osdmon()->is_writeable()) {
+       // not allowed to write yet, so retry when we can
+       mon->osdmon()->wait_for_writeable(
+         op, new PaxosService::C_RetryMessage(mon->mdsmon(), op));
+       return -EAGAIN;
+     }
+
+     // Check that both CephFS have been marked as down, IOW has no MDS
+     // associated with it.
+     const bool fs1_up = fs1p->get_mds_map().get_num_up_mds() > 0;
+     const bool fs2_up = fs2p->get_mds_map().get_num_up_mds() > 0;
+     if (fs1_up && !fs2_up) {
+       ss << "CephFS '" << fs1_name << "' is not offline. Before swapping "
+          << "CephFS names, both CephFSs should be marked as failed. See "
+          << "`ceph fs fail`.";
+       return -EPERM;
+     } else if (!fs1_up && fs2_up) {
+       ss << "CephFS '" << fs2_name << "' is not offline. Before swapping "
+          << "CephFS names, both CephFSs should be marked as failed. See "
+          << "`ceph fs fail`.";
+       return -EPERM;
+     } else if (fs1_up && fs2_up) {
+       ss << "CephFSs '" << fs1_name << "' and '" << fs2_name << "' "
+          << "are not offline. Before swapping CephFS names, both CephFSs "
+          << "should be marked as failed. See `ceph fs fail`.";
+       return -EPERM;
+     }
+
+     // Check that refuse_client_session is set.
+     const bool fs1_refuses =
+       fs1p->get_mds_map().test_flag(CEPH_MDSMAP_REFUSE_CLIENT_SESSION);
+     const bool fs2_refuses =
+       fs2p->get_mds_map().test_flag(CEPH_MDSMAP_REFUSE_CLIENT_SESSION);
+     if (!fs1_refuses && fs2_refuses) {
+       ss << "CephFS '" << fs1_name << "' doesn't refuse clients. Before "
+          << "swapping CephFS names, flag 'refuse_client_session' must be "
+          << "set. See `ceph fs set`.";
+       return -EPERM;
+     } else if (fs1_refuses && !fs2_refuses) {
+       ss << "CephFS '" << fs2_name << "' doesn't refuse clients. Before "
+          << "swapping CephFS names, flag 'refuse_client_session' must be "
+          << "set. See `ceph fs set`.";
+       return -EPERM;
+     } else if (!fs1_refuses && !fs2_refuses) {
+       ss << "CephFSs '" << fs1_name << "' and '" << fs2_name << "' do not "
+          << "refuse clients. Before swapping CephFS names, flag "
+          << "'refuse_client_session' must be set. See `ceph fs set`.";
+       return -EPERM;
+     }
+
+     // Finally, the swap begins.
+     // Swap CephFS names on OSD pool application tag
+     for (const auto p : fs1p->get_mds_map().get_data_pools()) {
+       mon->osdmon()->do_application_enable(p, APP_NAME_CEPHFS, "data",
+                                            fs2_name, true);
+     }
+     mon->osdmon()->do_application_enable(
+       fs1p->get_mds_map().get_metadata_pool(), APP_NAME_CEPHFS, "metadata",
+       fs2_name, true);
+
+     for (const auto p : fs2p->get_mds_map().get_data_pools()) {
+       mon->osdmon()->do_application_enable(p, APP_NAME_CEPHFS, "data",
+                                            fs1_name, true);
+     }
+     mon->osdmon()->do_application_enable(
+       fs2p->get_mds_map().get_metadata_pool(), APP_NAME_CEPHFS, "metadata",
+       fs1_name, true);
+     mon->osdmon()->propose_pending();
+
+     // Now swap CephFS names and, optionally, FSCIDs.
+     auto renamefunc1 = [fs2_name](auto&& fs) {
+       fs.get_mds_map().set_fs_name(fs2_name);
+     };
+     fsmap.modify_filesystem(fs1_id, std::move(renamefunc1));
+     auto renamefunc2 = [fs1_name](auto&& fs) {
+       fs.get_mds_map().set_fs_name(fs1_name);
+     };
+     fsmap.modify_filesystem(fs2_id, std::move(renamefunc2));
+
+     if (swap_ids) {
+       fsmap.swap_fscids(fs1_id, fs2_id);
+     }
+
+     // Each fragment ends with its own trailing space, so the last one must
+     // not start with a space (it used to, producing a double space).
+     ss << "File system names ";
+     if (swap_ids) {
+       ss << "and FSCIDs ";
+     }
+     ss << "have been swapped; cephx credentials may need an upgrade.";
+
+     return 0;
+   }
+
+ private:
+   Paxos *m_paxos;
+ };
+
+ /**
+  * Handler for "fs rm_data_pool": detaches a data pool from a file system.
+  */
+ class RemoveDataPoolHandler : public FileSystemCommandHandler
+ {
+ public:
+   RemoveDataPoolHandler()
+     : FileSystemCommandHandler("fs rm_data_pool")
+   {}
+
+   /**
+    * The "pool" argument is looked up as a pool name first and, failing
+    * that, parsed as a decimal pool id.
+    *
+    * @return 0 on success or when remove_data_pool() reports -ENOENT (the
+    *         pool was already gone); -EINVAL for a missing file system
+    *         name, a negative pool id or an attempt to remove the first
+    *         data pool; -ENOENT for an unknown pool or file system; any
+    *         other error from remove_data_pool() is passed through.
+    */
+   int handle(
+     Monitor *mon,
+     FSMap& fsmap,
+     MonOpRequestRef op,
+     const cmdmap_t& cmdmap,
+     ostream &ss) override
+   {
+     string pool_arg;
+     cmd_getval(cmdmap, "pool", pool_arg);
+
+     string fs_name;
+     const bool have_fs_name = cmd_getval(cmdmap, "fs_name", fs_name);
+     if (!have_fs_name || fs_name.empty()) {
+       ss << "Missing filesystem name";
+       return -EINVAL;
+     }
+
+     int64_t pool_id = mon->osdmon()->osdmap.lookup_pg_pool_name(pool_arg);
+     if (pool_id < 0) {
+       // Not resolved by name; accept a numeric pool id instead.
+       string parse_err;
+       pool_id = strict_strtol(pool_arg.c_str(), 10, &parse_err);
+       if (!parse_err.empty()) {
+         ss << "pool '" << pool_arg << "' does not exist";
+         return -ENOENT;
+       }
+       if (pool_id < 0) {
+         ss << "invalid pool id '" << pool_id << "'";
+         return -EINVAL;
+       }
+     }
+
+     ceph_assert(pool_id >= 0); // guaranteed by the parsing above
+
+     auto* fsp = fsmap.get_filesystem(fs_name);
+     if (!fsp) {
+       ss << "filesystem '" << fs_name << "' does not exist";
+       return -ENOENT;
+     }
+
+     if (fsp->get_mds_map().get_first_data_pool() == pool_id) {
+       ss << "cannot remove default data pool";
+       return -EINVAL;
+     }
+
+     int r = 0;
+     fsmap.modify_filesystem(
+       fsp->get_fscid(),
+       [&r, pool_id](auto&& fs) {
+         r = fs.get_mds_map().remove_data_pool(pool_id);
+       });
+
+     if (r == 0) {
+       // The pool was removed.
+       ss << "removed data pool " << pool_id << " from fsmap";
+       return 0;
+     }
+     // -ENOENT means it was already removed: succeed silently. Anything
+     // else is unexpected and is handed back to the caller.
+     return (r == -ENOENT) ? 0 : r;
+   }
+ };
+
+/**
+ * For commands with an alternative prefix
+ */
+template<typename T>
+class AliasHandler : public T
+{
+ string alias_prefix;
+
+ public:
+ explicit AliasHandler(const string &new_prefix)
+ : T()
+ {
+ alias_prefix = new_prefix;
+ }
+
+ string const &get_prefix() const override {return alias_prefix;}
+
+ int handle(
+ Monitor *mon,
+ FSMap& fsmap,
+ MonOpRequestRef op,
+ const cmdmap_t& cmdmap,
+ ostream &ss) override
+ {
+ return T::handle(mon, fsmap, op, cmdmap, ss);
+ }
+};
+
+class MirrorHandlerEnable : public FileSystemCommandHandler
+{
+public:
+  MirrorHandlerEnable()
+    : FileSystemCommandHandler("fs mirror enable")
+  {}
+
+  // Turn mirroring on for the named filesystem; already-mirrored is a silent success.
+  int handle(Monitor *mon,
+             FSMap &fsmap, MonOpRequestRef op,
+             const cmdmap_t& cmdmap, ostream &ss) override {
+    string fs_name;
+    const bool have_name = cmd_getval(cmdmap, "fs_name", fs_name);
+    if (!have_name || fs_name.empty()) {
+      ss << "Missing filesystem name";
+      return -EINVAL;
+    }
+
+    auto* fsp = fsmap.get_filesystem(fs_name);
+    if (!fsp) {
+      ss << "Filesystem '" << fs_name << "' not found";
+      return -ENOENT;
+    }
+
+    // Only modify the fsmap when the state actually changes.
+    if (!fsp->get_mirror_info().is_mirrored()) {
+      fsmap.modify_filesystem(fsp->get_fscid(), [](auto&& fs) {
+        fs.get_mirror_info().enable_mirroring();
+      });
+    }
+
+    return 0;
+  }
+};
+
+class MirrorHandlerDisable : public FileSystemCommandHandler
+{
+public:
+  MirrorHandlerDisable()
+    : FileSystemCommandHandler("fs mirror disable")
+  {}
+
+  // Turn mirroring off for the named filesystem; not-mirrored is a silent success.
+  int handle(Monitor *mon,
+             FSMap &fsmap, MonOpRequestRef op,
+             const cmdmap_t& cmdmap, ostream &ss) override {
+    string fs_name;
+    const bool have_name = cmd_getval(cmdmap, "fs_name", fs_name);
+    if (!have_name || fs_name.empty()) {
+      ss << "Missing filesystem name";
+      return -EINVAL;
+    }
+
+    auto* fsp = fsmap.get_filesystem(fs_name);
+    if (!fsp) {
+      ss << "Filesystem '" << fs_name << "' not found";
+      return -ENOENT;
+    }
+
+    // Only modify the fsmap when the state actually changes.
+    if (fsp->get_mirror_info().is_mirrored()) {
+      fsmap.modify_filesystem(fsp->get_fscid(), [](auto&& fs) {
+        fs.get_mirror_info().disable_mirroring();
+      });
+    }
+
+    return 0;
+  }
+};
+
+class MirrorHandlerAddPeer : public FileSystemCommandHandler // handler for the "fs mirror peer_add" command
+{
+public:
+ MirrorHandlerAddPeer()
+ : FileSystemCommandHandler("fs mirror peer_add")
+ {}
+
+ boost::optional<pair<string, string>> // (client, cluster) on success, empty optional when spec contains no '@'
+ extract_remote_cluster_conf(const string &spec) {
+ auto pos = spec.find("@"); // split at the first '@'
+ if (pos == string_view::npos) {
+ return boost::optional<pair<string, string>>();
+ }
+
+ auto client = spec.substr(0, pos);
+ auto cluster = spec.substr(pos+1);
+
+ return make_pair(client, cluster);
+ }
+
+ bool peer_add(FSMap &fsmap, const Filesystem& fs, // returns false only for a malformed remote cluster spec
+ const cmdmap_t &cmdmap, ostream &ss) {
+ string peer_uuid;
+ string remote_spec;
+ string remote_fs_name;
+ cmd_getval(cmdmap, "uuid", peer_uuid);
+ cmd_getval(cmdmap, "remote_cluster_spec", remote_spec);
+ cmd_getval(cmdmap, "remote_fs_name", remote_fs_name);
+
+ // verify (and extract) remote cluster specification
+ auto remote_conf = extract_remote_cluster_conf(remote_spec);
+ if (!remote_conf) {
+ ss << "invalid remote cluster spec -- should be <client>@<cluster>";
+ return false;
+ }
+
+ if (fs.get_mirror_info().has_peer(peer_uuid)) { // duplicate by uuid: report it but still return true
+ ss << "peer already exists";
+ return true;
+ }
+ if (fs.get_mirror_info().has_peer((*remote_conf).first, (*remote_conf).second, // duplicate by (client, cluster, remote fs)
+ remote_fs_name)) {
+ ss << "peer already exists";
+ return true;
+ }
+
+ auto f = [peer_uuid, remote_conf, remote_fs_name](auto&& fs) { // everything is captured by value
+ fs.get_mirror_info().peer_add(peer_uuid, (*remote_conf).first,
+ (*remote_conf).second, remote_fs_name);
+ };
+ fsmap.modify_filesystem(fs.get_fscid(), std::move(f));
+ return true;
+ }
+
+ int handle(Monitor *mon, // 0 on success, -EINVAL / -ENOENT with a message in ss otherwise
+ FSMap &fsmap, MonOpRequestRef op,
+ const cmdmap_t& cmdmap, ostream &ss) override {
+ string fs_name;
+ if (!cmd_getval(cmdmap, "fs_name", fs_name) || fs_name.empty()) {
+ ss << "Missing filesystem name";
+ return -EINVAL;
+ }
+
+ auto* fsp = fsmap.get_filesystem(fs_name);
+ if (fsp == nullptr) {
+ ss << "Filesystem '" << fs_name << "' not found";
+ return -ENOENT;
+ }
+
+ if (!fsp->get_mirror_info().is_mirrored()) { // peers can only be added once mirroring is enabled
+ ss << "Mirroring not enabled for filesystem '" << fs_name << "'";
+ return -EINVAL;
+ }
+
+ auto res = peer_add(fsmap, *fsp, cmdmap, ss);
+ if (!res) {
+ return -EINVAL;
+ }
+
+ return 0;
+ }
+};
+
+class MirrorHandlerRemovePeer : public FileSystemCommandHandler // handler for the "fs mirror peer_remove" command
+{
+public:
+ MirrorHandlerRemovePeer()
+ : FileSystemCommandHandler("fs mirror peer_remove")
+ {}
+
+ bool peer_remove(FSMap &fsmap, const Filesystem& fs, // as written, every path returns true
+ const cmdmap_t &cmdmap, ostream &ss) {
+ string peer_uuid;
+ cmd_getval(cmdmap, "uuid", peer_uuid);
+
+ if (!fs.get_mirror_info().has_peer(peer_uuid)) { // unknown peer: note it in ss but still return true
+ ss << "cannot find peer with uuid: " << peer_uuid;
+ return true;
+ }
+
+ auto f = [peer_uuid](auto&& fs) {
+ fs.get_mirror_info().peer_remove(peer_uuid);
+ };
+ fsmap.modify_filesystem(fs.get_fscid(), std::move(f));
+ return true;
+ }
+
+ int handle(Monitor *mon, // 0 on success, -EINVAL / -ENOENT with a message in ss otherwise
+ FSMap &fsmap, MonOpRequestRef op,
+ const cmdmap_t& cmdmap, ostream &ss) override {
+ string fs_name;
+ if (!cmd_getval(cmdmap, "fs_name", fs_name) || fs_name.empty()) {
+ ss << "Missing filesystem name";
+ return -EINVAL;
+ }
+
+ auto* fsp = fsmap.get_filesystem(fs_name);
+ if (fsp == nullptr) {
+ ss << "Filesystem '" << fs_name << "' not found";
+ return -ENOENT;
+ }
+
+ if (!fsp->get_mirror_info().is_mirrored()) {
+ ss << "Mirroring not enabled for filesystem '" << fs_name << "'";
+ return -EINVAL;
+ }
+
+ auto res = peer_remove(fsmap, *fsp, cmdmap, ss);
+ if (!res) { // NOTE(review): peer_remove above never returns false, so this branch looks unreachable
+ return -EINVAL;
+ }
+
+ return 0;
+ }
+};
+
+list<std::shared_ptr<FileSystemCommandHandler> > // one instance of every filesystem command handler
+FileSystemCommandHandler::load(Paxos *paxos)
+{
+ list<std::shared_ptr<FileSystemCommandHandler> > handlers;
+
+ handlers.push_back(std::make_shared<SetHandler>());
+ handlers.push_back(std::make_shared<FailHandler>());
+ handlers.push_back(std::make_shared<FlagSetHandler>());
+ handlers.push_back(std::make_shared<CompatSetHandler>());
+ handlers.push_back(std::make_shared<RequiredClientFeaturesHandler>());
+ handlers.push_back(std::make_shared<AddDataPoolHandler>(paxos)); // some handlers are constructed with the Paxos pointer
+ handlers.push_back(std::make_shared<RemoveDataPoolHandler>());
+ handlers.push_back(std::make_shared<FsNewHandler>(paxos));
+ handlers.push_back(std::make_shared<RemoveFilesystemHandler>());
+ handlers.push_back(std::make_shared<ResetFilesystemHandler>());
+ handlers.push_back(std::make_shared<RenameFilesystemHandler>(paxos));
+ handlers.push_back(std::make_shared<SwapFilesystemHandler>(paxos));
+
+ handlers.push_back(std::make_shared<SetDefaultHandler>());
+ handlers.push_back(std::make_shared<AliasHandler<SetDefaultHandler> >( // same handler, additionally reachable as "fs set_default"
+ "fs set_default"));
+ handlers.push_back(std::make_shared<MirrorHandlerEnable>());
+ handlers.push_back(std::make_shared<MirrorHandlerDisable>());
+ handlers.push_back(std::make_shared<MirrorHandlerAddPeer>());
+ handlers.push_back(std::make_shared<MirrorHandlerRemovePeer>());
+
+ return handlers;
+}
+
+int FileSystemCommandHandler::_check_pool( // 0 if pool_id may be used by CephFS in the role given by `type`; else negative errno with the reason in *ss
+ OSDMap &osd_map,
+ const int64_t pool_id,
+ int type,
+ bool force,
+ ostream *ss,
+ bool allow_overlay) const
+{
+ ceph_assert(ss != NULL);
+
+ const pg_pool_t *pool = osd_map.get_pg_pool(pool_id);
+ if (!pool) {
+ *ss << "pool id '" << pool_id << "' does not exist";
+ return -ENOENT;
+ }
+
+ if (pool->has_snaps()) {
+ *ss << "pool(" << pool_id <<") already has mon-managed snaps; "
+ "can't attach pool to fs";
+ return -EOPNOTSUPP;
+ }
+
+ const string& pool_name = osd_map.get_pool_name(pool_id);
+ const auto& app_map = pool->application_metadata; // read-only use: bind by reference instead of copying the whole map
+
+ if (!allow_overlay && !force && !app_map.empty()) { // application tags are only enforced without force / allow_overlay
+ auto app = app_map.find(APP_NAME_CEPHFS);
+ if (app != app_map.end()) {
+ auto& [app_name, app_metadata] = *app;
+ auto itr = app_metadata.find("data"); // look for a "data" entry first, then "metadata"
+ if (itr == app_metadata.end()) {
+ itr = app_metadata.find("metadata");
+ }
+ if (itr != app_metadata.end()) {
+ auto& [type, filesystem] = *itr; // note: this local `type` shadows the function parameter within this scope
+ *ss << "RADOS pool '" << pool_name << "' is already used by filesystem '"
+ << filesystem << "' as a '" << type << "' pool for application '"
+ << app_name << "'";
+ return -EINVAL;
+ }
+ } else {
+ *ss << "RADOS pool '" << pool_name
+ << "' has another non-CephFS application enabled.";
+ return -EINVAL;
+ }
+ }
+
+ if (pool->is_erasure()) {
+ if (type == POOL_METADATA) { // never allowed, force does not override this
+ *ss << "pool '" << pool_name << "' (id '" << pool_id << "')"
+ << " is an erasure-coded pool. Use of erasure-coded pools"
+ << " for CephFS metadata is not permitted";
+ return -EINVAL;
+ } else if (type == POOL_DATA_DEFAULT && !force) {
+ *ss << "pool '" << pool_name << "' (id '" << pool_id << "')"
+ " is an erasure-coded pool."
+ " Use of an EC pool for the default data pool is discouraged;"
+ " see the online CephFS documentation for more information."
+ " Use --force to override.";
+ return -EINVAL;
+ } else if (!pool->allows_ecoverwrites()) {
+ // non-overwriteable EC pools are only acceptable with a cache tier overlay
+ if (!pool->has_tiers() || !pool->has_read_tier() || !pool->has_write_tier()) {
+ *ss << "pool '" << pool_name << "' (id '" << pool_id << "')"
+ << " is an erasure-coded pool, with no overwrite support";
+ return -EINVAL;
+ }
+
+ // That cache tier overlay must be writeback, not readonly (it's the
+ // write operations like modify+truncate we care about support for)
+ const pg_pool_t *write_tier = osd_map.get_pg_pool(
+ pool->write_tier);
+ ceph_assert(write_tier != NULL); // OSDMonitor shouldn't allow DNE tier
+ if (write_tier->cache_mode == pg_pool_t::CACHEMODE_FORWARD
+ || write_tier->cache_mode == pg_pool_t::CACHEMODE_READONLY) {
+ *ss << "EC pool '" << pool_name << "' has a write tier ("
+ << osd_map.get_pool_name(pool->write_tier)
+ << ") that is configured "
+ "to forward writes. Use a cache mode such as 'writeback' for "
+ "CephFS";
+ return -EINVAL;
+ }
+ }
+ }
+
+ if (pool->is_tier()) {
+ *ss << " pool '" << pool_name << "' (id '" << pool_id
+ << "') is already in use as a cache tier.";
+ return -EINVAL;
+ }
+
+ if (!force && !pool->application_metadata.empty() && // unlike the check above, this one ignores allow_overlay
+ pool->application_metadata.count(APP_NAME_CEPHFS) == 0) {
+ *ss << " pool '" << pool_name << "' (id '" << pool_id
+ << "') has a non-CephFS application enabled.";
+ return -EINVAL;
+ }
+
+ if (type != POOL_METADATA && pool->pg_autoscale_mode == pg_pool_t::pg_autoscale_mode_t::ON && !pool->has_flag(pg_pool_t::FLAG_BULK)) {
+ // TODO: consider issuing an info event in this case
+ *ss << " Pool '" << pool_name << "' (id '" << pool_id // advisory text only: execution still falls through to return 0
+ << "') has pg autoscale mode 'on' but is not marked as bulk." << std::endl
+ << " Consider setting the flag by running" << std::endl
+ << " # ceph osd pool set " << pool_name << " bulk true" << std::endl;
+ }
+
+ // Nothing special about this pool, so it is permissible
+ return 0;
+}
+
+int FileSystemCommandHandler::is_op_allowed( // 1 if the session may run this command on fs_name, else -ENOENT / -EPERM with a message in ss
+ const MonOpRequestRef& op, const FSMap& fsmap, const cmdmap_t& cmdmap,
+ ostream &ss) const
+{
+ string fs_name;
+ cmd_getval(cmdmap, "fs_name", fs_name);
+
+ // filter a copy, so that the caller's fsmap is left untouched.
+ FSMap fsmap_copy = fsmap;
+ fsmap_copy.filter(op->get_session()->get_allowed_fs_names());
+
+ auto* fsp = fsmap_copy.get_filesystem(fs_name);
+ if (fsp == nullptr) {
+ auto prefix = get_prefix();
+ /* let "fs rm", "fs rename" and "fs swap" handle idempotent cases where file systems do not exist */
+ if (!(prefix == "fs rm" || prefix == "fs rename" || prefix == "fs swap") &&
+ fsmap.get_filesystem(fs_name) == nullptr) { // checked against the unfiltered map: -ENOENT only if it truly does not exist
+ ss << "Filesystem not found: '" << fs_name << "'";
+ return -ENOENT;
+ }
+ }
+
+ if (!op->get_session()->fs_name_capable(fs_name, MON_CAP_W)) { // write capability on the filesystem is required
+ ss << "Permission denied: '" << fs_name << "'";
+ return -EPERM;
+ }
+
+ return 1;
+}
diff --git a/src/mon/FSCommands.h b/src/mon/FSCommands.h
new file mode 100644
index 000000000..3cac437a6
--- /dev/null
+++ b/src/mon/FSCommands.h
@@ -0,0 +1,98 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 Red Hat Ltd
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef FS_COMMANDS_H_
+#define FS_COMMANDS_H_
+
+#include "Monitor.h"
+#include "CommandHandler.h"
+
+#include "osd/OSDMap.h"
+#include "mds/FSMap.h"
+
+#include <string>
+#include <ostream>
+
+class FileSystemCommandHandler : protected CommandHandler // abstract base: one subclass per "fs ..." command prefix
+{
+protected:
+ std::string prefix; // command prefix this handler answers to
+
+ enum { // role a pool is being checked for in _check_pool()
+ POOL_METADATA,
+ POOL_DATA_DEFAULT,
+ POOL_DATA_EXTRA,
+ };
+ /**
+ * Return 0 if the pool is suitable for use with CephFS, or
+ * in case of errors return a negative error code, and populate
+ * the passed ostream with an explanation.
+ *
+ * @param type intended role of the pool, one of the POOL_* values above
+ */
+ int _check_pool(
+ OSDMap &osd_map,
+ const int64_t pool_id,
+ int type,
+ bool force,
+ std::ostream *ss,
+ bool allow_overlay = false) const;
+
+ virtual std::string const &get_prefix() const {return prefix;} // virtual so that a subclass can report a different prefix
+
+public:
+ FileSystemCommandHandler(const std::string &prefix_)
+ : prefix(prefix_)
+ {}
+
+ virtual ~FileSystemCommandHandler()
+ {}
+
+ int is_op_allowed(const MonOpRequestRef& op, const FSMap& fsmap,
+ const cmdmap_t& cmdmap, std::ostream &ss) const;
+
+ int can_handle(std::string const &prefix_, MonOpRequestRef& op, FSMap& fsmap, // 0: not our prefix; otherwise 1 or the result of is_op_allowed()
+ const cmdmap_t& cmdmap, std::ostream &ss) const
+ {
+ if (get_prefix() != prefix_) {
+ return 0;
+ }
+
+ if (get_prefix() == "fs new" || get_prefix() == "fs flag set") { // these two prefixes skip the is_op_allowed() check
+ return 1;
+ }
+
+ return is_op_allowed(op, fsmap, cmdmap, ss);
+ }
+
+ static std::list<std::shared_ptr<FileSystemCommandHandler> > load(Paxos *paxos); // builds the list of all handlers
+
+ virtual int handle( // execute the command; implemented by each concrete handler
+ Monitor *mon,
+ FSMap &fsmap,
+ MonOpRequestRef op,
+ const cmdmap_t& cmdmap,
+ std::ostream &ss) = 0;
+};
+
+
+static constexpr auto errmsg_for_unhealthy_mds = \
+ "MDS has one of two health warnings which could extend recovery: "
+ "MDS_TRIM or MDS_CACHE_OVERSIZED. MDS failover is not recommended "
+ "since it might cause unexpected file system unavailability. If "
+ "you wish to proceed, pass --yes-i-really-mean-it"; // shared message text; static, so each including file gets its own copy
+
+
+#endif
diff --git a/src/mon/HealthMonitor.cc b/src/mon/HealthMonitor.cc
new file mode 100644
index 000000000..45563f87d
--- /dev/null
+++ b/src/mon/HealthMonitor.cc
@@ -0,0 +1,887 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank, Inc
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <stdlib.h>
+#include <limits.h>
+#include <sstream>
+#include <regex>
+#include <time.h>
+#include <iterator>
+
+#include "include/ceph_assert.h"
+#include "include/common_fwd.h"
+#include "include/stringify.h"
+
+#include "mon/Monitor.h"
+#include "mon/HealthMonitor.h"
+
+#include "messages/MMonHealthChecks.h"
+
+#include "common/Formatter.h"
+
+#define dout_subsys ceph_subsys_mon
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, mon, this)
+using namespace TOPNSPC::common;
+
+using namespace std::literals;
+using std::cerr;
+using std::cout;
+using std::dec;
+using std::hex;
+using std::list;
+using std::map;
+using std::make_pair;
+using std::ostream;
+using std::ostringstream;
+using std::pair;
+using std::set;
+using std::setfill;
+using std::string;
+using std::stringstream;
+using std::to_string;
+using std::vector;
+using std::unique_ptr;
+
+using ceph::bufferlist;
+using ceph::decode;
+using ceph::encode;
+using ceph::Formatter;
+using ceph::JSONFormatter;
+using ceph::mono_clock;
+using ceph::mono_time;
+using ceph::parse_timespan;
+using ceph::timespan_str;
+static ostream& _prefix(std::ostream *_dout, const Monitor &mon, // log-line prefix used through the dout_prefix macro in this file
+ const HealthMonitor *hmon) { // hmon is not used in the body
+ return *_dout << "mon." << mon.name << "@" << mon.rank
+ << "(" << mon.get_state_name() << ").health ";
+}
+
+HealthMonitor::HealthMonitor(Monitor &m, Paxos &p, const string& service_name) // only forwards its arguments to PaxosService
+ : PaxosService(m, p, service_name) {
+}
+
+void HealthMonitor::init() // nothing to initialise: only logs the call
+{
+ dout(10) << __func__ << dendl;
+}
+
+void HealthMonitor::create_initial() // no initial state to create: only logs the call
+{
+ dout(10) << __func__ << dendl;
+}
+
+void HealthMonitor::update_from_paxos(bool *need_bootstrap) // reload quorum checks, leader checks and mutes from the store; need_bootstrap is not touched
+{
+ version = get_last_committed();
+ dout(10) << __func__ << dendl;
+ load_health();
+
+ bufferlist qbl;
+ mon.store->get(service_name, "quorum", qbl);
+ if (qbl.length()) { // an empty value means nothing is stored: reset to empty
+ auto p = qbl.cbegin();
+ decode(quorum_checks, p);
+ } else {
+ quorum_checks.clear();
+ }
+
+ bufferlist lbl;
+ mon.store->get(service_name, "leader", lbl);
+ if (lbl.length()) {
+ auto p = lbl.cbegin();
+ decode(leader_checks, p);
+ } else {
+ leader_checks.clear();
+ }
+
+ {
+ bufferlist bl;
+ mon.store->get(service_name, "mutes", bl);
+ if (bl.length()) {
+ auto p = bl.cbegin();
+ decode(mutes, p);
+ } else {
+ mutes.clear();
+ }
+ }
+
+ dout(20) << "dump:"; // this log statement is finished by the dendl at the end of the function
+ JSONFormatter jf(true);
+ jf.open_object_section("health");
+ jf.open_object_section("quorum_health");
+ for (auto& p : quorum_checks) {
+ string s = string("mon.") + stringify(p.first); // section name is "mon.<key>"
+ jf.dump_object(s.c_str(), p.second);
+ }
+ jf.close_section();
+ jf.dump_object("leader_health", leader_checks);
+ jf.close_section();
+ jf.flush(*_dout); // the JSON goes to the same stream as the "dump:" text above
+ *_dout << dendl;
+}
+
+void HealthMonitor::create_pending() // start the pending state from the currently committed mutes
+{
+ dout(10) << " " << version << dendl;
+ pending_mutes = mutes;
+}
+
+void HealthMonitor::encode_pending(MonitorDBStore::TransactionRef t) // write the next version of all health state into transaction t
+{
+ ++version;
+ dout(10) << " " << version << dendl;
+ put_last_committed(t, version);
+
+ bufferlist qbl;
+ encode(quorum_checks, qbl);
+ t->put(service_name, "quorum", qbl);
+ bufferlist lbl;
+ encode(leader_checks, lbl);
+ t->put(service_name, "leader", lbl);
+ {
+ bufferlist bl;
+ encode(pending_mutes, bl);
+ t->put(service_name, "mutes", bl);
+ }
+
+ health_check_map_t pending_health;
+
+ // combine per-mon details carefully...
+ map<string,set<string>> names; // code -> <mon names>
+ for (const auto& p : quorum_checks) { // by reference: avoids copying every per-mon check map
+ for (const auto& q : p.second.checks) { // by reference: avoids copying every individual check
+ names[q.first].insert(mon.monmap->get_name(p.first));
+ }
+ pending_health.merge(p.second);
+ }
+ for (auto &p : pending_health.checks) { // expand the %...% placeholders left in the per-mon summaries
+ p.second.summary = std::regex_replace(
+ p.second.summary,
+ std::regex("%hasorhave%"),
+ names[p.first].size() > 1 ? "have" : "has");
+ p.second.summary = std::regex_replace(
+ p.second.summary,
+ std::regex("%names%"), stringify(names[p.first]));
+ p.second.summary = std::regex_replace(
+ p.second.summary,
+ std::regex("%plurals%"),
+ names[p.first].size() > 1 ? "s" : "");
+ p.second.summary = std::regex_replace(
+ p.second.summary,
+ std::regex("%isorare%"),
+ names[p.first].size() > 1 ? "are" : "is");
+ }
+
+ pending_health.merge(leader_checks); // leader checks are merged after the substitution pass
+ encode_health(pending_health, t);
+}
+
+version_t HealthMonitor::get_trim_to() const
+{
+  // No old state is actually needed, but a few versions are kept
+  // anyway: trim to five behind the current version, or not at
+  // all while fewer than that exist.
+  constexpr version_t keep = 5;
+  return version > keep ? version - keep : 0;
+}
+
+bool HealthMonitor::preprocess_query(MonOpRequestRef op) // dispatch on message type; false hands the op on to prepare_update()
+{
+ auto m = op->get_req<PaxosServiceMessage>();
+ switch (m->get_type()) {
+ case MSG_MON_COMMAND:
+ return preprocess_command(op);
+ case MSG_MON_HEALTH_CHECKS:
+ return false; // always needs an update, see prepare_health_checks()
+ default:
+ mon.no_reply(op);
+ derr << "Unhandled message type " << m->get_type() << dendl;
+ return true;
+ }
+}
+
+bool HealthMonitor::prepare_update(MonOpRequestRef op) // dispatch an update op to the matching prepare_* routine
+{
+ Message *m = op->get_req();
+ dout(7) << "prepare_update " << *m
+ << " from " << m->get_orig_source_inst() << dendl;
+ switch (m->get_type()) {
+ case MSG_MON_HEALTH_CHECKS:
+ return prepare_health_checks(op);
+ case MSG_MON_COMMAND:
+ return prepare_command(op);
+ default:
+ return false; // unknown message types change nothing
+ }
+}
+
+bool HealthMonitor::preprocess_command(MonOpRequestRef op) // validation only: true means a reply was already sent, false passes the command on
+{
+ auto m = op->get_req<MMonCommand>();
+ std::stringstream ss;
+ bufferlist rdata;
+
+ cmdmap_t cmdmap;
+ if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) { // malformed command: reply with the parser's message
+ string rs = ss.str();
+ mon.reply_command(op, -EINVAL, rs, rdata, get_last_committed());
+ return true;
+ }
+
+ MonSession *session = op->get_session();
+ if (!session) {
+ mon.reply_command(op, -EACCES, "access denied", rdata,
+ get_last_committed());
+ return true;
+ }
+ // more sanity checks
+ try {
+ string format; // both values are fetched only to trigger bad_cmd_get; they are discarded
+ cmd_getval(cmdmap, "format", format);
+ string prefix;
+ cmd_getval(cmdmap, "prefix", prefix);
+ } catch (const bad_cmd_get& e) {
+ mon.reply_command(op, -EINVAL, e.what(), rdata, get_last_committed());
+ return true;
+ }
+ return false;
+}
+
+bool HealthMonitor::prepare_command(MonOpRequestRef op) // implements "health mute" and "health unmute" by editing pending_mutes
+{
+ auto m = op->get_req<MMonCommand>();
+
+ std::stringstream ss;
+ bufferlist rdata;
+
+ cmdmap_t cmdmap;
+ if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
+ string rs = ss.str();
+ mon.reply_command(op, -EINVAL, rs, rdata, get_last_committed());
+ return true;
+ }
+
+ MonSession *session = op->get_session();
+ if (!session) {
+ mon.reply_command(op, -EACCES, "access denied", rdata, get_last_committed());
+ return true;
+ }
+
+ string format = cmd_getval_or<string>(cmdmap, "format", "plain");
+ boost::scoped_ptr<Formatter> f(Formatter::create(format)); // NOTE(review): f is not referenced again in this function
+
+ string prefix;
+ cmd_getval(cmdmap, "prefix", prefix);
+
+ int r = 0;
+
+ if (prefix == "health mute") {
+ string code;
+ bool sticky = false;
+ if (!cmd_getval(cmdmap, "code", code) ||
+ code == "") {
+ r = -EINVAL;
+ ss << "must specify an alert code to mute";
+ goto out;
+ }
+ cmd_getval(cmdmap, "sticky", sticky);
+ string ttl_str;
+ utime_t ttl; // stays default-constructed when no "ttl" argument is given
+ std::chrono::seconds secs;
+ if (cmd_getval(cmdmap, "ttl", ttl_str)) {
+ try {
+ secs = parse_timespan(ttl_str);
+ if (secs == 0s) { // a zero duration is rejected through the same catch block
+ throw std::invalid_argument("timespan = 0");
+ }
+ } catch (const std::invalid_argument& e) {
+ ss << "invalid duration: " << ttl_str << " (" << e.what() << ")";
+ r = -EINVAL;
+ goto out;
+ }
+
+ ttl = ceph_clock_now(); // absolute expiry = now + requested duration
+ ttl += std::chrono::duration<double>(secs).count();
+ }
+ health_check_map_t all;
+ gather_all_health_checks(&all);
+ string summary;
+ int64_t count = 0;
+ if (!sticky) { // a non-sticky mute requires the alert to be raised right now
+ auto p = all.checks.find(code);
+ if (p == all.checks.end()) {
+ r = -ENOENT;
+ ss << "health alert " << code << " is not currently raised";
+ goto out;
+ }
+ count = p->second.count; // remembered so check_mutes() can detect later changes
+ summary = p->second.summary;
+ }
+ auto& m = pending_mutes[code]; // creates or overwrites the mute; this m shadows the message pointer above
+ m.code = code;
+ m.ttl = ttl;
+ m.sticky = sticky;
+ m.summary = summary;
+ m.count = count;
+ } else if (prefix == "health unmute") {
+ string code;
+ if (cmd_getval(cmdmap, "code", code)) {
+ pending_mutes.erase(code);
+ } else {
+ pending_mutes.clear(); // no code given: remove every mute
+ }
+ } else {
+ ss << "Command '" << prefix << "' not implemented!";
+ r = -ENOSYS;
+ }
+
+out:
+ dout(4) << __func__ << " done, r=" << r << dendl;
+ /* Compose response */
+ string rs;
+ getline(ss, rs); // only the first line of ss is used as the reply string
+
+ if (r >= 0) {
+ // success.. delay reply
+ wait_for_commit(op, new Monitor::C_Command(mon, op, r, rs,
+ get_last_committed() + 1));
+ return true;
+ } else {
+ // reply immediately
+ mon.reply_command(op, r, rs, rdata, get_last_committed());
+ return false;
+ }
+}
+
+bool HealthMonitor::prepare_health_checks(MonOpRequestRef op) // store the sender's reported checks; always returns true
+{
+ auto m = op->get_req<MMonHealthChecks>();
+ // no need to check if it's changed, the peon has done so
+ quorum_checks[m->get_source().num()] = std::move(m->health_checks); // keyed by the sender's numeric id
+ return true;
+}
+
+void HealthMonitor::tick()
+{
+  if (!is_active()) {
+    return;
+  }
+  dout(10) << __func__ << dendl;
+
+  // Every monitor refreshes its own checks; only the leader goes further.
+  bool changed = check_member_health();
+  if (!mon.is_leader()) {
+    return;
+  }
+
+  // Both leader-side checks must always run, hence the
+  // non-short-circuiting |= instead of ||.
+  changed |= check_leader_health();
+  changed |= check_mutes();
+
+  // Propose only when at least one check reported a difference.
+  if (changed) {
+    propose_pending();
+  }
+}
+
+bool HealthMonitor::check_mutes() // drop or adjust pending mutes that no longer apply; true if pending_mutes changed
+{
+ bool changed = false;
+ auto now = ceph_clock_now();
+ health_check_map_t all;
+ gather_all_health_checks(&all);
+ auto p = pending_mutes.begin();
+ while (p != pending_mutes.end()) { // manual iteration: erase() supplies the next iterator
+ if (p->second.ttl != utime_t() && // a default-constructed ttl means "no expiry"
+ p->second.ttl <= now) {
+ mon.clog->info() << "Health alert mute " << p->first
+ << " cleared (passed TTL " << p->second.ttl << ")";
+ p = pending_mutes.erase(p);
+ changed = true;
+ continue;
+ }
+ if (!p->second.sticky) { // sticky mutes are only ever removed by the TTL check above
+ auto q = all.checks.find(p->first);
+ if (q == all.checks.end()) {
+ mon.clog->info() << "Health alert mute " << p->first
+ << " cleared (health alert cleared)";
+ p = pending_mutes.erase(p);
+ changed = true;
+ continue;
+ }
+ if (p->second.count) {
+ // count-based mute
+ if (q->second.count > p->second.count) {
+ mon.clog->info() << "Health alert mute " << p->first
+ << " cleared (count increased from " << p->second.count
+ << " to " << q->second.count << ")";
+ p = pending_mutes.erase(p);
+ changed = true;
+ continue;
+ }
+ if (q->second.count < p->second.count) {
+ // ratchet down the mute
+ dout(10) << __func__ << " mute " << p->first << " count "
+ << p->second.count << " -> " << q->second.count
+ << dendl;
+ p->second.count = q->second.count;
+ changed = true;
+ }
+ } else {
+ // summary-based mute
+ if (p->second.summary != q->second.summary) {
+ mon.clog->info() << "Health alert mute " << p->first
+ << " cleared (summary changed)";
+ p = pending_mutes.erase(p);
+ changed = true;
+ continue;
+ }
+ }
+ }
+ ++p;
+ }
+ return changed;
+}
+
+void HealthMonitor::gather_all_health_checks(health_check_map_t *all) // merge the checks of every entry in mon.paxos_service into *all
+{
+ for (auto& svc : mon.paxos_service) {
+ all->merge(svc->get_health_checks());
+ }
+}
+
+health_status_t HealthMonitor::get_health_status( // overall status over unmuted checks; report goes to f if set, else to *plain
+ bool want_detail,
+ Formatter *f,
+ std::string *plain,
+ const char *sep1,
+ const char *sep2)
+{
+ health_check_map_t all;
+ gather_all_health_checks(&all);
+ health_status_t r = HEALTH_OK;
+ for (auto& p : all.checks) {
+ if (!mutes.count(p.first)) { // muted checks do not affect the overall status
+ if (r > p.second.severity) { // a smaller severity value replaces the current status
+ r = p.second.severity;
+ }
+ }
+ }
+ if (f) {
+ f->open_object_section("health");
+ f->dump_stream("status") << r;
+ f->open_object_section("checks");
+ for (auto& p : all.checks) { // structured output lists every check, muted or not
+ f->open_object_section(p.first.c_str());
+ p.second.dump(f, want_detail);
+ f->dump_bool("muted", mutes.count(p.first));
+ f->close_section();
+ }
+ f->close_section();
+ f->open_array_section("mutes");
+ for (auto& p : mutes) {
+ f->dump_object("mute", p.second);
+ }
+ f->close_section();
+ f->close_section();
+ } else {
+ auto now = ceph_clock_now();
+ // one-liner: HEALTH_FOO[ thing1[; thing2 ...]]
+ string summary;
+ for (auto& p : all.checks) {
+ if (!mutes.count(p.first)) {
+ if (!summary.empty()) {
+ summary += sep2; // sep2 separates summaries, sep1 follows the status word
+ }
+ summary += p.second.summary;
+ }
+ }
+ *plain = stringify(r);
+ if (summary.size()) {
+ *plain += sep1;
+ *plain += summary;
+ }
+ if (!mutes.empty()) {
+ if (summary.size()) {
+ *plain += sep2;
+ } else {
+ *plain += sep1;
+ }
+ *plain += "(muted:";
+ for (auto& p : mutes) {
+ *plain += " ";
+ *plain += p.first;
+ if (p.second.ttl) {
+ if (p.second.ttl > now) {
+ auto left = p.second.ttl; // remaining time = ttl - now
+ left -= now;
+ *plain += "("s + utimespan_str(left) + ")";
+ } else {
+ *plain += "(0s)";
+ }
+ }
+ }
+ *plain += ")";
+ }
+ *plain += "\n";
+ // detail
+ if (want_detail) {
+ for (auto& p : all.checks) {
+ auto q = mutes.find(p.first);
+ if (q != mutes.end()) {
+ *plain += "(MUTED";
+ if (q->second.ttl != utime_t()) {
+ if (q->second.ttl > now) {
+ auto left = q->second.ttl;
+ left -= now;
+ *plain += " ttl ";
+ *plain += utimespan_str(left);
+ } else {
+ *plain += " ttl 0s"; // expired mute: same " ttl " label as the live branch (was "0s", giving "(MUTED0s")
+ }
+ }
+ if (q->second.sticky) {
+ *plain += ", STICKY";
+ }
+ *plain += ") ";
+ }
+ *plain += "["s + short_health_string(p.second.severity) + "] " +
+ p.first + ": " + p.second.summary + "\n";
+ for (auto& d : p.second.detail) {
+ *plain += " ";
+ *plain += d;
+ *plain += "\n";
+ }
+ }
+ }
+ }
+ return r;
+}
+
+bool HealthMonitor::check_member_health() // recompute this monitor's own checks; true only when the leader recorded a change to propose
+{
+ dout(20) << __func__ << dendl;
+ bool changed = false;
+ const auto max = g_conf().get_val<uint64_t>("mon_health_max_detail"); // cap on detail lines gathered per check below
+
+ // snapshot of usage
+ DataStats stats;
+ get_fs_stats(stats.fs_stats, g_conf()->mon_data.c_str());
+ map<string,uint64_t> extra;
+ uint64_t store_size = mon.store->get_estimated_size(extra);
+ ceph_assert(store_size > 0);
+ stats.store_stats.bytes_total = store_size;
+ stats.store_stats.bytes_sst = extra["sst"];
+ stats.store_stats.bytes_log = extra["log"];
+ stats.store_stats.bytes_misc = extra["misc"];
+ stats.last_update = ceph_clock_now();
+ dout(10) << __func__ << " avail " << stats.fs_stats.avail_percent << "%"
+ << " total " << byte_u_t(stats.fs_stats.byte_total)
+ << ", used " << byte_u_t(stats.fs_stats.byte_used)
+ << ", avail " << byte_u_t(stats.fs_stats.byte_avail) << dendl;
+
+ // MON_DISK_{LOW,CRIT,BIG}
+ health_check_map_t next;
+ if (stats.fs_stats.avail_percent <= g_conf()->mon_data_avail_crit) { // CRIT and LOW are mutually exclusive, CRIT is tested first
+ stringstream ss, ss2;
+ ss << "mon%plurals% %names% %isorare% very low on available space"; // %...% placeholders are expanded in encode_pending()
+ auto& d = next.add("MON_DISK_CRIT", HEALTH_ERR, ss.str(), 1);
+ ss2 << "mon." << mon.name << " has " << stats.fs_stats.avail_percent
+ << "% avail";
+ d.detail.push_back(ss2.str());
+ } else if (stats.fs_stats.avail_percent <= g_conf()->mon_data_avail_warn) {
+ stringstream ss, ss2;
+ ss << "mon%plurals% %names% %isorare% low on available space";
+ auto& d = next.add("MON_DISK_LOW", HEALTH_WARN, ss.str(), 1);
+ ss2 << "mon." << mon.name << " has " << stats.fs_stats.avail_percent
+ << "% avail";
+ d.detail.push_back(ss2.str());
+ }
+ if (stats.store_stats.bytes_total >= g_conf()->mon_data_size_warn) {
+ stringstream ss, ss2;
+ ss << "mon%plurals% %names% %isorare% using a lot of disk space";
+ auto& d = next.add("MON_DISK_BIG", HEALTH_WARN, ss.str(), 1);
+ ss2 << "mon." << mon.name << " is "
+ << byte_u_t(stats.store_stats.bytes_total)
+ << " >= mon_data_size_warn ("
+ << byte_u_t(g_conf()->mon_data_size_warn) << ")";
+ d.detail.push_back(ss2.str());
+ }
+
+ // OSD_NO_DOWN_OUT_INTERVAL
+ {
+ // Warn if 'mon_osd_down_out_interval' is set to zero.
+ // Having this option set to zero on the leader acts much like the
+ // 'noout' flag. It's hard to figure out what's going wrong with clusters
+ // without the 'noout' flag set but acting like that just the same, so
+ // we report a HEALTH_WARN in case this option is set to zero.
+ // This is an ugly hack to get the warning out, but until we find a way
+ // to spread global options throughout the mon cluster and have all mons
+ // using a base set of the same options, we need to work around this sort
+ // of thing.
+ // There's also the obvious drawback that if this is set on a single
+ // monitor on a 3-monitor cluster, this warning will only be shown every
+ // third monitor connection.
+ if (g_conf()->mon_warn_on_osd_down_out_interval_zero &&
+ g_conf()->mon_osd_down_out_interval == 0) {
+ ostringstream ss, ds;
+ ss << "mon%plurals% %names% %hasorhave% mon_osd_down_out_interval set to 0";
+ auto& d = next.add("OSD_NO_DOWN_OUT_INTERVAL", HEALTH_WARN, ss.str(), 1);
+ ds << "mon." << mon.name << " has mon_osd_down_out_interval set to 0";
+ d.detail.push_back(ds.str());
+ }
+ }
+
+ // AUTH_INSECURE_GLOBAL_ID_RECLAIM
+ if (g_conf().get_val<bool>("mon_warn_on_insecure_global_id_reclaim") &&
+ g_conf().get_val<bool>("auth_allow_insecure_global_id_reclaim")) {
+ // Warn if there are any clients that are insecurely renewing their global_id
+ std::lock_guard l(mon.session_map_lock); // held while the session list is walked
+ list<std::string> detail;
+ for (auto p = mon.session_map.sessions.begin();
+ p != mon.session_map.sessions.end();
+ ++p) {
+ if ((*p)->global_id_status == global_id_status_t::RECLAIM_INSECURE) {
+ ostringstream ds;
+ ds << (*p)->entity_name << " at " << (*p)->addrs
+ << " is using insecure global_id reclaim";
+ detail.push_back(ds.str());
+ if (detail.size() >= max) { // stop at the cap and mark the truncation with "..."
+ detail.push_back("...");
+ break;
+ }
+ }
+ }
+ if (!detail.empty()) {
+ ostringstream ss;
+ ss << "client%plurals% %isorare% using insecure global_id reclaim";
+ auto& d = next.add("AUTH_INSECURE_GLOBAL_ID_RECLAIM", HEALTH_WARN, ss.str(),
+ detail.size()); // the count includes the "..." marker when the list was truncated
+ d.detail.swap(detail);
+ }
+ }
+ // AUTH_INSECURE_GLOBAL_ID_RECLAIM_ALLOWED
+ if (g_conf().get_val<bool>("mon_warn_on_insecure_global_id_reclaim_allowed") &&
+ g_conf().get_val<bool>("auth_allow_insecure_global_id_reclaim")) {
+ ostringstream ss, ds;
+ ss << "mon%plurals% %isorare% allowing insecure global_id reclaim";
+ auto& d = next.add("AUTH_INSECURE_GLOBAL_ID_RECLAIM_ALLOWED", HEALTH_WARN, ss.str(), 1);
+ ds << "mon." << mon.name << " has auth_allow_insecure_global_id_reclaim set to true";
+ d.detail.push_back(ds.str());
+ }
+
+ auto p = quorum_checks.find(mon.rank); // compare with what is currently recorded for this monitor
+ if (p == quorum_checks.end()) {
+ if (next.empty()) { // nothing recorded and nothing to report
+ return false;
+ }
+ } else {
+ if (p->second == next) { // unchanged since the last recorded state
+ return false;
+ }
+ }
+
+ if (mon.is_leader()) {
+ // prepare to propose
+ quorum_checks[mon.rank] = next;
+ changed = true;
+ } else {
+ // tell the leader
+ mon.send_mon_message(new MMonHealthChecks(next), mon.get_leader()); // changed stays false on this path
+ }
+
+ return changed;
+}
+
+ /**
+  * Recompute the leader-only health checks.
+  *
+  * Stored per-rank checks for monitors that have left the quorum are
+  * dropped first, then the leader checks are rebuilt and compared with
+  * the stored copy.
+  *
+  * @return true if quorum_checks or leader_checks was modified.
+  */
+ bool HealthMonitor::check_leader_health()
+ {
+   dout(20) << __func__ << dendl;
+   bool dirty = false;
+
+   // forget checks reported by ranks that are no longer quorum members
+   const auto& in_quorum = mon.get_quorum();
+   for (auto it = quorum_checks.begin(); it != quorum_checks.end(); ) {
+     if (in_quorum.count(it->first) != 0) {
+       ++it;
+       continue;
+     }
+     it = quorum_checks.erase(it);
+     dirty = true;
+   }
+
+   // rebuild the leader checks from scratch
+   health_check_map_t fresh;
+   if (g_conf().get_val<bool>("mon_warn_on_older_version")) {
+     check_for_older_version(&fresh);   // DAEMON_OLD_VERSION
+   }
+   check_for_mon_down(&fresh);          // MON_DOWN
+   check_for_clock_skew(&fresh);        // MON_CLOCK_SKEW
+   if (g_conf().get_val<bool>("mon_warn_on_msgr2_not_enabled")) {
+     check_if_msgr2_enabled(&fresh);    // MON_MSGR2_NOT_ENABLED
+   }
+
+   if (fresh != leader_checks) {
+     leader_checks = fresh;
+     dirty = true;
+   }
+   return dirty;
+ }
+
+ // Add a DAEMON_OLD_VERSION check to *checks when mon.get_all_versions()
+ // reports more than one version.
+ //
+ // A function-local static timestamp is armed on the first call (or the
+ // first call after it was reset to zero).  Nothing is evaluated until
+ // "mon_warn_older_version_delay" has elapsed since that timestamp.  The
+ // timestamp is reset only when, after the delay, at most one version is
+ // reported.
+ void HealthMonitor::check_for_older_version(health_check_map_t *checks)
+ {
+ static ceph::coarse_mono_time old_version_first_time =
+ ceph::coarse_mono_clock::zero();
+
+ auto now = ceph::coarse_mono_clock::now();
+ if (ceph::coarse_mono_clock::is_zero(old_version_first_time)) {
+ old_version_first_time = now;
+ }
+ const auto warn_delay = g_conf().get_val<std::chrono::seconds>("mon_warn_older_version_delay");
+ if (now - old_version_first_time > warn_delay) {
+ std::map<string, std::list<string> > all_versions;
+ mon.get_all_versions(all_versions);
+ if (all_versions.size() > 1) {
+ dout(20) << __func__ << " all_versions=" << all_versions << dendl;
+ // The last entry has the largest version
+ dout(20) << __func__ << " highest version daemon count "
+ << all_versions.rbegin()->second.size() << dendl;
+ // Erase last element (the highest version running)
+ all_versions.erase(all_versions.rbegin()->first);
+ ceph_assert(all_versions.size() > 0);
+ ostringstream ss;
+ // total number of daemons across all remaining (older) versions
+ unsigned daemon_count = 0;
+ for (auto& g : all_versions) {
+ daemon_count += g.second.size();
+ }
+ int ver_count = all_versions.size();
+ // a single daemon cannot be spread over several versions
+ ceph_assert(!(daemon_count == 1 && ver_count != 1));
+ ss << "There " << (daemon_count == 1 ? "is a daemon" : "are daemons")
+ << " running " << (ver_count > 1 ? "multiple old versions" : "an older version") << " of ceph";
+ // more than one distinct old version escalates the check to an error
+ health_status_t status;
+ if (ver_count > 1)
+ status = HEALTH_ERR;
+ else
+ status = HEALTH_WARN;
+ auto& d = checks->add("DAEMON_OLD_VERSION", status, ss.str(), all_versions.size());
+ // one detail line per old version, listing the daemons running it
+ for (auto& g : all_versions) {
+ ostringstream ds;
+ for (auto& i : g.second) { // Daemon list
+ ds << i << " ";
+ }
+ ds << (g.second.size() == 1 ? "is" : "are")
+ << " running an older version of ceph: " << g.first;
+ d.detail.push_back(ds.str());
+ }
+ } else {
+ // versions agree again: re-arm the delay timer
+ old_version_first_time = ceph::coarse_mono_clock::zero();
+ }
+ }
+ }
+
+ /**
+  * Add a MON_DOWN check when the quorum is smaller than the monmap.
+  *
+  * The check is suppressed until both the "mon_down_mkfs_grace" period
+  * (measured from the monmap creation time) and the
+  * "mon_down_uptime_grace" period (measured from this monitor's start
+  * time) have passed.
+  *
+  * @param checks map that receives the check
+  */
+ void HealthMonitor::check_for_mon_down(health_check_map_t *checks)
+ {
+   const int total = mon.monmap->size();
+   const auto& quorum = mon.get_quorum();
+   const int present = quorum.size();
+   if (present >= total) {
+     return;
+   }
+
+   const auto cluster_age =
+     ceph::real_clock::now() - mon.monmap->created.to_real_time();
+   if (!(cluster_age >
+         g_conf().get_val<std::chrono::seconds>("mon_down_mkfs_grace"))) {
+     return;
+   }
+   const auto uptime = ceph::coarse_mono_clock::now() - mon.get_starttime();
+   if (!(uptime >
+         g_conf().get_val<std::chrono::seconds>("mon_down_uptime_grace"))) {
+     return;
+   }
+
+   const int missing = total - present;
+   ostringstream summary;
+   summary << missing << "/" << total << " mons down, quorum "
+           << mon.get_quorum_names();
+   auto& check = checks->add("MON_DOWN", HEALTH_WARN, summary.str(), missing);
+
+   // one detail line for every rank that is absent from the quorum
+   for (int rank = 0; rank < total; ++rank) {
+     if (quorum.count(rank) != 0) {
+       continue;
+     }
+     ostringstream line;
+     line << "mon." << mon.monmap->get_name(rank) << " (rank " << rank
+          << ") addr " << mon.monmap->get_addrs(rank)
+          << " is down (out of quorum)";
+     check.detail.push_back(line.str());
+   }
+ }
+
+ /**
+  * Add a MON_CLOCK_SKEW check listing every monitor whose timecheck
+  * status is not HEALTH_OK.
+  *
+  * @param checks map that receives the check
+  */
+ void HealthMonitor::check_for_clock_skew(health_check_map_t *checks)
+ {
+   if (mon.timecheck_skews.empty()) {
+     return;
+   }
+
+   list<string> details;
+   ostringstream summary;
+   summary << "clock skew detected on";
+   for (auto& entry : mon.timecheck_skews) {
+     const double skew = entry.second;
+     const double latency = mon.timecheck_latencies[entry.first];
+     const string name = mon.monmap->get_name(entry.first);
+     ostringstream status_text;
+     if (mon.timecheck_status(status_text, skew, latency) == HEALTH_OK) {
+       continue;
+     }
+     // names are joined as " mon.a, mon.b": a comma precedes every
+     // name except the first one
+     if (!details.empty()) {
+       summary << ",";
+     }
+     summary << " mon." << name;
+
+     ostringstream line;
+     line << "mon." << name << " " << status_text.str()
+          << " (latency " << latency << "s)";
+     details.push_back(line.str());
+   }
+
+   if (details.empty()) {
+     return;
+   }
+   auto& check = checks->add("MON_CLOCK_SKEW", HEALTH_WARN, summary.str(),
+                             details.size());
+   check.detail.swap(details);
+ }
+
+ /**
+  * Add a MON_MSGR2_NOT_ENABLED check for monitors in the monmap whose
+  * public addresses have no msgr2 entry.
+  *
+  * Nothing is reported unless "ms_bind_msgr2" is set and the monmap
+  * requires the nautilus feature.
+  *
+  * @param checks map that receives the check
+  */
+ void HealthMonitor::check_if_msgr2_enabled(health_check_map_t *checks)
+ {
+   if (!g_conf().get_val<bool>("ms_bind_msgr2")) {
+     return;
+   }
+   if (!mon.monmap->get_required_features().contains_all(
+         ceph::features::mon::FEATURE_NAUTILUS)) {
+     return;
+   }
+
+   list<string> unbound;
+   for (auto& info : mon.monmap->mon_info) {
+     if (info.second.public_addrs.has_msgr2()) {
+       continue;
+     }
+     ostringstream line;
+     line << "mon." << info.first << " is not bound to a msgr2 port, only "
+          << info.second.public_addrs;
+     unbound.push_back(line.str());
+   }
+   if (unbound.empty()) {
+     return;
+   }
+
+   ostringstream summary;
+   summary << unbound.size() << " monitors have not enabled msgr2";
+   auto& check = checks->add("MON_MSGR2_NOT_ENABLED", HEALTH_WARN,
+                             summary.str(), unbound.size());
+   check.detail.swap(unbound);
+ }
diff --git a/src/mon/HealthMonitor.h b/src/mon/HealthMonitor.h
new file mode 100644
index 000000000..c0e79d033
--- /dev/null
+++ b/src/mon/HealthMonitor.h
@@ -0,0 +1,75 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank, Inc
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#ifndef CEPH_HEALTH_MONITOR_H
+#define CEPH_HEALTH_MONITOR_H
+
+#include "mon/PaxosService.h"
+
+ // Paxos service holding the monitor health state: a health check map per
+ // quorum member, the leader-only checks, and the health mutes.
+ class HealthMonitor : public PaxosService
+ {
+ version_t version = 0;
+ std::map<int,health_check_map_t> quorum_checks; // for each quorum member
+ health_check_map_t leader_checks; // leader only
+ std::map<std::string,health_mute_t> mutes;
+
+ // presumably the mutes staged for the next proposal -- confirm in the .cc
+ std::map<std::string,health_mute_t> pending_mutes;
+
+ public:
+ HealthMonitor(Monitor &m, Paxos &p, const std::string& service_name);
+
+ /**
+ * @defgroup HealthMonitor_Inherited_h Inherited abstract methods
+ * @{
+ */
+ void init() override;
+
+ bool preprocess_query(MonOpRequestRef op) override;
+ bool prepare_update(MonOpRequestRef op) override;
+
+ void create_initial() override;
+ void update_from_paxos(bool *need_bootstrap) override;
+ void create_pending() override;
+ void encode_pending(MonitorDBStore::TransactionRef t) override;
+ version_t get_trim_to() const override;
+
+ // intentionally empty: this service writes nothing in encode_full()
+ void encode_full(MonitorDBStore::TransactionRef t) override { }
+
+ void tick() override;
+
+ // NOTE(review): the two methods below are not overrides although they sit
+ // inside the "inherited" doxygen group.
+ void gather_all_health_checks(health_check_map_t *all);
+ health_status_t get_health_status(
+ bool want_detail,
+ ceph::Formatter *f,
+ std::string *plain,
+ const char *sep1 = " ",
+ const char *sep2 = "; ");
+
+ /**
+ * @} // HealthMonitor_Inherited_h
+ */
+ private:
+ bool preprocess_command(MonOpRequestRef op);
+
+ bool prepare_command(MonOpRequestRef op);
+ bool prepare_health_checks(MonOpRequestRef op);
+ // helpers used by check_leader_health(); each adds at most one check
+ void check_for_older_version(health_check_map_t *checks);
+ void check_for_mon_down(health_check_map_t *checks);
+ void check_for_clock_skew(health_check_map_t *checks);
+ void check_if_msgr2_enabled(health_check_map_t *checks);
+ // each returns true when the stored state changed
+ bool check_leader_health();
+ bool check_member_health();
+ bool check_mutes();
+ };
+
+#endif // CEPH_HEALTH_MONITOR_H
diff --git a/src/mon/KVMonitor.cc b/src/mon/KVMonitor.cc
new file mode 100644
index 000000000..21a18db09
--- /dev/null
+++ b/src/mon/KVMonitor.cc
@@ -0,0 +1,533 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "mon/Monitor.h"
+#include "mon/KVMonitor.h"
+#include "include/stringify.h"
+#include "messages/MKVData.h"
+
+#define dout_subsys ceph_subsys_mon
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, mon, this)
+
+using std::ostream;
+using std::ostringstream;
+using std::set;
+using std::string;
+using std::stringstream;
+
+ // Log-line prefix used by dout in this file (see dout_prefix above):
+ // "mon.<name>@<rank>(<state>).kv ".  The hmon argument is unused.
+ static ostream& _prefix(std::ostream *_dout, const Monitor &mon,
+ const KVMonitor *hmon) {
+ return *_dout << "mon." << mon.name << "@" << mon.rank
+ << "(" << mon.get_state_name() << ").kv ";
+ }
+
+ // store prefix under which all config-key entries are kept
+ const string KV_PREFIX = "mon_config_key";
+
+ // number of old versions kept by get_trim_to()
+ const int MAX_HISTORY = 50;
+
+
+ // Return true if s contains any byte other than printable ASCII
+ // (0x20..0x7e), newline or tab.  \n and \t are escaped in JSON; other
+ // control characters are not, so such values are reported as binary.
+ static bool is_binary_string(const string& s)
+ {
+   for (size_t i = 0; i < s.size(); ++i) {
+     const unsigned char b = static_cast<unsigned char>(s[i]);
+     const bool printable = (b >= 0x20 && b < 0x7f);
+     if (!printable && b != '\n' && b != '\t') {
+       return true;
+     }
+   }
+   return false;
+ }
+
+
+ // Forwards all arguments to PaxosService; no additional setup.
+ KVMonitor::KVMonitor(Monitor &m, Paxos &p, const string& service_name)
+ : PaxosService(m, p, service_name) {
+ }
+
+ // Only emits a debug line.
+ void KVMonitor::init()
+ {
+ dout(10) << __func__ << dendl;
+ }
+
+ // Reset to an empty state: version 0 and no pending changes.
+ void KVMonitor::create_initial()
+ {
+ dout(10) << __func__ << dendl;
+ version = 0;
+ pending.clear();
+ }
+
+ // Adopt the last committed version and, if it changed, notify subscribers.
+ // need_bootstrap is not used.
+ void KVMonitor::update_from_paxos(bool *need_bootstrap)
+ {
+   const version_t committed = get_last_committed();
+   if (committed == version) {
+     return;  // nothing new
+   }
+   version = committed;
+   dout(10) << __func__ << " " << version << dendl;
+   check_all_subs();
+ }
+
+ // Start a new, empty set of pending changes on top of the current version.
+ void KVMonitor::create_pending()
+ {
+   // include the function name so the debug line is identifiable, as the
+   // other methods of this service do
+   dout(10) << __func__ << " " << version << dendl;
+   pending.clear();
+ }
+
+ // Write the pending changes into transaction t as version+1.
+ //
+ // The encoded delta is stored under the new version number (it is read
+ // back by maybe_send_update() for incremental updates), and every pending
+ // entry is applied under KV_PREFIX: an engaged value is put, a disengaged
+ // one erases the key.
+ void KVMonitor::encode_pending(MonitorDBStore::TransactionRef t)
+ {
+   // include the function name so the debug line is identifiable
+   dout(10) << __func__ << " " << (version+1) << dendl;
+   put_last_committed(t, version+1);
+
+   // record the delta for this commit point
+   bufferlist bl;
+   encode(pending, bl);
+   put_version(t, version+1, bl);
+
+   // make actual changes
+   for (auto& p : pending) {
+     const string& key = p.first;  // no need to copy the key
+     if (p.second) {
+       dout(20) << __func__ << " set " << key << dendl;
+       t->put(KV_PREFIX, key, *p.second);
+     } else {
+       dout(20) << __func__ << " rm " << key << dendl;
+       t->erase(KV_PREFIX, key);
+     }
+   }
+ }
+
+ // Keep the most recent MAX_HISTORY versions; returns 0 (nothing to trim)
+ // while fewer than that exist.
+ version_t KVMonitor::get_trim_to() const
+ {
+   return (version > MAX_HISTORY) ? (version - MAX_HISTORY) : 0;
+ }
+
+ // Add the store prefixes written by this service to s: the service's own
+ // name and KV_PREFIX (where encode_pending() puts the key/value data).
+ void KVMonitor::get_store_prefixes(set<string>& s) const
+ {
+ s.insert(service_name);
+ s.insert(KV_PREFIX);
+ }
+
+ // Periodic hook: only logs, and only on an active leader.
+ void KVMonitor::tick()
+ {
+   if (is_active() && mon.is_leader()) {
+     dout(10) << __func__ << dendl;
+   }
+ }
+
+ // Nothing to do when the service becomes active.
+ void KVMonitor::on_active()
+ {
+ }
+
+
+ // Entry point for read-only handling.  Only MSG_MON_COMMAND is handled;
+ // any other message type returns false.  A bad_cmd_get thrown while
+ // handling the command is answered with -EINVAL.
+ bool KVMonitor::preprocess_query(MonOpRequestRef op)
+ {
+   if (op->get_req()->get_type() != MSG_MON_COMMAND) {
+     return false;
+   }
+   try {
+     return preprocess_command(op);
+   } catch (const bad_cmd_get& bad) {
+     bufferlist empty;
+     mon.reply_command(op, -EINVAL, bad.what(), empty, get_last_committed());
+   }
+   return true;
+ }
+
+ // Handle the read-only "config-key" commands: get, exists, list/ls, dump.
+ //
+ // Returns true after a reply has been sent (including the reply to a
+ // command that fails JSON parsing), false for any other command prefix.
+ bool KVMonitor::preprocess_command(MonOpRequestRef op)
+ {
+ auto m = op->get_req<MMonCommand>();
+ std::stringstream ss;
+ int err = 0;
+
+ cmdmap_t cmdmap;
+ if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
+ string rs = ss.str();
+ mon.reply_command(op, -EINVAL, rs, get_last_committed());
+ return true;
+ }
+ string format = cmd_getval_or<string>(cmdmap, "format", "plain");
+ // f may be null here; list and dump fall back to json-pretty below
+ boost::scoped_ptr<Formatter> f(Formatter::create(format));
+
+ string prefix;
+ cmd_getval(cmdmap, "prefix", prefix);
+ string key;
+ cmd_getval(cmdmap, "key", key);
+
+ bufferlist odata;
+
+ if (prefix == "config-key get") {
+ // the store's return code becomes the command result
+ err = mon.store->get(KV_PREFIX, key, odata);
+ }
+ else if (prefix == "config-key exists") {
+ bool exists = mon.store->exists(KV_PREFIX, key);
+ ss << "key '" << key << "'";
+ if (exists) {
+ ss << " exists";
+ err = 0;
+ } else {
+ ss << " doesn't exist";
+ err = -ENOENT;
+ }
+ }
+ else if (prefix == "config-key list" ||
+ prefix == "config-key ls") {
+ if (!f) {
+ f.reset(Formatter::create("json-pretty"));
+ }
+ KeyValueDB::Iterator iter = mon.store->get_iterator(KV_PREFIX);
+ f->open_array_section("keys");
+ while (iter->valid()) {
+ // note: this local shadows the outer "key" argument
+ string key(iter->key());
+ f->dump_string("key", key);
+ iter->next();
+ }
+ f->close_section();
+
+ stringstream tmp_ss;
+ f->flush(tmp_ss);
+ odata.append(tmp_ss);
+ err = 0;
+ }
+ else if (prefix == "config-key dump") {
+ if (!f) {
+ f.reset(Formatter::create("json-pretty"));
+ }
+
+ KeyValueDB::Iterator iter = mon.store->get_iterator(KV_PREFIX);
+ // an optional key restricts the dump to entries starting with it
+ if (key.size()) {
+ iter->lower_bound(key);
+ }
+ f->open_object_section("config-key store");
+ while (iter->valid()) {
+ if (key.size() &&
+ iter->key().find(key) != 0) {
+ break;
+ }
+ string s = iter->value().to_str();
+ // values that are not plain text are replaced by a length placeholder
+ if (is_binary_string(s)) {
+ ostringstream ss;
+ ss << "<<< binary blob of length " << s.size() << " >>>";
+ f->dump_string(iter->key().c_str(), ss.str());
+ } else {
+ f->dump_string(iter->key().c_str(), s);
+ }
+ iter->next();
+ }
+ f->close_section();
+
+ stringstream tmp_ss;
+ f->flush(tmp_ss);
+ odata.append(tmp_ss);
+ err = 0;
+ }
+ else {
+ // not a read-only command
+ return false;
+ }
+
+ mon.reply_command(op, err, ss.str(), odata, get_last_committed());
+ return true;
+ }
+
+ // Entry point for state-changing requests.  Only MSG_MON_COMMAND is
+ // handled; any other message type returns false.  A bad_cmd_get thrown
+ // while handling the command is answered with -EINVAL.
+ bool KVMonitor::prepare_update(MonOpRequestRef op)
+ {
+   Message *req = op->get_req();
+   dout(7) << "prepare_update " << *req
+           << " from " << req->get_orig_source_inst() << dendl;
+   if (req->get_type() != MSG_MON_COMMAND) {
+     return false;
+   }
+   try {
+     return prepare_command(op);
+   } catch (const bad_cmd_get& bad) {
+     bufferlist empty;
+     mon.reply_command(op, -EINVAL, bad.what(), empty, get_last_committed());
+   }
+   return true;
+ }
+
+
+ // Handle the state-changing "config-key" commands: set/put and del/rm.
+ //
+ // On success the change is queued in `pending`, an immediate proposal is
+ // requested and the reply is deferred until commit (returns true).
+ // Errors are answered right away through the `reply` label, which
+ // returns false.  A command that fails JSON parsing is answered with
+ // -EINVAL and returns true.
+ bool KVMonitor::prepare_command(MonOpRequestRef op)
+ {
+ auto m = op->get_req<MMonCommand>();
+ std::stringstream ss;
+ int err = 0;
+ bufferlist odata;
+
+ cmdmap_t cmdmap;
+ if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
+ string rs = ss.str();
+ mon.reply_command(op, -EINVAL, rs, get_last_committed());
+ return true;
+ }
+
+ string prefix;
+ cmd_getval(cmdmap, "prefix", prefix);
+ string key;
+ if (!cmd_getval(cmdmap, "key", key)) {
+ err = -EINVAL;
+ ss << "must specify a key";
+ goto reply;
+ }
+
+
+ if (prefix == "config-key set" ||
+ prefix == "config-key put") {
+ bufferlist data;
+ string val;
+ if (cmd_getval(cmdmap, "val", val)) {
+ // they specified a value in the command instead of a file
+ data.append(val);
+ } else if (m->get_data_len() > 0) {
+ // they specified '-i <file>'
+ data = m->get_data();
+ }
+ // with neither a value nor a payload, an empty value is stored
+ if (data.length() > (size_t) g_conf()->mon_config_key_max_entry_size) {
+ err = -EFBIG; // File too large
+ ss << "error: entry size limited to "
+ << g_conf()->mon_config_key_max_entry_size << " bytes. "
+ << "Use 'mon config key max entry size' to manually adjust";
+ goto reply;
+ }
+
+ ss << "set " << key;
+ pending[key] = data;
+ goto update;
+ }
+ else if (prefix == "config-key del" ||
+ prefix == "config-key rm") {
+ // queued as a removal without checking whether the key exists
+ ss << "key deleted";
+ pending[key].reset();
+ goto update;
+ }
+ else {
+ ss << "unknown command " << prefix;
+ err = -EINVAL;
+ }
+
+ reply:
+ mon.reply_command(op, err, ss.str(), odata, get_last_committed());
+ return false;
+
+ update:
+ // see if there is an actual change
+ // (both paths that jump here have just inserted into pending, so this
+ // is not expected to trigger)
+ if (pending.empty()) {
+ err = 0;
+ goto reply;
+ }
+ force_immediate_propose(); // faster response
+ wait_for_commit(
+ op,
+ new Monitor::C_Command(
+ mon, op, 0, ss.str(), odata,
+ get_last_committed() + 1));
+ return true;
+ }
+
+
+
+
+ // Build the config-key name "dm-crypt/osd/<uuid>/<k>".
+ static string _get_dmcrypt_prefix(const uuid_d& uuid, const string k)
+ {
+   string name("dm-crypt/osd/");
+   name += stringify(uuid);
+   name += "/";
+   name += k;
+   return name;
+ }
+
+ // Return true if at least one key stored under KV_PREFIX starts with
+ // `prefix`.
+ //
+ // Keys are iterated in sorted order, so the first key that is >= prefix
+ // is the only candidate that needs to be inspected; this avoids walking
+ // (and copying) every key in the store.
+ bool KVMonitor::_have_prefix(const string &prefix)
+ {
+   KeyValueDB::Iterator iter = mon.store->get_iterator(KV_PREFIX);
+   iter->lower_bound(prefix);
+   return iter->valid() &&
+     iter->key().compare(0, prefix.size(), prefix) == 0;
+ }
+
+ // Check whether any config-key entries exist for the given OSD.
+ //
+ // Returns 0 if keys exist under either the dm-crypt prefix for `uuid` or
+ // the daemon-private prefix for `id`, -ENOENT if neither has any.
+ int KVMonitor::validate_osd_destroy(
+   const int32_t id,
+   const uuid_d& uuid)
+ {
+   if (_have_prefix(_get_dmcrypt_prefix(uuid, ""))) {
+     return 0;
+   }
+   if (_have_prefix("daemon-private/osd." + stringify(id) + "/")) {
+     return 0;
+   }
+   return -ENOENT;
+ }
+
+ // Queue removal of every config-key entry belonging to the given OSD
+ // (dm-crypt keys for `uuid` and daemon-private keys for `id`) and
+ // propose the change.  The service must be writeable.
+ void KVMonitor::do_osd_destroy(int32_t id, uuid_d& uuid)
+ {
+   ceph_assert(is_writeable());
+
+   string dmcrypt_prefix = _get_dmcrypt_prefix(uuid, "");
+   string daemon_prefix =
+     "daemon-private/osd." + stringify(id) + "/";
+
+   for (const auto& prefix : { dmcrypt_prefix, daemon_prefix }) {
+     KeyValueDB::Iterator iter = mon.store->get_iterator(KV_PREFIX);
+     // Walk every key that starts with this prefix.  The iterator is
+     // checked for validity before it is dereferenced, and a prefix
+     // without keys no longer prevents the next prefix from being
+     // processed.
+     for (iter->lower_bound(prefix);
+          iter->valid() &&
+            iter->key().compare(0, prefix.size(), prefix) == 0;
+          iter->next()) {
+       pending[iter->key()].reset();
+     }
+   }
+
+   propose_pending();
+ }
+
+ // Check whether a dm-crypt "luks" key can be stored for `uuid`.
+ //
+ // Returns:
+ //   0        no key is stored yet
+ //   EEXIST   (positive) the stored key equals dmcrypt_key; the operation
+ //            would be idempotent
+ //   -EEXIST  a different key is stored; a message is written to ss
+ //   < 0      the error returned by the store when reading the key
+ int KVMonitor::validate_osd_new(
+   const uuid_d& uuid,
+   const string& dmcrypt_key,
+   stringstream& ss)
+ {
+   const string luks_key = _get_dmcrypt_prefix(uuid, "luks");
+   if (!mon.store->exists(KV_PREFIX, luks_key)) {
+     return 0;
+   }
+
+   bufferlist stored;
+   const int r = mon.store->get(KV_PREFIX, luks_key, stored);
+   if (r < 0) {
+     dout(10) << __func__ << " unable to get dm-crypt key from store (r = "
+              << r << ")" << dendl;
+     return r;
+   }
+
+   bufferlist wanted;
+   wanted.append(dmcrypt_key);
+   if (!stored.contents_equal(wanted)) {
+     ss << "dm-crypt key already exists and does not match";
+     return -EEXIST;
+   }
+   // both values match; this will be an idempotent op.
+   return EEXIST;
+ }
+
+ // Queue the dm-crypt "luks" key for `uuid` and propose the change.
+ // Paxos must be plugged and the service writeable.
+ void KVMonitor::do_osd_new(
+   const uuid_d& uuid,
+   const string& dmcrypt_key)
+ {
+   ceph_assert(paxos.is_plugged());
+   ceph_assert(is_writeable());
+
+   bufferlist value;
+   value.append(dmcrypt_key);
+   pending[_get_dmcrypt_prefix(uuid, "luks")] = value;
+
+   propose_pending();
+ }
+
+
+ // Check every "kv:" subscription of an authenticated session.
+ // Unauthenticated sessions are ignored.
+ void KVMonitor::check_sub(MonSession *s)
+ {
+   if (!s->authenticated) {
+     dout(20) << __func__ << " not authenticated " << s->entity_name << dendl;
+     return;
+   }
+   for (auto& entry : s->sub_map) {
+     if (entry.first.find("kv:") != 0) {
+       continue;  // not a kv subscription
+     }
+     check_sub(entry.second);
+   }
+ }
+
+ // Send an update to a single subscription if it is due (its `next`
+ // version is not ahead of ours) and drop it afterwards when it is a
+ // one-time subscription.
+ void KVMonitor::check_sub(Subscription *sub)
+ {
+   dout(10) << __func__
+            << " next " << sub->next
+            << " have " << version << dendl;
+   if (sub->next > version) {
+     return;
+   }
+   maybe_send_update(sub);
+   if (!sub->onetime) {
+     return;
+   }
+   mon.with_session_map([sub](MonSessionMap& session_map) {
+     session_map.remove_sub(sub);
+   });
+ }
+
+ // Offer an update to every "kv:" subscription in the session map and log
+ // how many of them were actually sent one.
+ void KVMonitor::check_all_subs()
+ {
+   dout(10) << __func__ << dendl;
+   int sent = 0;
+   int seen = 0;
+   for (auto& entry : mon.session_map.subs) {
+     if (entry.first.find("kv:") != 0) {
+       continue;
+     }
+     // advance the iterator before handling the subscription
+     for (auto it = entry.second->begin(); !it.end(); ) {
+       auto sub = *it;
+       ++it;
+       ++seen;
+       if (maybe_send_update(sub)) {
+         ++sent;
+       }
+     }
+   }
+   dout(10) << __func__ << " updated " << sent << " / " << seen << dendl;
+ }
+
+ // Send an MKVData message to a subscriber if its `next` version is not
+ // ahead of the current version.
+ //
+ // The key prefix of interest is the subscription type with its first
+ // three characters removed (callers select types starting with "kv:").
+ // If the subscriber already has a version (next != 0) and next is newer
+ // than the first committed version, an incremental update is built from
+ // the stored per-version deltas; otherwise a full dump of all matching
+ // keys is sent.
+ //
+ // Returns true if a message was sent, false if the subscriber is
+ // already up to date.
+ bool KVMonitor::maybe_send_update(Subscription *sub)
+ {
+ if (sub->next > version) {
+ return false;
+ }
+
+ auto m = new MKVData;
+ m->prefix = sub->type.substr(3);
+ m->version = version;
+
+ if (sub->next && sub->next > get_first_committed()) {
+ // incremental
+ m->incremental = true;
+
+ // replay every delta from sub->next up to the current version
+ for (version_t cur = sub->next; cur <= version; ++cur) {
+ bufferlist bl;
+ int err = get_version(cur, bl);
+ ceph_assert(err == 0);
+
+ // note: this local shadows the `pending` member
+ std::map<std::string,std::optional<ceph::buffer::list>> pending;
+ auto p = bl.cbegin();
+ ceph::decode(pending, p);
+
+ // later versions overwrite earlier entries for the same key
+ for (auto& i : pending) {
+ if (i.first.find(m->prefix) == 0) {
+ m->data[i.first] = i.second;
+ }
+ }
+ }
+
+ dout(10) << __func__ << " incremental keys for " << m->prefix
+ << ", v " << sub->next << ".." << version
+ << ", " << m->data.size() << " keys"
+ << dendl;
+ } else {
+ m->incremental = false;
+
+ // full dump: every stored key that starts with the prefix
+ KeyValueDB::Iterator iter = mon.store->get_iterator(KV_PREFIX);
+ iter->lower_bound(m->prefix);
+ while (iter->valid() &&
+ iter->key().find(m->prefix) == 0) {
+ m->data[iter->key()] = iter->value();
+ iter->next();
+ }
+
+ dout(10) << __func__ << " sending full dump of " << m->prefix
+ << ", " << m->data.size() << " keys"
+ << dendl;
+ }
+ sub->session->con->send_message(m);
+ // the subscriber now wants whatever comes after the current version
+ sub->next = version + 1;
+ return true;
+ }
diff --git a/src/mon/KVMonitor.h b/src/mon/KVMonitor.h
new file mode 100644
index 000000000..8171ad34f
--- /dev/null
+++ b/src/mon/KVMonitor.h
@@ -0,0 +1,69 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <optional>
+
+#include "mon/PaxosService.h"
+
+class MonSession;
+
+extern const std::string KV_PREFIX;
+
+ // Paxos service backing the "config-key" commands and "kv:" subscriptions.
+ class KVMonitor : public PaxosService
+ {
+ version_t version = 0;
+ // changes queued for the next commit; an engaged optional sets the key,
+ // a disengaged one removes it
+ std::map<std::string,std::optional<ceph::buffer::list>> pending;
+
+ // true if any stored key starts with `prefix`
+ bool _have_prefix(const std::string &prefix);
+
+ public:
+ KVMonitor(Monitor &m, Paxos &p, const std::string& service_name);
+
+ void init() override;
+
+ void get_store_prefixes(std::set<std::string>& s) const override;
+
+ bool preprocess_command(MonOpRequestRef op);
+ bool prepare_command(MonOpRequestRef op);
+
+ bool preprocess_query(MonOpRequestRef op) override;
+ bool prepare_update(MonOpRequestRef op) override;
+
+ void create_initial() override;
+ void update_from_paxos(bool *need_bootstrap) override;
+ void create_pending() override;
+ void encode_pending(MonitorDBStore::TransactionRef t) override;
+ version_t get_trim_to() const override;
+
+ // intentionally empty: this service writes nothing in encode_full()
+ void encode_full(MonitorDBStore::TransactionRef t) override { }
+
+ void on_active() override;
+ void tick() override;
+
+ // OSD lifecycle helpers: validate_* only inspect the store, do_* queue
+ // the change and call propose_pending()
+ int validate_osd_destroy(const int32_t id, const uuid_d& uuid);
+ void do_osd_destroy(int32_t id, uuid_d& uuid);
+ int validate_osd_new(
+ const uuid_d& uuid,
+ const std::string& dmcrypt_key,
+ std::stringstream& ss);
+ void do_osd_new(const uuid_d& uuid, const std::string& dmcrypt_key);
+
+ // subscription handling for types starting with "kv:"
+ void check_sub(MonSession *s);
+ void check_sub(Subscription *sub);
+ void check_all_subs();
+
+ // returns true if an update message was sent to the subscriber
+ bool maybe_send_update(Subscription *sub);
+
+
+ // used by other services to adjust kv content; note that callers MUST ensure that
+ // propose_pending() is called and a commit is forced to provide atomicity and
+ // proper subscriber notifications.
+ void enqueue_set(const std::string& key, bufferlist &v) {
+ pending[key] = v;
+ }
+ void enqueue_rm(const std::string& key) {
+ pending[key].reset();
+ }
+ };
diff --git a/src/mon/LogMonitor.cc b/src/mon/LogMonitor.cc
new file mode 100644
index 000000000..ad18a3aa1
--- /dev/null
+++ b/src/mon/LogMonitor.cc
@@ -0,0 +1,1293 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+/*
+
+ -- Storage scheme --
+
+ Pre-quincy:
+
+ - LogSummary contains last N entries for every channel
+ - LogSummary (as "full") written on every commit
+ - LogSummary contains "keys", a LogEntryKey hash_set for the
+ same set of entries (for deduping)
+
+ Quincy+:
+
+ - LogSummary contains, for each channel,
+ - start seq
+ - end seq (last written seq + 1)
+ - LogSummary contains an LRUSet for tracking dups
+ - LogSummary written every N commits
+ - each LogEntry written in a separate key
+ - "%s/%08x" % (channel, seq) -> LogEntry
+ - per-commit record includes channel -> begin (trim bounds)
+ - 'external_log_to' meta records version to which we have logged externally
+
+*/
+
+
+
+#include <boost/algorithm/string/predicate.hpp>
+
+#include <iterator>
+#include <sstream>
+#include <syslog.h>
+
+#include "LogMonitor.h"
+#include "Monitor.h"
+#include "MonitorDBStore.h"
+
+#include "messages/MMonCommand.h"
+#include "messages/MLog.h"
+#include "messages/MLogAck.h"
+#include "common/Graylog.h"
+#include "common/Journald.h"
+#include "common/errno.h"
+#include "common/strtol.h"
+#include "include/ceph_assert.h"
+#include "include/str_list.h"
+#include "include/str_map.h"
+#include "include/compat.h"
+#include "include/utime_fmt.h"
+
+#define dout_subsys ceph_subsys_mon
+
+using namespace TOPNSPC::common;
+
+using std::cerr;
+using std::cout;
+using std::dec;
+using std::hex;
+using std::list;
+using std::map;
+using std::make_pair;
+using std::multimap;
+using std::ostream;
+using std::ostringstream;
+using std::pair;
+using std::set;
+using std::setfill;
+using std::string;
+using std::stringstream;
+using std::to_string;
+using std::vector;
+using std::unique_ptr;
+
+using ceph::bufferlist;
+using ceph::decode;
+using ceph::encode;
+using ceph::Formatter;
+using ceph::JSONFormatter;
+using ceph::make_message;
+using ceph::mono_clock;
+using ceph::mono_time;
+using ceph::timespan_str;
+
+ // Return the log file name for `channel` with the channel meta variable
+ // expanded.  The expansion is computed on first use and cached in
+ // expanded_log_file.
+ string LogMonitor::log_channel_info::get_log_file(const string &channel)
+ {
+   dout(25) << __func__ << " for channel '"
+            << channel << "'" << dendl;
+
+   auto cached = expanded_log_file.find(channel);
+   if (cached == expanded_log_file.end()) {
+     string fname = expand_channel_meta(
+       get_str_map_key(log_file, channel, &CLOG_CONFIG_DEFAULT_KEY),
+       channel);
+     cached = expanded_log_file.emplace(channel, fname).first;
+
+     dout(20) << __func__ << " for channel '"
+              << channel << "' expanded to '"
+              << fname << "'" << dendl;
+   }
+   return cached->second;
+ }
+
+
+ // Expand the channel meta variable in every value of m, substituting the
+ // entry's own key.
+ void LogMonitor::log_channel_info::expand_channel_meta(map<string,string> &m)
+ {
+   dout(20) << __func__ << " expand map: " << m << dendl;
+   for (auto& kv : m) {
+     kv.second = expand_channel_meta(kv.second, kv.first);
+   }
+   dout(20) << __func__ << " expanded map: " << m << dendl;
+ }
+
+ // Return a copy of `input` in which every occurrence of LOG_META_CHANNEL
+ // is replaced by `change_to`.
+ //
+ // The search resumes after each inserted replacement, so a replacement
+ // that itself contains LOG_META_CHANNEL is not expanded again and cannot
+ // make the loop run forever.
+ string LogMonitor::log_channel_info::expand_channel_meta(
+   const string &input,
+   const string &change_to)
+ {
+   string s(input);
+   size_t pos = 0;
+   while ((pos = s.find(LOG_META_CHANNEL, pos)) != string::npos) {
+     s.replace(pos, LOG_META_CHANNEL.length(), change_to);
+     pos += change_to.length();
+   }
+   dout(20) << __func__ << " from '" << input
+            << "' to '" << s << "'" << dendl;
+
+   return s;
+ }
+
+ // Decide whether `channel` should be logged to syslog.
+ //
+ // The setting is kept as a string in 'log_to_syslog', so booleans are
+ // parsed here using a modified version of how
+ // md_config_t::set_val_raw() handles them: 'true'/'false' (any case)
+ // and '1'/'0' are accepted.  The main distinction from the original
+ // code is that anything else is treated as 'false'.
+ bool LogMonitor::log_channel_info::do_log_to_syslog(const string &channel) {
+   const string v = get_str_map_key(log_to_syslog, channel,
+                                    &CLOG_CONFIG_DEFAULT_KEY);
+   if (boost::iequals(v, "true")) {
+     return true;
+   }
+   if (boost::iequals(v, "false")) {
+     return false;
+   }
+
+   // numeric form: only a cleanly parsed 1 enables syslog
+   std::string err;
+   const int parsed = strict_strtol(v.c_str(), 10, &err);
+   return err.empty() && parsed == 1;
+ }
+
+ // Return the Graylog client for `channel`, creating and caching it on
+ // first use.
+ //
+ // Host and port are resolved once through get_str_map_key() (channel
+ // entry first, then the default key) and the same values are used both
+ // for the destination and for the debug output.  Indexing the config
+ // maps with operator[] in the log statement would insert empty entries
+ // for the channel and print an empty host/port when the default key
+ // supplied the values.
+ ceph::logging::Graylog::Ref LogMonitor::log_channel_info::get_graylog(
+   const string &channel)
+ {
+   dout(25) << __func__ << " for channel '"
+            << channel << "'" << dendl;
+
+   if (graylogs.count(channel) == 0) {
+     auto graylog(std::make_shared<ceph::logging::Graylog>("mon"));
+
+     const string host = get_str_map_key(log_to_graylog_host, channel,
+                                         &CLOG_CONFIG_DEFAULT_KEY);
+     const string port = get_str_map_key(log_to_graylog_port, channel,
+                                         &CLOG_CONFIG_DEFAULT_KEY);
+
+     graylog->set_fsid(g_conf().get_val<uuid_d>("fsid"));
+     graylog->set_hostname(g_conf()->host);
+     graylog->set_destination(host, atoi(port.c_str()));
+
+     graylogs[channel] = graylog;
+     dout(20) << __func__ << " for channel '"
+              << channel << "' to graylog host '"
+              << host << ":"
+              << port
+              << "'" << dendl;
+   }
+   return graylogs[channel];
+ }
+
+ // Return the journald cluster logger, creating it on first use.
+ ceph::logging::JournaldClusterLogger &LogMonitor::log_channel_info::get_journald()
+ {
+ dout(25) << __func__ << dendl;
+
+ if (!journald) {
+ journald = std::make_unique<ceph::logging::JournaldClusterLogger>();
+ }
+ return *journald;
+ }
+
+ // Drop all per-channel settings together with the cached state derived
+ // from them (expanded file names, graylog clients, journald logger).
+ void LogMonitor::log_channel_info::clear()
+ {
+ log_to_syslog.clear();
+ syslog_level.clear();
+ syslog_facility.clear();
+ log_file.clear();
+ expanded_log_file.clear();
+ log_file_level.clear();
+ log_to_graylog.clear();
+ log_to_graylog_host.clear();
+ log_to_graylog_port.clear();
+ log_to_journald.clear();
+ graylogs.clear();
+ journald.reset();
+ }
+
+ // Defaulted out of line.  NOTE(review): presumably so that the header
+ // does not need the complete type held by the journald unique_ptr --
+ // confirm against LogMonitor.h.
+ LogMonitor::log_channel_info::log_channel_info() = default;
+ LogMonitor::log_channel_info::~log_channel_info() = default;
+
+
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, mon, get_last_committed())
+ // Log-line prefix used by dout from here on (see dout_prefix above):
+ // "mon.<name>@<rank>(<state>).log v<version> ".
+ static ostream& _prefix(std::ostream *_dout, Monitor &mon, version_t v) {
+ return *_dout << "mon." << mon.name << "@" << mon.rank
+ << "(" << mon.get_state_name()
+ << ").log v" << v << " ";
+ }
+
+ // Stream a LogMonitor as the fixed string "log"; pm is not inspected.
+ ostream& operator<<(ostream &out, const LogMonitor &pm)
+ {
+ return out << "log";
+ }
+
+/*
+ Tick function to update the map based on performance every N seconds
+*/
+
+void LogMonitor::tick()
+{
+ if (!is_active()) return;
+
+ dout(10) << *this << dendl;
+
+}
+
+ // Seed the pending log with a single "mkfs <fsid>" entry on the cluster
+ // channel, attributed to this monitor.
+ void LogMonitor::create_initial()
+ {
+   dout(10) << "create_initial -- creating initial map" << dendl;
+
+   std::stringstream text;
+   text << "mkfs " << mon.monmap->get_fsid();
+
+   LogEntry first;
+   first.name = g_conf()->name;
+   first.rank = entity_name_t::MON(mon.rank);
+   first.addrs = mon.messenger->get_myaddrs();
+   first.stamp = ceph_clock_now();
+   first.prio = CLOG_INFO;
+   first.channel = CLOG_CHANNEL_CLUSTER;
+   first.msg = text.str();
+   first.seq = 0;
+
+   pending_log.insert(pair<utime_t,LogEntry>(first.stamp, first));
+ }
+
+ // Bring the in-memory summary up to the last committed version.
+ //
+ // Steps: flush any external-logging backlog, load the latest full
+ // summary if it is newer than what we hold, then replay each
+ // incremental version on top of it.  Incrementals come in two formats,
+ // selected by the leading struct_v byte: 1 is the legacy pre-quincy
+ // format, anything else the newer per-entry format.  Finally the
+ // external-log watermark is persisted and subscriptions are checked.
+ // need_bootstrap is not used.
+ void LogMonitor::update_from_paxos(bool *need_bootstrap)
+ {
+ dout(10) << __func__ << dendl;
+ version_t version = get_last_committed();
+ dout(10) << __func__ << " version " << version
+ << " summary v " << summary.version << dendl;
+
+ log_external_backlog();
+
+ if (version == summary.version)
+ return;
+ ceph_assert(version >= summary.version);
+
+ version_t latest_full = get_version_latest_full();
+ dout(10) << __func__ << " latest full " << latest_full << dendl;
+ // jump ahead to the newest full summary when it is ahead of ours
+ if ((latest_full > 0) && (latest_full > summary.version)) {
+ bufferlist latest_bl;
+ get_version_full(latest_full, latest_bl);
+ ceph_assert(latest_bl.length() != 0);
+ dout(7) << __func__ << " loading summary e" << latest_full << dendl;
+ auto p = latest_bl.cbegin();
+ decode(summary, p);
+ dout(7) << __func__ << " loaded summary e" << summary.version << dendl;
+ }
+
+ // walk through incrementals
+ while (version > summary.version) {
+ bufferlist bl;
+ int err = get_version(summary.version+1, bl);
+ ceph_assert(err == 0);
+ ceph_assert(bl.length());
+
+ auto p = bl.cbegin();
+ __u8 struct_v;
+ decode(struct_v, p);
+ if (struct_v == 1) {
+ // legacy pre-quincy commits
+ while (!p.end()) {
+ LogEntry le;
+ le.decode(p);
+ dout(7) << "update_from_paxos applying incremental log "
+ << summary.version+1 << " " << le << dendl;
+ summary.add_legacy(le);
+ }
+ } else {
+ // newer format: entry count, entries, then per-channel prune bounds
+ uint32_t num;
+ decode(num, p);
+ while (num--) {
+ LogEntry le;
+ le.decode(p);
+ dout(7) << "update_from_paxos applying incremental log "
+ << summary.version+1 << " " << le << dendl;
+ summary.recent_keys.insert(le.key());
+ summary.channel_info[le.channel].second++;
+ // we may have logged past the (persisted) summary in a prior quorum
+ // NOTE(review): this compares the last committed version, not the
+ // version being replayed (summary.version+1) -- confirm intended.
+ if (version > external_log_to) {
+ log_external(le);
+ }
+ }
+ map<string,version_t> prune_channels_to;
+ decode(prune_channels_to, p);
+ for (auto& [channel, prune_to] : prune_channels_to) {
+ dout(20) << __func__ << " channel " << channel
+ << " pruned to " << prune_to << dendl;
+ summary.channel_info[channel].first = prune_to;
+ }
+ // zero out pre-quincy fields (encode_pending needs this to reliably detect
+ // upgrade)
+ summary.tail_by_channel.clear();
+ summary.keys.clear();
+ }
+
+ summary.version++;
+ summary.prune(g_conf()->mon_log_max_summary);
+ }
+ dout(10) << " summary.channel_info " << summary.channel_info << dendl;
+ // remember how far external logging has progressed
+ external_log_to = version;
+ mon.store->write_meta("external_log_to", stringify(external_log_to));
+
+ check_subs();
+ }
+
+/**
+ * Forward one cluster log entry to every external sink enabled for its
+ * channel: syslog, graylog, journald, the per-channel log file and stderr.
+ *
+ * Per-channel file descriptors are opened lazily and cached in channel_fds;
+ * they are dropped when a rotation was requested (log_rotated) or when a
+ * write fails, so the next entry re-opens the file.
+ *
+ * @param le entry to emit; an empty channel is treated as the cluster channel
+ */
+void LogMonitor::log_external(const LogEntry& le)
+{
+  string channel = le.channel;
+  if (channel.empty()) { // keep retrocompatibility
+    channel = CLOG_CHANNEL_CLUSTER;
+  }
+
+  if (channels.do_log_to_syslog(channel)) {
+    const string level = channels.get_level(channel);
+    const string facility = channels.get_facility(channel);
+    if (level.empty() || facility.empty()) {
+      derr << __func__ << " unable to log to syslog -- level or facility"
+           << " not defined (level: " << level << ", facility: "
+           << facility << ")" << dendl;
+    } else {
+      // reuse the values validated above instead of looking them up again
+      le.log_to_syslog(level, facility);
+    }
+  }
+
+  if (channels.do_log_to_graylog(channel)) {
+    ceph::logging::Graylog::Ref graylog = channels.get_graylog(channel);
+    if (graylog) {
+      graylog->log_log_entry(&le);
+    }
+    dout(7) << "graylog: " << channel << " " << graylog
+            << " host:" << channels.log_to_graylog_host << dendl;
+  }
+
+  if (channels.do_log_to_journald(channel)) {
+    auto &journald = channels.get_journald();
+    journald.log_log_entry(le);
+    dout(7) << "journald: " << channel << dendl;
+  }
+
+  bool do_stderr = g_conf().get_val<bool>("mon_cluster_log_to_stderr");
+  int fd = -1;
+  if (g_conf()->mon_cluster_log_to_file) {
+    // a pending rotation request invalidates every cached descriptor
+    if (this->log_rotated.exchange(false)) {
+      this->log_external_close_fds();
+    }
+
+    auto p = channel_fds.find(channel);
+    if (p == channel_fds.end()) {
+      string log_file = channels.get_log_file(channel);
+      dout(20) << __func__ << " logging for channel '" << channel
+               << "' to file '" << log_file << "'" << dendl;
+      if (!log_file.empty()) {
+        fd = ::open(log_file.c_str(), O_WRONLY|O_APPEND|O_CREAT|O_CLOEXEC, 0600);
+        if (fd < 0) {
+          int err = -errno;
+          dout(1) << "unable to write to '" << log_file << "' for channel '"
+                  << channel << "': " << cpp_strerror(err) << dendl;
+        } else {
+          channel_fds[channel] = fd;
+        }
+      }
+    } else {
+      fd = p->second;
+    }
+  }
+  if (do_stderr || fd >= 0) {
+    // format once into the reusable member buffer, then fan out
+    fmt::format_to(std::back_inserter(log_buffer), "{}\n", le);
+
+    if (fd >= 0) {
+      int err = safe_write(fd, log_buffer.data(), log_buffer.size());
+      if (err < 0) {
+        dout(1) << "error writing to '" << channels.get_log_file(channel)
+                << "' for channel '" << channel
+                << "': " << cpp_strerror(err) << dendl;
+        // forget the broken descriptor so the next entry re-opens the file
+        ::close(fd);
+        channel_fds.erase(channel);
+      }
+    }
+
+    if (do_stderr) {
+      fmt::print(std::cerr, "{} {}", channel, std::string_view(log_buffer.data(), log_buffer.size()));
+    }
+
+    log_buffer.clear();
+  }
+}
+
+/// Close every cached per-channel log file descriptor and empty the cache.
+void LogMonitor::log_external_close_fds()
+{
+  for (auto it = channel_fds.begin(); it != channel_fds.end(); ++it) {
+    const int open_fd = it->second;
+    if (open_fd < 0) {
+      continue;  // nothing was ever opened for this channel
+    }
+    dout(10) << __func__ << " closing " << it->first << " (" << open_fd << ")" << dendl;
+    ::close(open_fd);
+  }
+  channel_fds.clear();
+}
+
+/// catch external logs up to summary.version
+///
+/// Replays every committed version in (external_log_to, summary.version]
+/// through log_external() and then persists the new position under the
+/// "external_log_to" meta key.
+void LogMonitor::log_external_backlog()
+{
+ // first call since start: recover the persisted position, if there is one
+ if (!external_log_to) {
+ std::string cur_str;
+ int r = mon.store->read_meta("external_log_to", &cur_str);
+ if (r == 0) {
+ // NOTE(review): std::stoull throws on a malformed value -- confirm the
+ // stored string is always one produced by stringify() below.
+ external_log_to = std::stoull(cur_str);
+ dout(10) << __func__ << " initialized external_log_to = " << external_log_to
+ << " (recorded log_to position)" << dendl;
+ } else {
+ // pre-quincy, we assumed that anything through summary.version was
+ // logged externally.
+ assert(r == -ENOENT);
+ external_log_to = summary.version;
+ dout(10) << __func__ << " initialized external_log_to = " << external_log_to
+ << " (summary v " << summary.version << ")" << dendl;
+ }
+ }
+ // we may have logged ahead of summary.version, but never ahead of paxos
+ if (external_log_to > get_last_committed()) {
+ derr << __func__ << " rewinding external_log_to from " << external_log_to
+ << " -> " << get_last_committed() << " (sync_force? mon rebuild?)" << dendl;
+ external_log_to = get_last_committed();
+ }
+ // already caught up: nothing to replay (and nothing is written back)
+ if (external_log_to >= summary.version) {
+ return;
+ }
+ // versions below first_committed cannot be replayed; jump forward to it
+ if (auto first = get_first_committed(); external_log_to < first) {
+ derr << __func__ << " local logs at " << external_log_to
+ << ", skipping to " << first << dendl;
+ external_log_to = first;
+ // FIXME: write marker in each channel log file?
+ }
+ for (; external_log_to < summary.version; ++external_log_to) {
+ bufferlist bl;
+ int err = get_version(external_log_to+1, bl);
+ ceph_assert(err == 0);
+ ceph_assert(bl.length());
+ auto p = bl.cbegin();
+ __u8 v;
+ decode(v, p);
+ // num == -2 is a sentinel for "no entry count in this encoding": entries
+ // are then read until the buffer ends.  For v >= 2 an explicit count is
+ // decoded and exactly that many entries are read.
+ int32_t num = -2;
+ if (v >= 2) {
+ decode(num, p);
+ }
+ while ((num == -2 && !p.end()) || (num >= 0 && num--)) {
+ LogEntry le;
+ le.decode(p);
+ log_external(le);
+ }
+ }
+ // remember how far we got
+ mon.store->write_meta("external_log_to", stringify(external_log_to));
+}
+
+/// Reset the pending state (entries and their dedup keys) for the next proposal.
+void LogMonitor::create_pending()
+{
+  const version_t next_version = get_last_committed() + 1;
+  pending_keys.clear();
+  pending_log.clear();
+  dout(10) << "create_pending v " << next_version << dendl;
+}
+
+/**
+ * Append the store key for one log entry to @p out.
+ *
+ * The key has the form "<channel>/<version>", where the version is written
+ * as lower-case hex, zero-padded to at least 8 digits.
+ *
+ * @param channel channel the entry belongs to
+ * @param v       per-channel entry version
+ * @param out     string the key is appended to (existing content is kept)
+ */
+void LogMonitor::generate_logentry_key(
+  const std::string& channel,
+  version_t v,
+  std::string *out)
+{
+  out->append(channel);
+  out->append("/");
+  // An unsigned long long needs up to 16 hex digits plus the terminating NUL.
+  // A smaller buffer makes snprintf() drop the low-order digits of large
+  // versions, so distinct versions would map to the same key.
+  char vs[17];
+  snprintf(vs, sizeof(vs), "%08llx", (unsigned long long)v);
+  out->append(vs);
+}
+
+/**
+ * Encode the pending log entries into transaction @p t as the next version.
+ *
+ * Two encodings exist:
+ *  - struct_v 1 (quorum still has pre-quincy monitors): the entries only,
+ *    back to back;
+ *  - struct_v 2: an entry count, the entries, then a channel -> prune_to map.
+ *    Each entry is additionally stored under its own per-channel key, and
+ *    keys of pruned entries are erased in the same transaction.
+ */
+void LogMonitor::encode_pending(MonitorDBStore::TransactionRef t)
+{
+ version_t version = get_last_committed() + 1;
+ bufferlist bl;
+ dout(10) << __func__ << " v" << version << dendl;
+
+ if (mon.monmap->min_mon_release < ceph_release_t::quincy) {
+ // legacy encoding for pre-quincy quorum
+ __u8 struct_v = 1;
+ encode(struct_v, bl);
+ for (auto& p : pending_log) {
+ p.second.encode(bl, mon.get_quorum_con_features());
+ }
+ put_version(t, version, bl);
+ put_last_committed(t, version);
+ return;
+ }
+
+ __u8 struct_v = 2;
+ encode(struct_v, bl);
+
+ // first commit after upgrading to quincy?
+ if (!summary.tail_by_channel.empty()) {
+ // include past log entries
+ for (auto& p : summary.tail_by_channel) {
+ for (auto& q : p.second) {
+ pending_log.emplace(make_pair(q.second.stamp, q.second));
+ }
+ }
+ }
+
+ // record new entries
+ // work on a copy of the channel bounds; summary itself is not modified here
+ auto pending_channel_info = summary.channel_info;
+ uint32_t num = pending_log.size();
+ encode(num, bl);
+ dout(20) << __func__ << " writing " << num << " entries" << dendl;
+ for (auto& p : pending_log) {
+ bufferlist ebl;
+ p.second.encode(ebl, mon.get_quorum_con_features());
+
+ // the entry takes the channel's current upper bound as its version,
+ // and the bound is then advanced by one
+ auto& bounds = pending_channel_info[p.second.channel];
+ version_t v = bounds.second++;
+ std::string key;
+ generate_logentry_key(p.second.channel, v, &key);
+ t->put(get_service_name(), key, ebl);
+
+ bl.claim_append(ebl);
+ }
+
+ // prune log entries?
+ // NOTE: the decision uses the bounds in summary.channel_info, i.e. without
+ // the entries added by this very proposal.
+ map<string,version_t> prune_channels_to;
+ for (auto& [channel, info] : summary.channel_info) {
+ if (info.second - info.first > g_conf()->mon_log_max) {
+ const version_t from = info.first;
+ const version_t to = info.second - g_conf()->mon_log_max;
+ dout(10) << __func__ << " pruning channel " << channel
+ << " " << from << " -> " << to << dendl;
+ prune_channels_to[channel] = to;
+ pending_channel_info[channel].first = to;
+ for (version_t v = from; v < to; ++v) {
+ std::string key;
+ generate_logentry_key(channel, v, &key);
+ t->erase(get_service_name(), key);
+ }
+ }
+ }
+ dout(20) << __func__ << " prune_channels_to " << prune_channels_to << dendl;
+ encode(prune_channels_to, bl);
+
+ put_version(t, version, bl);
+ put_last_committed(t, version);
+}
+
+/// Decide whether a full LogSummary should be stored with the next commit.
+bool LogMonitor::should_stash_full()
+{
+  // while pre-quincy monitors are in the quorum, a LogSummary is committed
+  // on every commit
+  const bool legacy_quorum =
+    mon.monmap->min_mon_release < ceph_release_t::quincy;
+  if (legacy_quorum) {
+    return true;
+  }
+
+  // otherwise only store a summary periodically
+  const uint64_t period = std::min<uint64_t>(
+    g_conf()->mon_log_full_interval,
+    g_conf()->mon_max_log_epochs);
+  const auto commits_since_full =
+    get_last_committed() - get_version_latest_full();
+  return commits_since_full > period;
+}
+
+
+/// Store the current summary as the latest full version in transaction @p t.
+void LogMonitor::encode_full(MonitorDBStore::TransactionRef t)
+{
+  const version_t full_version = summary.version;
+  dout(10) << __func__ << " log v " << full_version << dendl;
+  // the summary must describe exactly the last committed state
+  ceph_assert(get_last_committed() == full_version);
+
+  bufferlist encoded_summary;
+  encode(summary, encoded_summary, mon.get_quorum_con_features());
+
+  put_version_full(t, full_version, encoded_summary);
+  put_version_latest_full(t, full_version);
+}
+
+/// Version up to which old commits may be trimmed; 0 means "do not trim".
+version_t LogMonitor::get_trim_to() const
+{
+  if (mon.is_leader()) {
+    // keep the most recent mon_max_log_epochs versions
+    const unsigned keep = g_conf()->mon_max_log_epochs;
+    const version_t last = get_last_committed();
+    if (last > keep) {
+      return last - keep;
+    }
+  }
+  return 0;
+}
+
+/**
+ * Dispatch the read-only phase of a request by message type.
+ *
+ * @return true if the request was fully handled here, false if it must go on
+ *         to prepare_update()
+ */
+bool LogMonitor::preprocess_query(MonOpRequestRef op)
+{
+  op->mark_logmon_event("preprocess_query");
+  auto m = op->get_req<PaxosServiceMessage>();
+  dout(10) << "preprocess_query " << *m << " from " << m->get_orig_source_inst() << dendl;
+
+  const auto msg_type = m->get_type();
+  if (msg_type == MSG_LOG) {
+    return preprocess_log(op);
+  }
+  if (msg_type != MSG_MON_COMMAND) {
+    // no other message type is expected here
+    ceph_abort();
+    return true;
+  }
+
+  try {
+    return preprocess_command(op);
+  } catch (const bad_cmd_get& e) {
+    // a malformed command argument is reported back to the sender
+    bufferlist bl;
+    mon.reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
+    return true;
+  }
+}
+
+/**
+ * Dispatch the update phase of a request by message type.
+ *
+ * @return whatever the type-specific prepare_*() handler returns; true after
+ *         replying to a command with malformed arguments
+ */
+bool LogMonitor::prepare_update(MonOpRequestRef op)
+{
+  op->mark_logmon_event("prepare_update");
+  auto m = op->get_req<PaxosServiceMessage>();
+  dout(10) << "prepare_update " << *m << " from " << m->get_orig_source_inst() << dendl;
+
+  const auto msg_type = m->get_type();
+  if (msg_type == MSG_LOG) {
+    return prepare_log(op);
+  }
+  if (msg_type != MSG_MON_COMMAND) {
+    // no other message type is expected here
+    ceph_abort();
+    return false;
+  }
+
+  try {
+    return prepare_command(op);
+  } catch (const bad_cmd_get& e) {
+    // a malformed command argument is reported back to the sender
+    bufferlist bl;
+    mon.reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
+    return true;
+  }
+}
+
+/**
+ * Read-only check of an incoming MLog message.
+ *
+ * The message is dropped without a reply when it has no session, when the
+ * sender lacks write capability on "log", or when the summary already
+ * contains every entry it carries.
+ *
+ * @return true if the message was dropped here, false if it carries new
+ *         entries and must be passed to prepare_log()
+ */
+bool LogMonitor::preprocess_log(MonOpRequestRef op)
+{
+  op->mark_logmon_event("preprocess_log");
+  auto m = op->get_req<MLog>();
+  dout(10) << "preprocess_log " << *m << " from " << m->get_orig_source() << dendl;
+
+  // common exit for every "handled, nothing to send back" case
+  auto drop = [this, &op]() {
+    mon.no_reply(op);
+    return true;
+  };
+
+  MonSession *session = op->get_session();
+  if (!session) {
+    return drop();
+  }
+  if (!session->is_capable("log", MON_CAP_W)) {
+    dout(0) << "preprocess_log got MLog from entity with insufficient privileges "
+            << session->caps << dendl;
+    return drop();
+  }
+
+  // count the entries the summary has not seen yet
+  int num_new = 0;
+  for (const auto& entry : m->entries) {
+    if (!summary.contains(entry.key())) {
+      ++num_new;
+    }
+  }
+  if (num_new == 0) {
+    dout(10) << " nothing new" << dendl;
+    return drop();
+  }
+
+  return false;
+}
+
+/// Completion run once a log proposal finishes: acknowledges the MLog op
+/// unless the proposal was cancelled.
+struct LogMonitor::C_Log : public C_MonOp {
+  LogMonitor *logmon;
+
+  C_Log(LogMonitor *p, MonOpRequestRef o)
+    : C_MonOp(o),
+      logmon(p)
+  {}
+
+  void _finish(int r) override {
+    if (r != -ECANCELED) {
+      logmon->_updated_log(op);
+    }
+  }
+};
+
+/**
+ * Queue the new entries of an MLog message for the next proposal.
+ *
+ * Entries already present in the summary or already pending are skipped.
+ *
+ * @return false if the message is for a different fsid, true otherwise (an
+ *         ack is sent once the proposal commits)
+ */
+bool LogMonitor::prepare_log(MonOpRequestRef op)
+{
+  op->mark_logmon_event("prepare_log");
+  auto m = op->get_req<MLog>();
+  dout(10) << "prepare_log " << *m << " from " << m->get_orig_source() << dendl;
+
+  const bool foreign_cluster = (m->fsid != mon.monmap->fsid);
+  if (foreign_cluster) {
+    dout(0) << "handle_log on fsid " << m->fsid << " != " << mon.monmap->fsid
+            << dendl;
+    return false;
+  }
+
+  for (const auto& entry : m->entries) {
+    dout(10) << " logging " << entry << dendl;
+    const auto& key = entry.key();
+    if (summary.contains(key) || pending_keys.count(key)) {
+      continue;  // duplicate of something committed or already queued
+    }
+    pending_keys.insert(key);
+    pending_log.insert(pair<utime_t,LogEntry>(entry.stamp, entry));
+  }
+
+  wait_for_commit(op, new C_Log(this, op));
+  return true;
+}
+
+/// Acknowledge a committed MLog message with the seq of its last entry.
+void LogMonitor::_updated_log(MonOpRequestRef op)
+{
+  auto m = op->get_req<MLog>();
+  dout(7) << "_updated_log for " << m->get_orig_source_inst() << dendl;
+
+  const auto& last_entry = m->entries.back();
+  mon.send_reply(op, new MLogAck(m->fsid, last_entry.seq));
+}
+
+/**
+ * Decide whether the pending entries should be proposed now.
+ *
+ * @param delay passed through to the generic PaxosService policy
+ * @return true to propose
+ */
+bool LogMonitor::should_propose(double& delay)
+{
+  // a full batch is committed right away
+  const auto batch_limit = g_conf()->mon_max_log_entries_per_event;
+  const bool batch_full =
+    batch_limit > 0 && pending_log.size() >= (unsigned)batch_limit;
+  if (batch_full) {
+    return true;
+  }
+
+  // anything else follows the generic policy
+  return PaxosService::should_propose(delay);
+}
+
+
+/**
+ * Handle the read-only monitor commands of the log service.
+ *
+ * Only "log last" is served here: it returns the most recent entries of one
+ * channel (or of all channels when the channel is "*"), filtered by a minimum
+ * priority.  The command arguments read are "num" (default 20), "level"
+ * (default CLOG_INFO), "channel" (default CLOG_CHANNEL_DEFAULT) and "format"
+ * (default "plain").
+ *
+ * @return true if a reply was sent, false if the command is not handled here
+ */
+bool LogMonitor::preprocess_command(MonOpRequestRef op)
+{
+ op->mark_logmon_event("preprocess_command");
+ auto m = op->get_req<MMonCommand>();
+ int r = -EINVAL;
+ bufferlist rdata;
+ stringstream ss;
+
+ cmdmap_t cmdmap;
+ if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
+ string rs = ss.str();
+ mon.reply_command(op, -EINVAL, rs, get_last_committed());
+ return true;
+ }
+ MonSession *session = op->get_session();
+ if (!session) {
+ mon.reply_command(op, -EACCES, "access denied", get_last_committed());
+ return true;
+ }
+
+ string prefix;
+ cmd_getval(cmdmap, "prefix", prefix);
+
+ string format = cmd_getval_or<string>(cmdmap, "format", "plain");
+ // f is tested for null below; when it is null the entries are written as
+ // plain text lines instead
+ boost::scoped_ptr<Formatter> f(Formatter::create(format));
+
+ if (prefix == "log last") {
+ int64_t num = 20;
+ cmd_getval(cmdmap, "num", num);
+ if (f) {
+ f->open_array_section("tail");
+ }
+
+ std::string level_str;
+ clog_type level;
+ if (cmd_getval(cmdmap, "level", level_str)) {
+ level = LogEntry::str_to_level(level_str);
+ if (level == CLOG_UNKNOWN) {
+ ss << "Invalid severity '" << level_str << "'";
+ mon.reply_command(op, -EINVAL, ss.str(), get_last_committed());
+ return true;
+ }
+ } else {
+ level = CLOG_INFO;
+ }
+
+ std::string channel;
+ if (!cmd_getval(cmdmap, "channel", channel)) {
+ channel = CLOG_CHANNEL_DEFAULT;
+ }
+
+ // We'll apply this twice, once while counting out lines
+ // and once while outputting them.
+ auto match = [level](const LogEntry &entry) {
+ return entry.prio >= level;
+ };
+
+ // NOTE: this inner ss shadows the function-level stringstream; the plain
+ // text output collected here ends up in rdata, not in the status string.
+ ostringstream ss;
+ if (!summary.tail_by_channel.empty()) {
+ // pre-quincy compat
+ // Decrement operation that sets to container end when hitting rbegin
+ if (channel == "*") {
+ list<LogEntry> full_tail;
+ summary.build_ordered_tail_legacy(&full_tail);
+ // walk back from the newest entry until num matching entries were seen
+ auto rp = full_tail.rbegin();
+ for (; num > 0 && rp != full_tail.rend(); ++rp) {
+ if (match(*rp)) {
+ num--;
+ }
+ }
+ // NOTE(review): if full_tail is empty, rbegin() == rend() and this
+ // decrement steps before the first element -- confirm the list can
+ // never be empty here.
+ if (rp == full_tail.rend()) {
+ --rp;
+ }
+
+ // Decrement a reverse iterator such that going past rbegin()
+ // sets it to rend(). This is for writing a for() loop that
+ // goes up to (and including) rbegin()
+ auto dec = [&rp, &full_tail] () {
+ if (rp == full_tail.rbegin()) {
+ rp = full_tail.rend();
+ } else {
+ --rp;
+ }
+ };
+
+ // Move forward to the end of the container (decrement the reverse
+ // iterator).
+ for (; rp != full_tail.rend(); dec()) {
+ if (!match(*rp)) {
+ continue;
+ }
+ if (f) {
+ f->dump_object("entry", *rp);
+ } else {
+ ss << *rp << "\n";
+ }
+ }
+ } else {
+ auto p = summary.tail_by_channel.find(channel);
+ if (p != summary.tail_by_channel.end()) {
+ // same two-pass walk as above, on the tail of a single channel
+ auto rp = p->second.rbegin();
+ for (; num > 0 && rp != p->second.rend(); ++rp) {
+ if (match(rp->second)) {
+ num--;
+ }
+ }
+ // NOTE(review): same empty-container concern as in the "*" branch.
+ if (rp == p->second.rend()) {
+ --rp;
+ }
+
+ // Decrement a reverse iterator such that going past rbegin()
+ // sets it to rend(). This is for writing a for() loop that
+ // goes up to (and including) rbegin()
+ auto dec = [&rp, &p] () {
+ if (rp == p->second.rbegin()) {
+ rp = p->second.rend();
+ } else {
+ --rp;
+ }
+ };
+
+ // Move forward to the end of the container (decrement the reverse
+ // iterator).
+ for (; rp != p->second.rend(); dec()) {
+ if (!match(rp->second)) {
+ continue;
+ }
+ if (f) {
+ f->dump_object("entry", rp->second);
+ } else {
+ ss << rp->second << "\n";
+ }
+ }
+ }
+ }
+ } else {
+ // quincy+
+ if (channel == "*") {
+ // tail all channels; we need to mix by timestamp
+ multimap<utime_t,LogEntry> entries; // merge+sort all channels by timestamp
+ for (auto& p : summary.channel_info) {
+ // [from, to) are the entry versions currently kept for this channel;
+ // read at most the last num of them
+ version_t from = p.second.first;
+ version_t to = p.second.second;
+ version_t start;
+ if (to > (version_t)num) {
+ start = std::max(to - num, from);
+ } else {
+ start = from;
+ }
+ dout(10) << __func__ << " channel " << p.first
+ << " from " << from << " to " << to << dendl;
+ for (version_t v = start; v < to; ++v) {
+ bufferlist ebl;
+ string key;
+ generate_logentry_key(p.first, v, &key);
+ int r = mon.store->get(get_service_name(), key, ebl);
+ if (r < 0) {
+ derr << __func__ << " missing key " << key << dendl;
+ continue;
+ }
+ LogEntry le;
+ auto p = ebl.cbegin();
+ decode(le, p);
+ entries.insert(make_pair(le.stamp, le));
+ }
+ }
+ // keep only the num newest entries across all channels
+ // NOTE(review): the priority filter is applied after this trim, so
+ // fewer than num entries may be returned -- confirm this is intended.
+ while ((int)entries.size() > num) {
+ entries.erase(entries.begin());
+ }
+ for (auto& p : entries) {
+ if (!match(p.second)) {
+ continue;
+ }
+ if (f) {
+ f->dump_object("entry", p.second);
+ } else {
+ ss << p.second << "\n";
+ }
+ }
+ } else {
+ // tail one channel
+ auto p = summary.channel_info.find(channel);
+ if (p != summary.channel_info.end()) {
+ version_t from = p->second.first;
+ version_t to = p->second.second;
+ version_t start;
+ if (to > (version_t)num) {
+ start = std::max(to - num, from);
+ } else {
+ start = from;
+ }
+ dout(10) << __func__ << " from " << from << " to " << to << dendl;
+ for (version_t v = start; v < to; ++v) {
+ bufferlist ebl;
+ string key;
+ generate_logentry_key(channel, v, &key);
+ int r = mon.store->get(get_service_name(), key, ebl);
+ if (r < 0) {
+ derr << __func__ << " missing key " << key << dendl;
+ continue;
+ }
+ LogEntry le;
+ auto p = ebl.cbegin();
+ decode(le, p);
+ if (match(le)) {
+ if (f) {
+ f->dump_object("entry", le);
+ } else {
+ ss << le << "\n";
+ }
+ }
+ }
+ }
+ }
+ }
+ if (f) {
+ f->close_section();
+ f->flush(rdata);
+ } else {
+ rdata.append(ss.str());
+ }
+ r = 0;
+ } else {
+ return false;
+ }
+
+ // here ss is the function-level stream again; its first line becomes the
+ // status string of the reply
+ string rs;
+ getline(ss, rs);
+ mon.reply_command(op, r, rs, rdata, get_last_committed());
+ return true;
+}
+
+
+/**
+ * Handle the mutating monitor commands of the log service.
+ *
+ * Only "log" is served: the supplied text is queued as a new log entry and
+ * the command is answered once the entry is committed.  Any other prefix is
+ * answered with -EINVAL.
+ *
+ * @return true if the command was replied to early or queued for commit,
+ *         false for an unrecognized prefix
+ */
+bool LogMonitor::prepare_command(MonOpRequestRef op)
+{
+  op->mark_logmon_event("prepare_command");
+  auto m = op->get_req<MMonCommand>();
+  stringstream ss;
+  string rs;
+  int err = -EINVAL;
+
+  cmdmap_t cmdmap;
+  if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
+    // ss has reason for failure
+    mon.reply_command(op, -EINVAL, ss.str(), get_last_committed());
+    return true;
+  }
+
+  string prefix;
+  cmd_getval(cmdmap, "prefix", prefix);
+
+  MonSession *session = op->get_session();
+  if (!session) {
+    mon.reply_command(op, -EACCES, "access denied", get_last_committed());
+    return true;
+  }
+
+  if (prefix != "log") {
+    // not one of ours
+    getline(ss, rs);
+    mon.reply_command(op, err, rs, get_last_committed());
+    return false;
+  }
+
+  vector<string> logtext;
+  cmd_getval(cmdmap, "logtext", logtext);
+  const string level_str = cmd_getval_or<string>(cmdmap, "level", "info");
+
+  // build the entry on behalf of the command's sender
+  LogEntry le;
+  le.rank = m->get_orig_source();
+  le.addrs.v.push_back(m->get_orig_source_addr());
+  le.name = session->entity_name;
+  le.stamp = m->get_recv_stamp();
+  le.seq = 0;
+  le.prio = LogEntry::str_to_level(level_str);
+  le.channel = CLOG_CHANNEL_DEFAULT;
+  le.msg = str_join(logtext, " ");
+
+  pending_keys.insert(le.key());
+  pending_log.insert(pair<utime_t,LogEntry>(le.stamp, le));
+  wait_for_commit(op, new Monitor::C_Command(
+    mon, op, 0, string(), get_last_committed() + 1));
+  return true;
+}
+
+/// Dump the first and last committed log versions to formatter @p f.
+void LogMonitor::dump_info(Formatter *f)
+{
+  const version_t first = get_first_committed();
+  const version_t last = get_last_committed();
+  f->dump_unsigned("logm_first_committed", first);
+  f->dump_unsigned("logm_last_committed", last);
+}
+
+/**
+ * Map a subscription name of the form "log-<level>" to its level id.
+ *
+ * @param n subscription name
+ * @return the result of LogEntry::str_to_level() on the part after "log-",
+ *         or CLOG_UNKNOWN if @p n does not have that form
+ */
+int LogMonitor::sub_name_to_id(const string& n)
+{
+  // needs the "log-" prefix and at least one character after it
+  const bool is_log_sub = n.size() > 4 && n.compare(0, 4, "log-") == 0;
+  if (!is_log_sub) {
+    return CLOG_UNKNOWN;
+  }
+  return LogEntry::str_to_level(n.substr(4));
+}
+
+/**
+ * Run check_sub() on every log subscription of every session.
+ *
+ * Subscriptions whose type is not a "log-<level>" name are ignored.
+ */
+void LogMonitor::check_subs()
+{
+  dout(10) << __func__ << dendl;
+  for (auto i = mon.session_map.subs.begin();
+       i != mon.session_map.subs.end();
+       ++i) {
+    auto j = i->second->begin();
+    while (!j.end()) {
+      Subscription *sub = *j;
+      // Step to the next element before calling check_sub(): for a one-time
+      // subscription it calls remove_sub(), which takes `sub` off this list,
+      // so the iterator must not still point at it afterwards.
+      ++j;
+      if (sub_name_to_id(sub->type) >= 0)
+        check_sub(sub);
+    }
+  }
+}
+
+/**
+ * Send a subscriber the log entries it has not received yet.
+ *
+ * A subscriber that has never received anything (next == 0) starts at the
+ * last committed version; otherwise entries from s->next on are sent.  A
+ * one-time subscription is removed afterwards, any other has its next
+ * version advanced past the current summary.
+ *
+ * @param s subscription to serve; its type must be a valid log sub name
+ */
+void LogMonitor::check_sub(Subscription *s)
+{
+  dout(10) << __func__ << " client wants " << s->type << " ver " << s->next << dendl;
+
+  const int sub_level = sub_name_to_id(s->type);
+  ceph_assert(sub_level >= 0);
+
+  const version_t summary_version = summary.version;
+  if (s->next > summary_version) {
+    dout(10) << __func__ << " client " << s->session->name
+             << " requested version (" << s->next << ") is greater than ours ("
+             << summary_version << "), which means we already sent him"
+             << " everything we have." << dendl;
+    return;
+  }
+
+  MLog *mlog = new MLog(mon.monmap->fsid);
+
+  // first-time subscribers get only the latest commit, everyone else an
+  // incremental from where they left off
+  const version_t start_version =
+    (s->next == 0) ? get_last_committed() : s->next;
+  _create_sub_incremental(mlog, sub_level, start_version);
+
+  dout(10) << __func__ << " sending message to " << s->session->name
+           << " with " << mlog->entries.size() << " entries"
+           << " (version " << mlog->version << ")" << dendl;
+
+  if (mlog->entries.empty()) {
+    // nothing to deliver; drop our reference instead of sending
+    mlog->put();
+  } else {
+    s->session->con->send_message(mlog);
+  }
+
+  if (s->onetime) {
+    mon.session_map.remove_sub(s);
+  } else {
+    s->next = summary_version+1;
+  }
+}
+
+/**
+ * Create an incremental log message from version \p sv to \p summary.version
+ *
+ * If \p sv is older than the first committed version, a warning entry saying
+ * that messages were skipped is added and the walk starts at the first
+ * committed version instead.
+ *
+ * @param mlog Log message we'll send to the client with the messages received
+ * since version \p sv, inclusive.
+ * @param level The minimum priority of the messages the client is interested
+ * in; entries with a lower priority are skipped.
+ * @param sv The version the client is looking for.
+ */
+void LogMonitor::_create_sub_incremental(MLog *mlog, int level, version_t sv)
+{
+ dout(10) << __func__ << " level " << level << " ver " << sv
+ << " cur summary ver " << summary.version << dendl;
+
+ if (sv < get_first_committed()) {
+ dout(10) << __func__ << " skipped from " << sv
+ << " to first_committed " << get_first_committed() << dendl;
+ // tell the client that there is a gap in what it will receive
+ LogEntry le;
+ le.stamp = ceph_clock_now();
+ le.prio = CLOG_WARN;
+ ostringstream ss;
+ ss << "skipped log messages from " << sv << " to " << get_first_committed();
+ le.msg = ss.str();
+ mlog->entries.push_back(le);
+ sv = get_first_committed();
+ }
+
+ version_t summary_ver = summary.version;
+ // sv == 0 ends the loop immediately
+ while (sv && sv <= summary_ver) {
+ bufferlist bl;
+ int err = get_version(sv, bl);
+ ceph_assert(err == 0);
+ ceph_assert(bl.length());
+ auto p = bl.cbegin();
+ __u8 v;
+ decode(v, p);
+ // num == -2 is a sentinel for "no entry count in this encoding": entries
+ // are then read until the buffer ends.  For v >= 2 an explicit count is
+ // decoded and exactly that many entries are read.
+ int32_t num = -2;
+ if (v >= 2) {
+ decode(num, p);
+ dout(20) << __func__ << " sv " << sv << " has " << num << " entries" << dendl;
+ }
+ while ((num == -2 && !p.end()) || (num >= 0 && num--)) {
+ LogEntry le;
+ le.decode(p);
+ if (le.prio < level) {
+ dout(20) << __func__ << " requested " << level
+ << ", skipping " << le << dendl;
+ continue;
+ }
+ mlog->entries.push_back(le);
+ }
+ // record the version just processed, then move on to the next one
+ mlog->version = sv++;
+ }
+
+ dout(10) << __func__ << " incremental message ready ("
+ << mlog->entries.size() << " entries)" << dendl;
+}
+
+/**
+ * Rebuild the per-channel logging configuration from the current config.
+ *
+ * The channel maps are cleared first and then re-parsed option by option.
+ * Parsing stops at the first option that fails, leaving the maps parsed so
+ * far in place.  On success the $channel meta variables are expanded and the
+ * cached log file descriptors are closed.
+ */
+void LogMonitor::update_log_channels()
+{
+  ostringstream oss;
+
+  channels.clear();
+
+  // config option -> channel map it is parsed into, in parsing order
+  const struct {
+    const char *option;
+    std::map<std::string,std::string> *target;
+  } parse_plan[] = {
+    { "mon_cluster_log_to_syslog",          &channels.log_to_syslog },
+    { "mon_cluster_log_to_syslog_level",    &channels.syslog_level },
+    { "mon_cluster_log_to_syslog_facility", &channels.syslog_facility },
+    { "mon_cluster_log_file",               &channels.log_file },
+    { "mon_cluster_log_file_level",         &channels.log_file_level },
+    { "mon_cluster_log_to_graylog",         &channels.log_to_graylog },
+    { "mon_cluster_log_to_graylog_host",    &channels.log_to_graylog_host },
+    { "mon_cluster_log_to_graylog_port",    &channels.log_to_graylog_port },
+    { "mon_cluster_log_to_journald",        &channels.log_to_journald },
+  };
+
+  for (const auto& step : parse_plan) {
+    const int r = get_conf_str_map_helper(
+      g_conf().get_val<string>(step.option),
+      oss, step.target,
+      CLOG_CONFIG_DEFAULT_KEY);
+    if (r < 0) {
+      derr << __func__ << " error parsing '" << step.option << "'" << dendl;
+      return;
+    }
+  }
+
+  channels.expand_channel_meta();
+  log_external_close_fds();
+}
+
+
+/**
+ * Config observer callback: rebuild the channel configuration when any of
+ * the cluster log options changed.
+ *
+ * @param conf    current configuration (not used)
+ * @param changed names of the options that changed
+ */
+void LogMonitor::handle_conf_change(const ConfigProxy& conf,
+                                    const std::set<std::string> &changed)
+{
+  static const char *const relevant_keys[] = {
+    "mon_cluster_log_to_syslog",
+    "mon_cluster_log_to_syslog_level",
+    "mon_cluster_log_to_syslog_facility",
+    "mon_cluster_log_file",
+    "mon_cluster_log_file_level",
+    "mon_cluster_log_to_graylog",
+    "mon_cluster_log_to_graylog_host",
+    "mon_cluster_log_to_graylog_port",
+    "mon_cluster_log_to_journald",
+    "mon_cluster_log_to_file",
+  };
+
+  for (const char *key : relevant_keys) {
+    if (changed.count(key)) {
+      // one rebuild covers every option, no need to look further
+      update_log_channels();
+      return;
+    }
+  }
+}
diff --git a/src/mon/LogMonitor.h b/src/mon/LogMonitor.h
new file mode 100644
index 000000000..1eccaa5c2
--- /dev/null
+++ b/src/mon/LogMonitor.h
@@ -0,0 +1,210 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_LOGMONITOR_H
+#define CEPH_LOGMONITOR_H
+
+#include <atomic>
+#include <map>
+#include <set>
+
+#include <fmt/format.h>
+#include <fmt/ostream.h>
+
+#include "include/types.h"
+#include "PaxosService.h"
+
+#include "common/config_fwd.h"
+#include "common/LogEntry.h"
+#include "include/str_map.h"
+
+class MLog;
+
+static const std::string LOG_META_CHANNEL = "$channel";
+
+namespace ceph {
+namespace logging {
+ class Graylog;
+ class JournaldClusterLogger;
+}
+}
+
+/**
+ * Paxos service that owns the cluster log: it collects log entries sent to
+ * the monitors, commits them, serves them to subscribers and commands, and
+ * forwards them to the configured external sinks (syslog, graylog, journald,
+ * files, stderr).
+ */
+class LogMonitor : public PaxosService,
+                   public md_config_obs_t {
+private:
+  // entries queued for the next proposal, ordered by timestamp
+  std::multimap<utime_t,LogEntry> pending_log;
+  // keys of the queued entries, used to drop duplicates
+  unordered_set<LogEntryKey> pending_keys;
+
+  LogSummary summary;
+
+  // last committed version that was forwarded to the external sinks
+  version_t external_log_to = 0;
+  // channel name -> open log file descriptor
+  std::map<std::string, int> channel_fds;
+
+  // scratch buffer reused for formatting entries in log_external()
+  fmt::memory_buffer log_buffer;
+  // set by reopen_logs(); consumed by the next log_external() call
+  std::atomic<bool> log_rotated = false;
+
+  struct log_channel_info {
+
+    std::map<std::string,std::string> log_to_syslog;
+    std::map<std::string,std::string> syslog_level;
+    std::map<std::string,std::string> syslog_facility;
+    std::map<std::string,std::string> log_file;
+    std::map<std::string,std::string> expanded_log_file;
+    std::map<std::string,std::string> log_file_level;
+    std::map<std::string,std::string> log_to_graylog;
+    std::map<std::string,std::string> log_to_graylog_host;
+    std::map<std::string,std::string> log_to_graylog_port;
+    std::map<std::string,std::string> log_to_journald;
+
+    std::map<std::string, std::shared_ptr<ceph::logging::Graylog>> graylogs;
+    std::unique_ptr<ceph::logging::JournaldClusterLogger> journald;
+    uuid_d fsid;
+    std::string host;
+
+    log_channel_info();
+    ~log_channel_info();
+
+    void clear();
+
+    /** expands $channel meta variable on all maps *EXCEPT* log_file
+     *
+     * We won't expand the log_file map meta variables here because we
+     * intend to do that selectively during get_log_file()
+     */
+    void expand_channel_meta() {
+      expand_channel_meta(log_to_syslog);
+      expand_channel_meta(syslog_level);
+      expand_channel_meta(syslog_facility);
+      expand_channel_meta(log_file_level);
+    }
+    void expand_channel_meta(std::map<std::string,std::string> &m);
+    std::string expand_channel_meta(const std::string &input,
+                                    const std::string &change_to);
+
+    bool do_log_to_syslog(const std::string &channel);
+
+    std::string get_facility(const std::string &channel) {
+      return get_str_map_key(syslog_facility, channel,
+                             &CLOG_CONFIG_DEFAULT_KEY);
+    }
+
+    std::string get_level(const std::string &channel) {
+      return get_str_map_key(syslog_level, channel,
+                             &CLOG_CONFIG_DEFAULT_KEY);
+    }
+
+    std::string get_log_file(const std::string &channel);
+
+    std::string get_log_file_level(const std::string &channel) {
+      return get_str_map_key(log_file_level, channel,
+                             &CLOG_CONFIG_DEFAULT_KEY);
+    }
+
+    bool do_log_to_graylog(const std::string &channel) {
+      return (get_str_map_key(log_to_graylog, channel,
+                              &CLOG_CONFIG_DEFAULT_KEY) == "true");
+    }
+
+    std::shared_ptr<ceph::logging::Graylog> get_graylog(const std::string &channel);
+
+    bool do_log_to_journald(const std::string &channel) {
+      return (get_str_map_key(log_to_journald, channel,
+                              &CLOG_CONFIG_DEFAULT_KEY) == "true");
+    }
+
+    ceph::logging::JournaldClusterLogger &get_journald();
+  } channels;
+
+  void update_log_channels();
+
+  void create_initial() override;
+  void update_from_paxos(bool *need_bootstrap) override;
+  void create_pending() override;  // prepare a new pending
+  // propose pending update to peers
+  void generate_logentry_key(const std::string& channel, version_t v, std::string *out);
+  void encode_pending(MonitorDBStore::TransactionRef t) override;
+  void encode_full(MonitorDBStore::TransactionRef t) override;
+  version_t get_trim_to() const override;
+  bool preprocess_query(MonOpRequestRef op) override;  // true if processed.
+  bool prepare_update(MonOpRequestRef op) override;
+
+  bool preprocess_log(MonOpRequestRef op);
+  bool prepare_log(MonOpRequestRef op);
+  void _updated_log(MonOpRequestRef op);
+
+  bool should_propose(double& delay) override;
+
+  bool should_stash_full() override;
+
+  struct C_Log;
+
+  bool preprocess_command(MonOpRequestRef op);
+  bool prepare_command(MonOpRequestRef op);
+
+  void _create_sub_incremental(MLog *mlog, int level, version_t sv);
+
+ public:
+  LogMonitor(Monitor &mn, Paxos &p, const std::string& service_name)
+    : PaxosService(mn, p, service_name) { }
+
+  void init() override {
+    generic_dout(10) << "LogMonitor::init" << dendl;
+    g_conf().add_observer(this);
+    update_log_channels();
+  }
+
+  void tick() override;  // check state, take actions
+
+  void dump_info(Formatter *f);
+  void check_subs();
+  void check_sub(Subscription *s);
+
+  /// request that the per-channel log files be re-opened on the next write
+  void reopen_logs() {
+    this->log_rotated.store(true);
+  }
+  void log_external_close_fds();
+  void log_external(const LogEntry& le);
+  void log_external_backlog();
+
+  /**
+   * translate log sub name ('log-info') to integer id
+   *
+   * @param n name
+   * @return id, or -1 if unrecognized
+   */
+  int sub_name_to_id(const std::string& n);
+
+  void on_shutdown() override {
+    g_conf().remove_observer(this);
+  }
+
+  /**
+   * Config options this observer wants change notifications for.
+   *
+   * Must list every option handle_conf_change() reacts to, which includes
+   * "mon_cluster_log_to_file".
+   */
+  const char **get_tracked_conf_keys() const override {
+    static const char* KEYS[] = {
+      "mon_cluster_log_to_syslog",
+      "mon_cluster_log_to_syslog_level",
+      "mon_cluster_log_to_syslog_facility",
+      "mon_cluster_log_file",
+      "mon_cluster_log_file_level",
+      "mon_cluster_log_to_graylog",
+      "mon_cluster_log_to_graylog_host",
+      "mon_cluster_log_to_graylog_port",
+      "mon_cluster_log_to_journald",
+      "mon_cluster_log_to_file",
+      NULL
+    };
+    return KEYS;
+  }
+  void handle_conf_change(const ConfigProxy& conf,
+                          const std::set<std::string> &changed) override;
+};
+#endif
diff --git a/src/mon/MDSMonitor.cc b/src/mon/MDSMonitor.cc
new file mode 100644
index 000000000..76a57ac44
--- /dev/null
+++ b/src/mon/MDSMonitor.cc
@@ -0,0 +1,2547 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <regex>
+#include <sstream>
+#include <queue>
+#include <ranges>
+#include <boost/range/adaptors.hpp>
+#include <boost/utility.hpp>
+
+#include "MDSMonitor.h"
+#include "FSCommands.h"
+#include "Monitor.h"
+#include "MonitorDBStore.h"
+#include "OSDMonitor.h"
+
+#include "common/strtol.h"
+#include "common/perf_counters.h"
+#include "common/config.h"
+#include "common/cmdparse.h"
+#include "messages/MMDSMap.h"
+#include "messages/MFSMap.h"
+#include "messages/MFSMapUser.h"
+#include "messages/MMDSLoadTargets.h"
+#include "messages/MMonCommand.h"
+#include "messages/MGenericMessage.h"
+
+#include "include/ceph_assert.h"
+#include "include/str_list.h"
+#include "include/stringify.h"
+#include "mds/mdstypes.h"
+#include "Session.h"
+
+using namespace TOPNSPC::common;
+
+using std::list;
+using std::map;
+using std::ostream;
+using std::ostringstream;
+using std::set;
+using std::string;
+using std::string_view;
+using std::stringstream;
+using std::vector;
+
+using ceph::bufferlist;
+using ceph::decode;
+using ceph::encode;
+using ceph::Formatter;
+using ceph::make_message;
+using ceph::mono_clock;
+using ceph::mono_time;
+
+#define dout_subsys ceph_subsys_mon
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, mon, get_fsmap())
+
+ // Writes the per-line debug prefix used by dout in this file:
+ // "mon.<name>@<rank>(<state>).mds e<fsmap epoch> ".
+ static ostream& _prefix(ostream *_dout, Monitor &mon, const FSMap& fsmap)
+ {
+   ostream& out = *_dout;
+   out << "mon." << mon.name << "@" << mon.rank;
+   out << "(" << mon.get_state_name() << ").mds e" << fsmap.get_epoch() << " ";
+   return out;
+ }
+
+static const string MDS_METADATA_PREFIX("mds_metadata");
+static const string MDS_HEALTH_PREFIX("mds_health");
+
+
+/*
+ * Specialized implementation of cmd_getval to allow us to parse
+ * out strongly-typedef'd types
+ */
+ namespace TOPNSPC::common {
+ /*
+  * Each specialization parses the argument as a plain int64_t and then
+  * converts it explicitly to the strongly-typed destination.
+  *
+  * The value is deliberately NOT written through an `(int64_t&)` view of
+  * `val`: that is only safe when the destination has exactly the size and
+  * representation of int64_t, and would write past the object for any
+  * narrower type.
+  *
+  * Contract is the same as the int64_t overload being delegated to: returns
+  * false (leaving `val` untouched) when the lookup reports failure, true
+  * after `val` has been assigned.
+  */
+ template<> bool cmd_getval(const cmdmap_t& cmdmap,
+                            string_view k, mds_gid_t &val)
+ {
+   int64_t raw = 0;
+   if (!cmd_getval(cmdmap, k, raw)) {
+     return false;
+   }
+   val = static_cast<mds_gid_t>(raw);
+   return true;
+ }
+
+ template<> bool cmd_getval(const cmdmap_t& cmdmap,
+                            string_view k, mds_rank_t &val)
+ {
+   int64_t raw = 0;
+   if (!cmd_getval(cmdmap, k, raw)) {
+     return false;
+   }
+   val = static_cast<mds_rank_t>(raw);
+   return true;
+ }
+
+ template<> bool cmd_getval(const cmdmap_t& cmdmap,
+                            string_view k, MDSMap::DaemonState &val)
+ {
+   int64_t raw = 0;
+   if (!cmd_getval(cmdmap, k, raw)) {
+     return false;
+   }
+   val = static_cast<MDSMap::DaemonState>(raw);
+   return true;
+ }
+ }
+// my methods
+
+ // Dump map `m` to the debug log at level dblV. The map text is streamed
+ // into the same log entry that the dout() line opens, which is why the
+ // entry is only terminated (dendl) after m.print().
+ template <int dblV>
+ void MDSMonitor::print_map(const FSMap& m)
+ {
+   dout(dblV) << "print_map\n";
+   m.print(*_dout);
+   *_dout << dendl;
+ }
+
+// service methods
+ // Nothing to build for the initial state; the call is only logged.
+ void MDSMonitor::create_initial()
+ {
+   dout(10) << "create_initial" << dendl;
+ }
+
+ // Adds to `s` every store prefix this service writes under: its own
+ // service name plus the MDS metadata and MDS health prefixes.
+ void MDSMonitor::get_store_prefixes(set<string>& s) const
+ {
+   for (const string& prefix : {service_name, MDS_METADATA_PREFIX, MDS_HEALTH_PREFIX}) {
+     s.insert(prefix);
+   }
+ }
+
+ // Bring the in-memory FSMap up to the last committed paxos version.
+ // A no-op when the map epoch already equals that version; otherwise the
+ // committed blob is loaded, decoded, logged, optionally sanity-checked, and
+ // subscribers are re-evaluated. `need_bootstrap` is not used here.
+ void MDSMonitor::update_from_paxos(bool *need_bootstrap)
+ {
+   const version_t version = get_last_committed();
+   if (version == get_fsmap().get_epoch()) {
+     return;
+   }
+
+   dout(10) << __func__ << " version " << version
+            << ", my e " << get_fsmap().get_epoch() << dendl;
+   ceph_assert(version > get_fsmap().get_epoch());
+
+   load_metadata(pending_metadata);
+   load_health();
+
+   // Fetch the encoded map for this version; it must exist and be non-empty.
+   bufferlist fsmap_bl;
+   const int err = get_version(version, fsmap_bl);
+   ceph_assert(err == 0);
+   ceph_assert(fsmap_bl.length() > 0);
+   dout(10) << __func__ << " got " << version << dendl;
+
+   try {
+     PaxosFSMap::decode(fsmap_bl);
+   } catch (const ceph::buffer::malformed_input& e) {
+     // Log the reason, then let the exception continue to the caller.
+     derr << "unable to decode FSMap: " << e.what() << dendl;
+     throw;
+   }
+
+   dout(0) << "new map" << dendl;
+   print_map<0>(get_fsmap());
+
+   const bool skip_sanity = g_conf()->mon_mds_skip_sanity;
+   if (!skip_sanity) {
+     get_fsmap().sanity();
+   }
+
+   check_subs();
+ }
+
+ // Populate pending_metadata via load_metadata(); its result is
+ // intentionally discarded.
+ void MDSMonitor::init()
+ {
+   static_cast<void>(load_metadata(pending_metadata));
+ }
+
+ // Start a new pending FSMap. When the OSD monitor is readable, the pending
+ // map is sanitized against the current OSDMap using a "does this pool
+ // exist" predicate.
+ void MDSMonitor::create_pending()
+ {
+   auto &fsmap = PaxosFSMap::create_pending();
+
+   if (mon.osdmon()->is_readable()) {
+     const auto &osdmap = mon.osdmon()->osdmap;
+     auto pool_exists = [&osdmap](int64_t pool) {
+       return osdmap.have_pg_pool(pool);
+     };
+     fsmap.sanitize(pool_exists);
+   }
+
+   dout(10) << "create_pending e" << fsmap.get_epoch() << dendl;
+ }
+
+ // For every filesystem whose MDSMap epoch is not older than the FSMap epoch,
+ // recompute the quiesce-db cluster: the members are all daemons that hold a
+ // rank and are in STATE_ACTIVE, and the leader is the member with the lowest
+ // rank (ties broken in favour of the higher state). With no members the
+ // leader is MDS_GID_NONE.
+ void MDSMonitor::assign_quiesce_db_leader(FSMap &fsmap)
+ {
+   using info_ptr = MDSMap::mds_info_t const*;
+
+   // Heap ordering: an entry is "less" when it has a higher rank, or the same
+   // rank and a lower state, so top() yields the preferred leader.
+   auto less_leader = [](info_ptr l, info_ptr r) {
+     ceph_assert(l->rank != MDS_RANK_NONE);
+     ceph_assert(r->rank != MDS_RANK_NONE);
+     ceph_assert(l->state <= MDSMap::STATE_ACTIVE);
+     ceph_assert(r->state <= MDSMap::STATE_ACTIVE);
+     if (l->rank != r->rank) {
+       return l->rank > r->rank;
+     }
+     return l->state < r->state;
+   };
+   using candidate_heap =
+     std::priority_queue<info_ptr, std::vector<info_ptr>, decltype(less_leader)>;
+
+   for (const auto& [fscid, fs] : std::as_const(fsmap)) {
+     const auto& mdsmap = fs.get_mds_map();
+
+     // Nothing changed in this filesystem, so the previous result stands.
+     // NB: this shortcut is only valid while the selection depends on nothing
+     // outside this fsmap; revisit it if the selection logic grows new inputs.
+     if (mdsmap.get_epoch() < fsmap.get_epoch()) {
+       continue;
+     }
+
+     candidate_heap candidates(less_leader);
+     std::unordered_set<mds_gid_t> member_gids;
+
+     for (const auto& [gid, info] : mdsmap.get_mds_info()) {
+       // Only ranked daemons in STATE_ACTIVE take part.
+       if (info.rank == MDS_RANK_NONE || info.state != MDSMap::STATE_ACTIVE) {
+         continue;
+       }
+       candidates.push(&info);
+       member_gids.insert(info.global_id);
+     }
+
+     auto leader_gid = candidates.empty() ? MDS_GID_NONE : candidates.top()->global_id;
+
+     fsmap.modify_filesystem(fscid, [&leader_gid, &member_gids](auto &writable_fs) -> bool {
+       return writable_fs.get_mds_map().update_quiesce_db_cluster(leader_gid, std::move(member_gids));
+     });
+   }
+ }
+
+ /*
+  * Serialize the pending FSMap and its side data into transaction `t`.
+  *
+  * In order:
+  *  1. assign the quiesce-db leader, optionally sanity-check the map, stamp
+  *     its btime, and store the encoded map plus last_committed;
+  *  2. write every entry of pending_daemon_health under MDS_HEALTH_PREFIX and
+  *     erase every gid listed in pending_daemon_health_rm;
+  *  3. build the health checks from per-daemon metrics (pending data first,
+  *     stored data otherwise), expand the %num%/%plurals%/%isorare%/
+  *     %hasorhave% placeholders in each summary, and encode them.
+  *
+  * pending_daemon_health_rm is emptied only after step 3 so that the
+  * "skip removed daemons" guard in the health loop can actually fire.
+  */
+ void MDSMonitor::encode_pending(MonitorDBStore::TransactionRef t)
+ {
+   // Summary placeholders; compiled once rather than for every health check.
+   static const std::regex num_re("%num%");
+   static const std::regex plurals_re("%plurals%");
+   static const std::regex isorare_re("%isorare%");
+   static const std::regex hasorhave_re("%hasorhave%");
+
+   auto &pending = get_pending_fsmap_writeable();
+   auto epoch = pending.get_epoch();
+
+   assign_quiesce_db_leader(pending);
+
+   dout(10) << "encode_pending e" << epoch << dendl;
+
+   // print map iff 'debug mon = 30' or higher
+   print_map<30>(pending);
+   if (!g_conf()->mon_mds_skip_sanity) {
+     pending.sanity(true);
+   }
+   pending.set_btime();
+
+   // apply to paxos
+   ceph_assert(get_last_committed() + 1 == pending.get_epoch());
+   bufferlist pending_bl;
+   pending.encode(pending_bl, mon.get_quorum_con_features());
+
+   /* put everything in the transaction */
+   put_version(t, pending.get_epoch(), pending_bl);
+   put_last_committed(t, pending.get_epoch());
+
+   // Encode MDSHealth data
+   for (auto& [health_gid, daemon_health] : pending_daemon_health) {
+     bufferlist bl;
+     daemon_health.encode(bl);
+     t->put(MDS_HEALTH_PREFIX, stringify(health_gid), bl);
+   }
+
+   for (const auto& removed_gid : pending_daemon_health_rm) {
+     t->erase(MDS_HEALTH_PREFIX, stringify(removed_gid));
+   }
+   remove_from_metadata(pending, t);
+
+   // health
+   health_check_map_t new_checks;
+   const auto &info_map = pending.get_mds_info();
+   for (const auto& [gid, info] : info_map) {
+     // Health for this daemon is being erased in this very transaction; the
+     // store still holds the old value, so do not report it.
+     if (pending_daemon_health_rm.count(gid)) {
+       continue;
+     }
+     MDSHealth health;
+     auto p = pending_daemon_health.find(gid);
+     if (p != pending_daemon_health.end()) {
+       health = p->second;
+     } else {
+       bufferlist bl;
+       mon.store->get(MDS_HEALTH_PREFIX, stringify(gid), bl);
+       if (!bl.length()) {
+         derr << "Missing health data for MDS " << gid << dendl;
+         continue;
+       }
+       auto bl_i = bl.cbegin();
+       health.decode(bl_i);
+     }
+     for (const auto &metric : health.metrics) {
+       if (metric.type == MDS_HEALTH_DUMMY) {
+         continue;
+       }
+       const auto rank = info.rank;
+       auto& check = new_checks.get_or_add(
+         mds_metric_name(metric.type),
+         metric.sev,
+         mds_metric_summary(metric.type),
+         1);
+       ostringstream ss;
+       ss << "mds." << info.name << "(mds." << rank << "): " << metric.message;
+       // Metadata is appended as " k: v, k: v".
+       bool first = true;
+       for (const auto &kv : metric.metadata) {
+         ss << (first ? " " : ", ");
+         ss << kv.first << ": " << kv.second;
+         first = false;
+       }
+       check.detail.push_back(ss.str());
+     }
+   }
+   // Every removal has been written to `t` and honoured above.
+   pending_daemon_health_rm.clear();
+
+   pending.get_health_checks(&new_checks);
+   for (auto& [check_name, check] : new_checks.checks) {
+     // TODO: handle "client_count" metadata when summarizing
+     const auto count = check.detail.size();
+     const bool plural = count > 1;
+     auto& summary = check.summary;
+     summary = std::regex_replace(summary, num_re, stringify(count));
+     summary = std::regex_replace(summary, plurals_re, plural ? "s" : "");
+     summary = std::regex_replace(summary, isorare_re, plural ? "are" : "is");
+     summary = std::regex_replace(summary, hasorhave_re, plural ? "have" : "has");
+   }
+   encode_health(new_checks, t);
+ }
+
+ /*
+  * Compute the version up to which old FSMap epochs may be trimmed.
+  *
+  * The result starts at 0 and is raised by two independent rules:
+  *  - mon_mds_force_trim_to, when it is positive and not beyond the last
+  *    committed version;
+  *  - last_committed - mon_max_mdsmap_epochs, when more than that many
+  *    versions are currently retained and the value exceeds the current floor.
+  *
+  * The forced value is compared in version_t after the "> 0" check, so a
+  * large last-committed version is never narrowed to int.
+  */
+ version_t MDSMonitor::get_trim_to() const
+ {
+   const version_t last = get_last_committed();
+   version_t floor = 0;
+
+   const auto forced = g_conf()->mon_mds_force_trim_to;
+   if (forced > 0 && static_cast<version_t>(forced) <= last) {
+     floor = static_cast<version_t>(forced);
+     dout(10) << __func__ << " explicit mon_mds_force_trim_to = "
+              << floor << dendl;
+   }
+
+   unsigned max = g_conf()->mon_max_mdsmap_epochs;
+
+   // The first test guarantees last > max, so last - max cannot wrap.
+   if (last - get_first_committed() > max && floor < last - max) {
+     floor = last - max;
+   }
+
+   dout(20) << __func__ << " = " << floor << dendl;
+   return floor;
+ }
+
+ // Dispatch a request to the read-only handler for its message type.
+ // Beacons and offload-target messages go to their preprocess_* handlers;
+ // commands go to preprocess_command(), with a bad_cmd_get turned into an
+ // -EINVAL command reply. Any other type aborts.
+ bool MDSMonitor::preprocess_query(MonOpRequestRef op)
+ {
+   op->mark_mdsmon_event(__func__);
+   auto m = op->get_req<PaxosServiceMessage>();
+   dout(10) << "preprocess_query " << *m << " from " << m->get_orig_source()
+            << " " << m->get_orig_source_addrs() << dendl;
+
+   const auto type = m->get_type();
+
+   if (type == MSG_MDS_BEACON) {
+     return preprocess_beacon(op);
+   }
+   if (type == MSG_MDS_OFFLOAD_TARGETS) {
+     return preprocess_offload_targets(op);
+   }
+   if (type != MSG_MON_COMMAND) {
+     ceph_abort();
+     return true;
+   }
+
+   try {
+     return preprocess_command(op);
+   } catch (const bad_cmd_get& e) {
+     bufferlist bl;
+     mon.reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
+     return true;
+   }
+ }
+
+ // Record in last_beacon, keyed by the sender's gid, the current monotonic
+ // time and the sequence number carried by beacon `m`.
+ void MDSMonitor::_note_beacon(MMDSBeacon *m)
+ {
+   const mds_gid_t sender = mds_gid_t(m->get_global_id());
+   const version_t beacon_seq = m->get_seq();
+
+   dout(5) << "_note_beacon " << *m << " noting time" << dendl;
+
+   auto &entry = last_beacon[sender];
+   entry.stamp = mono_clock::now();
+   entry.seq = beacon_seq;
+ }
+ /*
+  * First-pass handling of an MMDSBeacon.
+  *
+  * Returns true when the beacon is finished with here: either a reply was
+  * sent (label `reply`, or a null MDSMap for an unknown non-booting gid) or
+  * it was dropped via mon.no_reply() (label `ignore`).
+  *
+  * Returns false when it is not handled here: this monitor is not the
+  * leader, the gid is booting and not yet in the pending map, the state is
+  * DNE, or the beacon differs from what is recorded (daemon was laggy,
+  * join fscid changed, state changed, health metrics changed).
+  *
+  * Locals are declared ahead of the first goto because the labels at the
+  * bottom are reached from many places in the function.
+  */
+ bool MDSMonitor::preprocess_beacon(MonOpRequestRef op)
+ {
+ op->mark_mdsmon_event(__func__);
+ auto m = op->get_req<MMDSBeacon>();
+ MDSMap::DaemonState state = m->get_state();
+ mds_gid_t gid = m->get_global_id();
+ version_t seq = m->get_seq();
+ MDSMap::mds_info_t info;
+ epoch_t effective_epoch = 0; // must be set (> 0) before reaching `reply`
+
+ const auto &fsmap = get_fsmap();
+
+ // check privileges, ignore if fails
+ MonSession *session = op->get_session();
+ if (!session)
+ goto ignore;
+ if (!session->is_capable("mds", MON_CAP_X)) {
+ dout(0) << "preprocess_beacon got MMDSBeacon from entity with insufficient privileges "
+ << session->caps << dendl;
+ goto ignore;
+ }
+
+ if (m->get_fsid() != mon.monmap->fsid) {
+ dout(0) << "preprocess_beacon on fsid " << m->get_fsid() << " != " << mon.monmap->fsid << dendl;
+ goto ignore;
+ }
+
+ dout(5) << "preprocess_beacon " << *m
+ << " from " << m->get_orig_source()
+ << " " << m->get_orig_source_addrs()
+ << " " << m->get_compat()
+ << dendl;
+
+ // make sure the address has a port
+ if (m->get_orig_source_addr().get_port() == 0) {
+ dout(1) << " ignoring boot message without a port" << dendl;
+ goto ignore;
+ }
+
+ // fw to leader?
+ if (!is_leader())
+ return false;
+
+ // booted, but not in map?
+ if (!fsmap.gid_exists(gid)) {
+ if (state != MDSMap::STATE_BOOT) {
+ dout(7) << "mds_beacon " << *m << " is not in fsmap (state "
+ << ceph_mds_state_name(state) << ")" << dendl;
+
+ /* We can't send an MDSMap this MDS was a part of because we no longer
+ * know which FS it was part of. Nor does this matter. Sending an empty
+ * MDSMap is sufficient for getting the MDS to respawn.
+ */
+ auto m = make_message<MMDSMap>(mon.monmap->fsid, MDSMap::create_null_mdsmap()); // shadows the beacon `m` in this scope
+ mon.send_reply(op, m.detach());
+ return true;
+ } else {
+ /* check if we've already recorded its entry in pending */
+ const auto& pending = get_pending_fsmap();
+ if (pending.gid_exists(gid)) {
+ /* MDS is already booted. */
+ goto ignore;
+ } else {
+ return false; // not booted yet.
+ }
+ }
+ }
+ dout(10) << __func__ << ": GID exists in map: " << gid << dendl;
+ info = fsmap.get_info_gid(gid);
+
+ if (state == MDSMap::STATE_DNE) {
+ return false;
+ }
+
+ // old seq?
+ if (info.state_seq > seq) {
+ dout(7) << "mds_beacon " << *m << " has old seq, ignoring" << dendl;
+ goto ignore;
+ }
+
+ // Work out the latest epoch that this daemon should have seen:
+ // the standby epoch for a gid without a filesystem, else that
+ // filesystem's MDSMap epoch.
+ {
+ fs_cluster_id_t fscid = fsmap.fscid_from_gid(gid);
+ if (fscid == FS_CLUSTER_ID_NONE) {
+ effective_epoch = fsmap.get_standby_epochs().at(gid);
+ } else {
+ effective_epoch = fsmap.get_filesystem(fscid).get_mds_map().get_epoch();
+ }
+ if (effective_epoch != m->get_last_epoch_seen()) {
+ dout(10) << "mds_beacon " << *m
+ << " ignoring requested state, because mds hasn't seen latest map" << dendl;
+ goto reply;
+ }
+ }
+
+ if (info.laggy()) {
+ _note_beacon(m);
+ return false; // no longer laggy, need to update map.
+ }
+ if (state == MDSMap::STATE_BOOT) {
+ // ignore, already booted.
+ goto ignore;
+ }
+
+ // did the join_fscid change
+ // (a named filesystem that is not found maps to FS_CLUSTER_ID_NONE)
+ if (m->get_fs().size()) {
+ fs_cluster_id_t fscid = FS_CLUSTER_ID_NONE;
+ auto* fsp = fsmap.get_filesystem(m->get_fs());
+ if (fsp) {
+ fscid = fsp->get_fscid();
+ }
+ if (info.join_fscid != fscid) {
+ dout(10) << __func__ << " standby mds_join_fs changed to " << fscid
+ << " (" << m->get_fs() << ")" << dendl;
+ _note_beacon(m);
+ return false;
+ }
+ } else {
+ if (info.join_fscid != FS_CLUSTER_ID_NONE) {
+ dout(10) << __func__ << " standby mds_join_fs was cleared" << dendl;
+ _note_beacon(m);
+ return false;
+ }
+ }
+
+ // is there a state change here?
+ if (info.state != state) {
+ _note_beacon(m);
+ return false;
+ }
+
+ // Compare the known daemon health with m->get_health() and return
+ // false (i.e. require a proposal) if they do not match, so that the
+ // stored health gets updated.
+ if (!(pending_daemon_health[gid] == m->get_health())) { // NOTE(review): operator[] creates an empty entry when gid is absent - confirm intended
+ dout(10) << __func__ << " health metrics for gid " << gid << " were updated" << dendl;
+ _note_beacon(m);
+ return false;
+ }
+
+ reply:
+ // note time and reply; the reply echoes the beacon's gid, name, state and
+ // seq together with effective_epoch
+ ceph_assert(effective_epoch > 0);
+ _note_beacon(m);
+ {
+ auto beacon = make_message<MMDSBeacon>(mon.monmap->fsid,
+ m->get_global_id(), m->get_name(), effective_epoch,
+ state, seq, CEPH_FEATURES_SUPPORTED_DEFAULT);
+ mon.send_reply(op, beacon.detach());
+ }
+ return true;
+
+ ignore:
+ // I won't reply to this beacon, drop it.
+ mon.no_reply(op);
+ return true;
+ }
+
+ // Read-only check of an MMDSLoadTargets message.
+ // Returns true (after mon.no_reply) when the message is dropped: there is no
+ // session, the sender lacks the "mds" X capability, or the gid is known and
+ // its export targets already equal the requested ones. Returns false
+ // otherwise.
+ bool MDSMonitor::preprocess_offload_targets(MonOpRequestRef op)
+ {
+   op->mark_mdsmon_event(__func__);
+   auto m = op->get_req<MMDSLoadTargets>();
+   dout(10) << "preprocess_offload_targets " << *m << " from " << m->get_orig_source() << dendl;
+
+   const auto &fsmap = get_fsmap();
+
+   // Common exit for every "drop this message" case.
+   auto drop = [this, &op]() {
+     mon.no_reply(op);
+     return true;
+   };
+
+   MonSession *session = op->get_session();
+   if (!session) {
+     return drop();
+   }
+   if (!session->is_capable("mds", MON_CAP_X)) {
+     dout(0) << "preprocess_offload_targets got MMDSLoadTargets from entity with insufficient caps "
+             << session->caps << dendl;
+     return drop();
+   }
+
+   const bool known_gid = fsmap.gid_exists(m->global_id);
+   if (known_gid && m->targets == fsmap.get_info_gid(m->global_id).export_targets) {
+     return drop();
+   }
+
+   return false;
+ }
+
+
+ // Dispatch a request to the mutating handler for its message type and
+ // return that handler's result. Paxos is plugged around the handler so that
+ // changes to pending are batched with any osdmap changes. A bad_cmd_get
+ // from prepare_command() becomes an -EINVAL command reply and a false
+ // result. Any unknown message type aborts.
+ bool MDSMonitor::prepare_update(MonOpRequestRef op)
+ {
+   op->mark_mdsmon_event(__func__);
+   auto m = op->get_req<PaxosServiceMessage>();
+   dout(7) << "prepare_update " << *m << dendl;
+
+   bool handled = false;
+
+   paxos.plug();
+
+   const auto type = m->get_type();
+   if (type == MSG_MDS_BEACON) {
+     handled = prepare_beacon(op);
+   } else if (type == MSG_MON_COMMAND) {
+     try {
+       handled = prepare_command(op);
+     } catch (const bad_cmd_get& e) {
+       bufferlist bl;
+       mon.reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
+       handled = false;
+     }
+   } else if (type == MSG_MDS_OFFLOAD_TARGETS) {
+     handled = prepare_offload_targets(op);
+   } else {
+     ceph_abort();
+   }
+
+   paxos.unplug();
+
+   return handled;
+ }
+ /*
+  * Apply an MMDSBeacon to the pending FSMap.
+  *
+  * Steps, in order:
+  *  - log health metric types that appeared or disappeared relative to
+  *    pending_daemon_health[gid], then store the beacon's health there;
+  *  - STATE_BOOT: optionally fail older daemons with the same name, insert
+  *    the gid as STATE_STANDBY if it is not already pending, start its
+  *    beacon timer and update its metadata;
+  *  - any other state: validate the beacon against the pending info
+  *    (compat unchanged, legal transition), clear a laggy flag, update
+  *    join_fscid, then handle STOPPED / DAMAGED / ordinary state changes.
+  *
+  * Returns false only when the op was parked on the OSD monitor becoming
+  * writeable (it is retried through C_RetryMessage). Every other path
+  * returns true; the reply is sent from a callback registered with
+  * wait_for_finished_proposal(), except on the compat-mismatch path, which
+  * registers no callback.
+  *
+  * Labels: `evict` fails the gid and falls into `null`; `null` answers with
+  * a null MDSMap once the proposal finishes.
+  */
+ bool MDSMonitor::prepare_beacon(MonOpRequestRef op)
+ {
+ op->mark_mdsmon_event(__func__);
+ auto m = op->get_req<MMDSBeacon>();
+ // -- this is an update --
+ dout(12) << "prepare_beacon " << *m << " from " << m->get_orig_source()
+ << " " << m->get_orig_source_addrs() << dendl;
+ entity_addrvec_t addrs = m->get_orig_source_addrs();
+ mds_gid_t gid = m->get_global_id();
+ MDSMap::DaemonState state = m->get_state();
+ version_t seq = m->get_seq();
+
+ auto &pending = get_pending_fsmap_writeable();
+
+ dout(15) << __func__ << " got health from gid " << gid << " with " << m->get_health().metrics.size() << " metrics." << dendl;
+
+ // Calculate deltas of health metrics created and removed
+ // Do this by type rather than MDSHealthMetric equality, because messages can
+ // change a lot when they include e.g. a number of items.
+ const auto &old_health = pending_daemon_health[gid].metrics;
+ const auto &new_health = m->get_health().metrics;
+
+ set<mds_metric_t> old_types;
+ for (const auto &i : old_health) {
+ old_types.insert(i.type);
+ }
+
+ set<mds_metric_t> new_types;
+ for (const auto &i : new_health) {
+ if (i.type == MDS_HEALTH_DUMMY) {
+ continue;
+ }
+ new_types.insert(i.type);
+ }
+
+ // Newly appearing metric types are only written to the debug log.
+ for (const auto &new_metric: new_health) {
+ if (new_metric.type == MDS_HEALTH_DUMMY) {
+ continue;
+ }
+ if (old_types.count(new_metric.type) == 0) {
+ dout(10) << "MDS health message (" << m->get_orig_source()
+ << "): " << new_metric.sev << " " << new_metric.message << dendl;
+ }
+ }
+
+ // Log the disappearance of health messages at INFO
+ for (const auto &old_metric : old_health) {
+ if (new_types.count(old_metric.type) == 0) {
+ mon.clog->info() << "MDS health message cleared ("
+ << m->get_orig_source() << "): " << old_metric.message;
+ }
+ }
+
+ // Store health (old_health refers to this entry, so it is only
+ // overwritten after the comparisons above)
+ pending_daemon_health[gid] = m->get_health();
+
+ const auto& cs = m->get_compat();
+ if (state == MDSMap::STATE_BOOT) {
+ // zap previous instance of this name?
+ if (g_conf()->mds_enforce_unique_name) {
+ bool failed_mds = false;
+ while (mds_gid_t existing = pending.find_mds_gid_by_name(m->get_name())) {
+ if (!mon.osdmon()->is_writeable()) {
+ mon.osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
+ return false;
+ }
+ const auto& existing_info = pending.get_info_gid(existing);
+ mon.clog->info() << existing_info.human_name() << " restarted";
+ fail_mds_gid(pending, existing);
+ failed_mds = true;
+ }
+ if (failed_mds) {
+ ceph_assert(mon.osdmon()->is_writeable());
+ request_proposal(mon.osdmon());
+ }
+ }
+
+ // Add this daemon to the map
+ if (!pending.gid_exists(gid)) {
+ MDSMap::mds_info_t new_info;
+ new_info.global_id = gid;
+ new_info.name = m->get_name();
+ new_info.addrs = addrs;
+ new_info.mds_features = m->get_mds_features();
+ new_info.state = MDSMap::STATE_STANDBY;
+ new_info.state_seq = seq;
+ new_info.compat = cs;
+ if (m->get_fs().size()) {
+ fs_cluster_id_t fscid = FS_CLUSTER_ID_NONE;
+ auto* fsp = pending.get_filesystem(m->get_fs());
+ if (fsp) {
+ fscid = fsp->get_fscid();
+ }
+ new_info.join_fscid = fscid;
+ }
+ pending.insert(new_info);
+ }
+
+ // initialize the beacon timer
+ auto &beacon = last_beacon[gid];
+ beacon.stamp = mono_clock::now();
+ beacon.seq = seq;
+
+ update_metadata(m->get_global_id(), m->get_sys_info());
+ } else {
+ // state update
+
+ if (!pending.gid_exists(gid)) {
+ /* gid has been removed from pending, send null map */
+ dout(5) << "mds_beacon " << *m << " is not in fsmap (state "
+ << ceph_mds_state_name(state) << ")" << dendl;
+
+ /* We can't send an MDSMap this MDS was a part of because we no longer
+ * know which FS it was part of. Nor does this matter. Sending an empty
+ * MDSMap is sufficient for getting the MDS to respawn.
+ */
+ goto null;
+ }
+
+ const auto& info = pending.get_info_gid(gid);
+
+ // did the reported compat change? That's illegal!
+ if (cs.compare(info.compat) != 0) {
+ if (!mon.osdmon()->is_writeable()) {
+ mon.osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
+ return false;
+ }
+ mon.clog->warn() << info.human_name() << " compat changed unexpectedly";
+ fail_mds_gid(pending, gid);
+ request_proposal(mon.osdmon());
+ return true;
+ }
+
+ if (state == MDSMap::STATE_DNE) {
+ dout(1) << __func__ << ": DNE from " << info << dendl;
+ goto evict;
+ }
+
+ // legal state change?
+ if ((info.state == MDSMap::STATE_STANDBY && state != info.state) ||
+ (info.state == MDSMap::STATE_STANDBY_REPLAY && state != info.state && state != MDSMap::STATE_DAMAGED)) {
+ // Standby daemons should never modify their own state.
+ // Except that standby-replay can indicate the rank is damaged due to failure to replay.
+ // Reject any attempts to do so.
+ derr << "standby " << gid << " attempted to change state to "
+ << ceph_mds_state_name(state) << ", rejecting" << dendl;
+ goto evict;
+ } else if (info.state != MDSMap::STATE_STANDBY && state != info.state &&
+ !MDSMap::state_transition_valid(info.state, state)) {
+ // Validate state transitions for daemons that hold a rank
+ derr << "daemon " << gid << " (rank " << info.rank << ") "
+ << "reported invalid state transition "
+ << ceph_mds_state_name(info.state) << " -> "
+ << ceph_mds_state_name(state) << dendl;
+ goto evict;
+ }
+
+ if (info.laggy()) {
+ dout(1) << "prepare_beacon clearing laggy flag on " << addrs << dendl;
+ pending.modify_daemon(info.global_id, [](auto& info) // lambda parameter shadows the outer `info`
+ {
+ info.clear_laggy();
+ }
+ );
+ }
+
+ dout(5) << "prepare_beacon mds." << info.rank
+ << " " << ceph_mds_state_name(info.state)
+ << " -> " << ceph_mds_state_name(state)
+ << dendl;
+
+ // An empty or unknown filesystem name resets join_fscid to
+ // FS_CLUSTER_ID_NONE.
+ fs_cluster_id_t fscid = FS_CLUSTER_ID_NONE;
+ if (m->get_fs().size()) {
+ auto* fsp = pending.get_filesystem(m->get_fs());
+ if (fsp) {
+ fscid = fsp->get_fscid();
+ }
+ }
+ pending.modify_daemon(gid, [fscid](auto& info) {
+ info.join_fscid = fscid;
+ });
+
+ if (state == MDSMap::STATE_STOPPED) {
+ const auto fscid = pending.fscid_from_gid(gid);
+ const auto& fs = pending.get_filesystem(fscid);
+
+ mon.clog->info() << info.human_name() << " finished "
+ << "stopping rank " << info.rank << " in filesystem "
+ << fs.get_mds_map().fs_name << " (now has "
+ << fs.get_mds_map().get_num_in_mds() - 1 << " ranks)";
+
+ // Drop beacon and health tracking for every gid removed by stop(),
+ // plus the stopping gid itself.
+ auto erased = pending.stop(gid);
+ erased.push_back(gid);
+
+ for (const auto& erased_gid : erased) {
+ last_beacon.erase(erased_gid);
+ if (pending_daemon_health.count(erased_gid)) {
+ pending_daemon_health.erase(erased_gid);
+ pending_daemon_health_rm.insert(erased_gid);
+ }
+ }
+ } else if (state == MDSMap::STATE_DAMAGED) {
+ if (!mon.osdmon()->is_writeable()) {
+ dout(1) << __func__ << ": DAMAGED from rank " << info.rank
+ << " waiting for osdmon writeable to blocklist it" << dendl;
+ mon.osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
+ return false;
+ }
+
+ auto rank = info.rank;
+
+ // Record this MDS rank as damaged, so that other daemons
+ // won't try to run it.
+ dout(0) << __func__ << ": marking rank " << rank << " damaged" << dendl;
+
+ auto& fs = pending.get_filesystem(gid);
+ auto rankgid = fs.get_mds_map().get_gid(rank);
+ auto rankinfo = pending.get_info_gid(rankgid);
+ auto followergid = fs.get_mds_map().get_standby_replay(rank);
+
+ // The reporter is either the rank holder or its standby-replay follower.
+ ceph_assert(gid == rankgid || gid == followergid);
+
+ utime_t until = ceph_clock_now();
+ until += g_conf().get_val<double>("mon_mds_blocklist_interval");
+ const auto blocklist_epoch = mon.osdmon()->blocklist(rankinfo.addrs, until);
+ if (followergid != MDS_GID_NONE) {
+ fail_mds_gid(pending, followergid);
+ last_beacon.erase(followergid);
+ }
+ request_proposal(mon.osdmon());
+ force_immediate_propose();
+ pending.damaged(rankgid, blocklist_epoch);
+ last_beacon.erase(rankgid);
+
+ /* MDS expects beacon reply back */
+ } else {
+ if (info.state != MDSMap::STATE_ACTIVE && state == MDSMap::STATE_ACTIVE) {
+ const auto& fscid = pending.fscid_from_gid(gid);
+ const auto& fs = pending.get_filesystem(fscid);
+ mon.clog->info() << info.human_name() << " is now active in "
+ << "filesystem " << fs.get_mds_map().fs_name << " as rank "
+ << info.rank;
+ }
+
+ // Made it through special cases and validations, record the
+ // daemon's reported state to the FSMap.
+ pending.modify_daemon(gid, [state, seq](auto& info) {
+ info.state = state;
+ info.state_seq = seq;
+ });
+ }
+ }
+
+ dout(5) << "prepare_beacon pending map now:" << dendl;
+ print_map(pending);
+
+ // Reply once the proposal completes: _updated() on success, no reply on
+ // -ECANCELED, otherwise dispatch the op again.
+ wait_for_finished_proposal(op, new LambdaContext([op, this](int r){
+ if (r >= 0)
+ _updated(op); // success
+ else if (r == -ECANCELED) {
+ mon.no_reply(op);
+ } else {
+ dispatch(op); // try again
+ }
+ }));
+
+ return true;
+
+ evict:
+ if (!mon.osdmon()->is_writeable()) {
+ dout(1) << __func__ << ": waiting for writeable OSDMap to evict" << dendl;
+ mon.osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
+ return false;
+ }
+
+ fail_mds_gid(pending, gid);
+ request_proposal(mon.osdmon());
+ dout(5) << __func__ << ": pending map now:" << dendl;
+ print_map(pending);
+
+ goto null;
+
+ null:
+ wait_for_finished_proposal(op, new LambdaContext([op, this](int r){ // NOTE(review): no -ECANCELED case here, unlike the callback above - confirm intended
+ if (r >= 0) {
+ auto m = make_message<MMDSMap>(mon.monmap->fsid, MDSMap::create_null_mdsmap());
+ mon.send_reply(op, m.detach());
+ } else {
+ dispatch(op); // try again
+ }
+ }));
+
+ return true;
+ }
+
+ // Record the export targets carried by an MMDSLoadTargets message in the
+ // pending FSMap, but only when the sending gid currently holds a rank.
+ // No reply is ever sent. Returns true when pending was changed, false
+ // otherwise.
+ bool MDSMonitor::prepare_offload_targets(MonOpRequestRef op)
+ {
+   auto &pending = get_pending_fsmap_writeable();
+
+   op->mark_mdsmon_event(__func__);
+   auto m = op->get_req<MMDSLoadTargets>();
+   const mds_gid_t gid = m->global_id;
+
+   const bool has_rank = pending.gid_has_rank(gid);
+   if (!has_rank) {
+     dout(10) << "prepare_offload_targets " << gid << " not in map" << dendl;
+   } else {
+     dout(10) << "prepare_offload_targets " << gid << " " << m->targets << dendl;
+     pending.update_export_targets(gid, m->targets);
+   }
+
+   mon.no_reply(op);
+   return has_rank;
+ }
+
+ // No MDS-specific policy: the decision and `delay` both come from the
+ // PaxosService base implementation.
+ bool MDSMonitor::should_propose(double& delay)
+ {
+   const bool propose_now = PaxosService::should_propose(delay);
+   return propose_now;
+ }
+
+ // Answer a beacon: log the sender and its state to the cluster debug log,
+ // then reply. A daemon reporting STATE_STOPPED is sent a null MDSMap; any
+ // other state gets a beacon echoing its gid, name, state and seq together
+ // with the current FSMap epoch.
+ void MDSMonitor::_updated(MonOpRequestRef op)
+ {
+   const auto &fsmap = get_fsmap();
+   op->mark_mdsmon_event(__func__);
+   auto m = op->get_req<MMDSBeacon>();
+   dout(10) << "_updated " << m->get_orig_source() << " " << *m << dendl;
+   mon.clog->debug() << m->get_orig_source() << " "
+                     << m->get_orig_source_addrs() << " "
+                     << ceph_mds_state_name(m->get_state());
+
+   if (m->get_state() != MDSMap::STATE_STOPPED) {
+     auto ack = make_message<MMDSBeacon>(mon.monmap->fsid,
+                                         m->get_global_id(), m->get_name(), fsmap.get_epoch(),
+                                         m->get_state(), m->get_seq(), CEPH_FEATURES_SUPPORTED_DEFAULT);
+     mon.send_reply(op, ack.detach());
+     return;
+   }
+
+   // send the map manually (they're out of the map, so they won't get it automatically)
+   auto null_map = make_message<MMDSMap>(mon.monmap->fsid, MDSMap::create_null_mdsmap());
+   mon.send_reply(op, null_map.detach());
+ }
+
+ // Runs one tick immediately; on the leader it also writes the current
+ // FSMap to the cluster debug log.
+ void MDSMonitor::on_active()
+ {
+   tick();
+
+   if (!is_leader()) {
+     return;
+   }
+   mon.clog->debug() << "fsmap " << get_fsmap();
+ }
+
+ // Emit into `f` an "fsmap" object section holding the current FSMap,
+ // followed by the first and last committed versions as unsigned fields.
+ void MDSMonitor::dump_info(Formatter *f)
+ {
+   {
+     f->open_object_section("fsmap");
+     get_fsmap().dump(f);
+     f->close_section();
+   }
+
+   const auto first_committed = get_first_committed();
+   f->dump_unsigned("mdsmap_first_committed", first_committed);
+
+   const auto last_committed = get_last_committed();
+   f->dump_unsigned("mdsmap_last_committed", last_committed);
+ }
+
+bool MDSMonitor::preprocess_command(MonOpRequestRef op)
+{
+ op->mark_mdsmon_event(__func__);
+ auto m = op->get_req<MMonCommand>();
+ int r = -1;
+ bufferlist rdata;
+ stringstream ss, ds;
+
+ cmdmap_t cmdmap;
+ if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
+ // ss has reason for failure
+ string rs = ss.str();
+ mon.reply_command(op, -EINVAL, rs, rdata, get_last_committed());
+ return true;
+ }
+
+ string prefix;
+ cmd_getval(cmdmap, "prefix", prefix);
+ string format = cmd_getval_or<string>(cmdmap, "format", "plain");
+ std::unique_ptr<Formatter> f(Formatter::create(format));
+
+ MonSession *session = op->get_session();
+ if (!session) {
+ mon.reply_command(op, -EACCES, "access denied", rdata, get_last_committed());
+ return true;
+ }
+
+ // to use const qualifier filter fsmap beforehand
+ FSMap _fsmap_copy = get_fsmap();
+ _fsmap_copy.filter(session->get_allowed_fs_names());
+ const auto& fsmap = _fsmap_copy;
+
+ if (prefix == "mds stat") {
+ if (f) {
+ f->open_object_section("mds_stat");
+ dump_info(f.get());
+ f->close_section();
+ f->flush(ds);
+ } else {
+ ds << fsmap;
+ }
+ r = 0;
+ } else if (prefix == "mds last-seen") {
+ std::string id;
+ cmd_getval(cmdmap, "id", id);
+
+ dout(10) << "last seen check for " << id << dendl;
+
+ auto& history = get_fsmap_history();
+ auto now = real_clock::now();
+ bool found = false;
+ /* Special case:
+ * If the mons consider the MDS "in" the latest FSMap, then the mds
+ * is always "last seen" **now** (for the purposes of this API). We
+ * don't look at past beacons because that is only managed by the
+ * leader and the logic is fudged in places in the event of suspected
+ * network partitions.
+ */
+ std::chrono::seconds since = std::chrono::seconds(0);
+
+ for (auto& [epoch, fsmaph] : boost::adaptors::reverse(history)) {
+ dout(25) << "looking at epoch " << epoch << dendl;
+ auto* info = fsmaph.find_by_name(id);
+ if (info) {
+ dout(10) << "found: " << *info << dendl;
+ found = true;
+ if (f) {
+ f->open_object_section("mds last-seen");
+ f->dump_object("info", *info);
+ f->dump_string("last-seen", fmt::format("{}", since));
+ f->dump_int("epoch", epoch);
+ f->close_section();
+ f->flush(ds);
+ } else {
+ ds << fmt::format("{}", since);
+ }
+ break;
+ }
+ /* If the MDS appears in the next epoch, then it went away as of this epoch's btime.
+ */
+ since = std::chrono::duration_cast<std::chrono::seconds>(now - fsmaph.get_btime());
+ }
+ if (found) {
+ r = 0;
+ } else {
+ ss << "mds " << id << " not found in recent FSMaps";
+ r = -ENOENT;
+ }
+ } else if (prefix == "mds ok-to-stop") {
+ vector<string> ids;
+ if (!cmd_getval(cmdmap, "ids", ids)) {
+ r = -EINVAL;
+ ss << "must specify mds id";
+ goto out;
+ }
+ if (fsmap.is_any_degraded()) {
+ ss << "one or more filesystems is currently degraded";
+ r = -EBUSY;
+ goto out;
+ }
+ set<mds_gid_t> stopping;
+ for (auto& id : ids) {
+ ostringstream ess;
+ mds_gid_t gid = gid_from_arg(fsmap, id, ess);
+ if (gid == MDS_GID_NONE) {
+ // the mds doesn't exist, but no file systems are unhappy, so losing it
+ // can't have any effect.
+ continue;
+ }
+ stopping.insert(gid);
+ }
+ set<mds_gid_t> active;
+ set<mds_gid_t> standby;
+ for (auto gid : stopping) {
+ if (fsmap.gid_has_rank(gid)) {
+ // ignore standby-replay daemons (at this level)
+ if (!fsmap.is_standby_replay(gid)) {
+ auto standby = fsmap.get_standby_replay(gid);
+ if (standby == MDS_GID_NONE ||
+ stopping.count(standby)) {
+ // no standby-replay, or we're also stopping the standby-replay
+ // for this mds
+ active.insert(gid);
+ }
+ }
+ } else {
+ // net loss of a standby
+ standby.insert(gid);
+ }
+ }
+ if (fsmap.get_num_standby() - standby.size() < active.size()) {
+ r = -EBUSY;
+ ss << "insufficent standby MDS daemons to stop active gids "
+ << stringify(active)
+ << " and/or standby gids " << stringify(standby);;
+ goto out;
+ }
+ r = 0;
+ ss << "should be safe to stop " << ids;
+ } else if (prefix == "fs dump") {
+ int64_t epocharg;
+ epoch_t epoch;
+
+ const FSMap *fsmapp = &fsmap;
+ FSMap dummy;
+ if (cmd_getval(cmdmap, "epoch", epocharg)) {
+ epoch = epocharg;
+ bufferlist b;
+ int err = get_version(epoch, b);
+ if (err == -ENOENT) {
+ r = -ENOENT;
+ goto out;
+ } else {
+ ceph_assert(err == 0);
+ ceph_assert(b.length());
+ dummy.decode(b);
+ fsmapp = &dummy;
+ }
+ }
+
+ stringstream ds;
+ if (f != NULL) {
+ f->open_object_section("fsmap");
+ fsmapp->dump(f.get());
+ f->close_section();
+ f->flush(ds);
+ r = 0;
+ } else {
+ fsmapp->print(ds);
+ r = 0;
+ }
+
+ rdata.append(ds);
+ ss << "dumped fsmap epoch " << fsmapp->get_epoch();
+ } else if (prefix == "mds metadata") {
+ if (!f)
+ f.reset(Formatter::create("json-pretty"));
+
+ string who;
+ bool all = !cmd_getval(cmdmap, "who", who);
+ dout(1) << "all = " << all << dendl;
+ if (all) {
+ r = 0;
+ // Dump all MDSs' metadata
+ const auto all_info = fsmap.get_mds_info();
+
+ f->open_array_section("mds_metadata");
+ for(const auto &i : all_info) {
+ const auto &info = i.second;
+
+ f->open_object_section("mds");
+ f->dump_string("name", info.name);
+ ostringstream get_err;
+ r = dump_metadata(fsmap, info.name, f.get(), get_err);
+ if (r == -EINVAL || r == -ENOENT) {
+ // Drop error, list what metadata we do have
+ dout(1) << get_err.str() << dendl;
+ r = 0;
+ } else if (r != 0) {
+ derr << "Unexpected error reading metadata: " << cpp_strerror(r)
+ << dendl;
+ ss << get_err.str();
+ f->close_section();
+ break;
+ }
+ f->close_section();
+ }
+ f->close_section();
+ } else {
+ // Dump a single daemon's metadata
+ f->open_object_section("mds_metadata");
+ r = dump_metadata(fsmap, who, f.get(), ss);
+ f->close_section();
+ }
+ f->flush(ds);
+ } else if (prefix == "mds versions") {
+ if (!f)
+ f.reset(Formatter::create("json-pretty"));
+ count_metadata("ceph_version", f.get());
+ f->flush(ds);
+ r = 0;
+ } else if (prefix == "mds count-metadata") {
+ if (!f)
+ f.reset(Formatter::create("json-pretty"));
+ string field;
+ cmd_getval(cmdmap, "property", field);
+ count_metadata(field, f.get());
+ f->flush(ds);
+ r = 0;
+ } else if (prefix == "fs compat show") {
+ string fs_name;
+ cmd_getval(cmdmap, "fs_name", fs_name);
+ auto* fsp = fsmap.get_filesystem(fs_name);
+ if (fsp == nullptr) {
+ ss << "filesystem '" << fs_name << "' not found";
+ r = -ENOENT;
+ goto out;
+ }
+
+ if (f) {
+ f->open_object_section("mds_compat");
+ fsp->get_mds_map().compat.dump(f.get());
+ f->close_section();
+ f->flush(ds);
+ } else {
+ ds << fsp->get_mds_map().compat;
+ }
+ r = 0;
+ } else if (prefix == "mds compat show") {
+ if (f) {
+ f->open_object_section("mds_compat");
+ fsmap.get_default_compat().dump(f.get());
+ f->close_section();
+ f->flush(ds);
+ } else {
+ ds << fsmap.get_default_compat();
+ }
+ r = 0;
+ } else if (prefix == "fs get") {
+ string fs_name;
+ cmd_getval(cmdmap, "fs_name", fs_name);
+ auto* fsp = fsmap.get_filesystem(fs_name);
+ if (fsp == nullptr) {
+ ss << "filesystem '" << fs_name << "' not found";
+ r = -ENOENT;
+ } else {
+ if (f != nullptr) {
+ f->open_object_section("filesystem");
+ fsp->dump(f.get());
+ f->close_section();
+ f->flush(ds);
+ r = 0;
+ } else {
+ fsp->print(ds);
+ r = 0;
+ }
+ }
+ } else if (prefix == "fs ls") {
+ if (f) {
+ f->open_array_section("filesystems");
+ for (const auto& [fscid, fs] : fsmap) {
+ f->open_object_section("filesystem");
+ {
+ const MDSMap &mds_map = fs.get_mds_map();
+ f->dump_string("name", mds_map.fs_name);
+ /* Output both the names and IDs of pools, for use by
+ * humans and machines respectively */
+ f->dump_string("metadata_pool", mon.osdmon()->osdmap.get_pool_name(
+ mds_map.metadata_pool));
+ f->dump_int("metadata_pool_id", mds_map.metadata_pool);
+ f->open_array_section("data_pool_ids");
+ for (const auto &id : mds_map.data_pools) {
+ f->dump_int("data_pool_id", id);
+ }
+ f->close_section();
+
+ f->open_array_section("data_pools");
+ for (const auto &id : mds_map.data_pools) {
+ const auto &name = mon.osdmon()->osdmap.get_pool_name(id);
+ f->dump_string("data_pool", name);
+ }
+ f->close_section();
+ }
+ f->close_section();
+ }
+ f->close_section();
+ f->flush(ds);
+ } else {
+ for (const auto& [fscid, fs] : fsmap) {
+ const MDSMap &mds_map = fs.get_mds_map();
+ const string &md_pool_name = mon.osdmon()->osdmap.get_pool_name(
+ mds_map.metadata_pool);
+
+ ds << "name: " << mds_map.fs_name << ", metadata pool: "
+ << md_pool_name << ", data pools: [";
+ for (const auto &id : mds_map.data_pools) {
+ const string &pool_name = mon.osdmon()->osdmap.get_pool_name(id);
+ ds << pool_name << " ";
+ }
+ ds << "]" << std::endl;
+ }
+
+ if (fsmap.filesystem_count() == 0) {
+ ds << "No filesystems enabled" << std::endl;
+ }
+ }
+ r = 0;
+ } else if (prefix == "fs feature ls") {
+ if (f) {
+ f->open_array_section("cephfs_features");
+ for (size_t i = 0; i <= CEPHFS_FEATURE_MAX; ++i) {
+ f->open_object_section("feature");
+ f->dump_int("index", i);
+ f->dump_string("name", cephfs_feature_name(i));
+ f->close_section();
+ }
+ f->close_section();
+ f->flush(ds);
+ } else {
+ for (size_t i = 0; i <= CEPHFS_FEATURE_MAX; ++i) {
+ ds << i << " " << cephfs_feature_name(i) << std::endl;
+ }
+ }
+ r = 0;
+ } else if (prefix == "fs lsflags") {
+ string fs_name;
+ cmd_getval(cmdmap, "fs_name", fs_name);
+ auto* fsp = fsmap.get_filesystem(fs_name);
+ if (!fsp) {
+ ss << "filesystem '" << fs_name << "' not found";
+ r = -ENOENT;
+ } else {
+ auto& mds_map = fsp->get_mds_map();
+ if (f) {
+ mds_map.dump_flags_state(f.get());
+ f->flush(ds);
+ }
+ else {
+ mds_map.print_flags(ds);
+ }
+ r = 0;
+ }
+ }
+
+out:
+ if (r != -1) {
+ rdata.append(ds);
+ string rs;
+ getline(ss, rs);
+ mon.reply_command(op, r, rs, rdata, get_last_committed());
+ return true;
+ } else
+ return false;
+}
+
+ /**
+  * Remove the daemon identified by `gid` from `fsmap` and drop the monitor's
+  * per-daemon bookkeeping (last beacon, pending health) for it.
+  *
+  * A daemon that holds a rank and is not in standby-replay is blocklisted in
+  * the OSDMap first. The OSD monitor must be writeable (asserted).
+  *
+  * @param fsmap  map to remove the daemon from
+  * @param gid    daemon to fail
+  * @return true when the blocklist call yielded a non-zero epoch
+  */
+ bool MDSMonitor::fail_mds_gid(FSMap &fsmap, mds_gid_t gid)
+ {
+   const auto& daemon = fsmap.get_info_gid(gid);
+   dout(1) << "fail_mds_gid " << gid << " mds." << daemon.name << " role " << daemon.rank << dendl;
+
+   ceph_assert(mon.osdmon()->is_writeable());
+
+   const bool needs_blocklist =
+     daemon.rank >= 0 && daemon.state != MDSMap::STATE_STANDBY_REPLAY;
+
+   epoch_t blocklisted_at = 0;
+   if (needs_blocklist) {
+     utime_t expiry = ceph_clock_now();
+     expiry += g_conf().get_val<double>("mon_mds_blocklist_interval");
+     blocklisted_at = mon.osdmon()->blocklist(daemon.addrs, expiry);
+     // An eviction should not sit behind the usual proposal delay.
+     force_immediate_propose();
+   }
+
+   // NOTE(review): `daemon` is bound by reference and presumably refers to
+   // storage owned by fsmap; it is deliberately not touched after the erase.
+   fsmap.erase(gid, blocklisted_at);
+   last_beacon.erase(gid);
+   if (pending_daemon_health.erase(gid) > 0) {
+     // Remember that a health record was dropped for this gid.
+     pending_daemon_health_rm.insert(gid);
+   }
+
+   return blocklisted_at != 0;
+ }
+
+ /**
+  * Resolve a user-supplied MDS reference to a GID.
+  *
+  * `arg` is tried, in order, as a role whose rank is currently up, as a
+  * decimal GID present in `fsmap`, and finally as a daemon name.
+  *
+  * @param fsmap  map to resolve against
+  * @param arg    role, GID or daemon name as typed by the user
+  * @param ss     receives a message only when the daemon-name lookup fails
+  * @return the resolved GID, or MDS_GID_NONE when nothing matched
+  */
+ mds_gid_t MDSMonitor::gid_from_arg(const FSMap &fsmap, const string &arg, ostream &ss)
+ {
+   // 1) A role. Parse errors go to a scratch stream so 'ss' is not spammed.
+   mds_role_t role;
+   ostringstream discarded;
+   if (fsmap.parse_role(arg, &role, discarded) == 0) {
+     const auto& mds_map = fsmap.get_filesystem(role.fscid).get_mds_map();
+     if (mds_map.is_up(role.rank)) {
+       dout(10) << __func__ << ": validated rank/GID " << role
+                << " as a rank" << dendl;
+       return mds_map.get_mds_info(role.rank).global_id;
+     }
+   }
+
+   // 2) An integer GID.
+   string parse_err;
+   unsigned long long numeric = strict_strtoll(arg.c_str(), 10, &parse_err);
+   if (parse_err.empty()) {
+     // Not a role, but parses as an integer: might be a GID.
+     dout(10) << __func__ << ": treating MDS reference '" << arg
+              << "' as an integer " << numeric << dendl;
+
+     const mds_gid_t candidate = mds_gid_t(numeric);
+     if (fsmap.gid_exists(candidate)) {
+       return candidate;
+     }
+
+     dout(1) << __func__ << ": rank/GID " << arg
+             << " not a existent rank or GID" << dendl;
+     return MDS_GID_NONE;
+   }
+
+   // 3) Neither a role nor a number: treat it as a daemon name.
+   const MDSMap::mds_info_t *named = fsmap.find_by_name(arg);
+   if (named == nullptr) {
+     ss << "MDS named '" << arg
+        << "' does not exist, or is not up";
+     return MDS_GID_NONE;
+   }
+   dout(10) << __func__ << ": resolved MDS name '" << arg
+            << "' to GID " << named->global_id << dendl;
+   return named->global_id;
+ }
+
+ /**
+  * Fail the MDS named by `arg` (role, GID or daemon name).
+  *
+  * @param fsmap        map to modify
+  * @param ss           status / error text for the caller
+  * @param arg          user-supplied MDS reference
+  * @param failed_info  must be non-null; receives a copy of the daemon's info
+  *                     when a daemon is actually failed, otherwise untouched
+  * @return 0 when the daemon was failed or could not be resolved,
+  *         -EAGAIN when the OSD monitor is not writeable yet
+  */
+ int MDSMonitor::fail_mds(FSMap &fsmap, ostream &ss,
+                          const string &arg, MDSMap::mds_info_t *failed_info)
+ {
+   ceph_assert(failed_info != nullptr);
+
+   const mds_gid_t target = gid_from_arg(fsmap, arg, ss);
+   if (target == MDS_GID_NONE) {
+     // Nothing resolved: reported as success with failed_info left alone.
+     return 0;
+   }
+
+   if (!mon.osdmon()->is_writeable()) {
+     return -EAGAIN;
+   }
+
+   // Snapshot the daemon's info before it is removed from the map so the
+   // caller can tell which MDS (if any) was taken out.
+   *failed_info = fsmap.get_info_gid(target);
+
+   fail_mds_gid(fsmap, target);
+   ss << "failed mds gid " << target;
+
+   ceph_assert(mon.osdmon()->is_writeable());
+   request_proposal(mon.osdmon());
+   return 0;
+ }
+
+ /**
+  * Execute a mutating monitor command against the pending FSMap.
+  *
+  * The registered handlers are consulted first; if none claims the prefix the
+  * command falls through to filesystem_command().
+  *
+  * @param op  the command request
+  * @return true when the command succeeded (r >= 0) and its reply has been
+  *         deferred until commit; false when a reply was sent immediately or
+  *         the op was queued for retry
+  */
+ bool MDSMonitor::prepare_command(MonOpRequestRef op)
+ {
+   op->mark_mdsmon_event(__func__);
+   auto req = op->get_req<MMonCommand>();
+   int r = -EINVAL;
+   stringstream ss;
+   bufferlist rdata;
+
+   cmdmap_t cmdmap;
+   if (!cmdmap_from_json(req->cmd, &cmdmap, ss)) {
+     string parse_failure = ss.str();
+     mon.reply_command(op, -EINVAL, parse_failure, rdata, get_last_committed());
+     return false;
+   }
+
+   string prefix;
+   cmd_getval(cmdmap, "prefix", prefix);
+
+   // A command that is not tied to a valid session is refused outright.
+   if (op->get_session() == nullptr) {
+     mon.reply_command(op, -EACCES, "access denied", rdata, get_last_committed());
+     return false;
+   }
+
+   auto &pending = get_pending_fsmap_writeable();
+
+   // can_handle(): 1 = this handler owns the command, 0 = try the next one,
+   // anything else = stop and report that value.
+   bool settled = false;
+   for (const auto &handler : handlers) {
+     r = handler->can_handle(prefix, op, pending, cmdmap, ss);
+     if (r == 0) {
+       continue;
+     }
+     if (r == 1) {
+       r = handler->handle(&mon, pending, op, cmdmap, ss);
+       if (r == -EAGAIN) {
+         // message has been enqueued for retry; return.
+         dout(4) << __func__ << " enqueue for retry by prepare_command" << dendl;
+         return false;
+       }
+       if (r == 0) {
+         // On successful updates, print the updated map
+         print_map(pending);
+       }
+     }
+     // Successful or not, we're done: respond.
+     settled = true;
+     break;
+   }
+
+   if (!settled) {
+     r = filesystem_command(pending, op, prefix, cmdmap, ss);
+     if (r == -EAGAIN) {
+       // Do not reply, the message has been enqueued for retry
+       dout(4) << __func__ << " enqueue for retry by filesystem_command" << dendl;
+       return false;
+     }
+     if (r == -ENOSYS && ss.str().empty()) {
+       ss << "unrecognized command";
+     }
+   }
+
+   dout(4) << __func__ << " done, r=" << r << dendl;
+   /* Compose response */
+   string rs = ss.str();
+
+   if (r < 0) {
+     // reply immediately
+     mon.reply_command(op, r, rs, rdata, get_last_committed());
+     return false;
+   }
+
+   // success.. delay reply
+   wait_for_commit(op, new Monitor::C_Command(mon, op, r, rs,
+                                              get_last_committed() + 1));
+   return true;
+ }
+
+ /**
+  * Report whether any daemon in pending_daemon_health currently carries a
+  * health metric whose type is one of `warnings`.
+  *
+  * @param warnings  health warning types to look for
+  * @return true on the first match, false when there is none
+  */
+ bool MDSMonitor::has_health_warnings(vector<mds_metric_t> warnings)
+ {
+   // metric.type is the kind of health warning; only the kinds the caller
+   // listed are of interest.
+   const auto is_wanted = [&warnings](const auto& metric) {
+     return std::find(warnings.begin(), warnings.end(), metric.type) != warnings.end();
+   };
+
+   for (auto& [gid, health] : pending_daemon_health) {
+     if (std::any_of(health.metrics.begin(), health.metrics.end(), is_wanted)) {
+       return true;
+     }
+   }
+
+   return false;
+ }
+
+ /**
+  * Handle the "mds ..." mutating commands that are not served by one of the
+  * registered command handlers, applying them to `fsmap`.
+  *
+  * @param fsmap   pending map to modify
+  * @param op      originating request (used for session capability checks and
+  *                for re-queuing on retry)
+  * @param prefix  command prefix, e.g. "mds fail"
+  * @param cmdmap  parsed command arguments
+  * @param ss      receives the human-readable status / error text
+  * @return 0 on success (including several "nothing to do" cases),
+  *         -EAGAIN when the op was queued to be retried once the OSD monitor
+  *         is writeable, -ENOSYS when the prefix is not recognised here,
+  *         another negative errno on failure
+  */
+ int MDSMonitor::filesystem_command(
+     FSMap &fsmap,
+     MonOpRequestRef op,
+     string const &prefix,
+     const cmdmap_t& cmdmap,
+     stringstream &ss)
+ {
+   dout(4) << __func__ << " prefix='" << prefix << "'" << dendl;
+   op->mark_mdsmon_event(__func__);
+   int r = 0;
+   // NOTE(review): whostr is fetched but never read below; kept so argument
+   // parsing behaves exactly as before.
+   string whostr;
+   cmd_getval(cmdmap, "role", whostr);
+
+   if (prefix == "mds set_state") {
+     mds_gid_t gid;
+     if (!cmd_getval(cmdmap, "gid", gid)) {
+       ss << "error parsing 'gid' value '"
+          << cmd_vartype_stringify(cmdmap.at("gid")) << "'";
+       return -EINVAL;
+     }
+     MDSMap::DaemonState state;
+     if (!cmd_getval(cmdmap, "state", state)) {
+       ss << "error parsing 'state' string value '"
+          << cmd_vartype_stringify(cmdmap.at("state")) << "'";
+       return -EINVAL;
+     }
+     if (fsmap.gid_exists(gid, op->get_session()->get_allowed_fs_names())) {
+       fsmap.modify_daemon(gid, [state](auto& info) {
+         info.state = state;
+       });
+       ss << "set mds gid " << gid << " to state " << state << " "
+          << ceph_mds_state_name(state);
+       return 0;
+     }
+     // A gid that is not visible to this session falls through and returns
+     // r (0) without a message.
+   } else if (prefix == "mds fail") {
+     string who;
+     cmd_getval(cmdmap, "role_or_gid", who);
+     bool confirm = false;
+     cmd_getval(cmdmap, "yes_i_really_mean_it", confirm);
+
+     MDSMap::mds_info_t failed_info;
+     // Resolve into a scratch stream: gid_from_arg() may emit its own
+     // "does not exist" sentence, which would otherwise be glued directly in
+     // front of the message written below.
+     ostringstream lookup_err;
+     mds_gid_t gid = gid_from_arg(fsmap, who, lookup_err);
+     if (gid == MDS_GID_NONE) {
+       ss << "MDS named '" << who << "' does not exist, is not up or you "
+          << "lack the permission to see.";
+       return 0;
+     }
+     if (!fsmap.gid_exists(gid, op->get_session()->get_allowed_fs_names())) {
+       ss << "MDS named '" << who << "' does not exist, is not up or you "
+          << "lack the permission to see.";
+       return -EINVAL;
+     }
+     string_view fs_name = fsmap.fs_name_from_gid(gid);
+     if (!op->get_session()->fs_name_capable(fs_name, MON_CAP_W)) {
+       ss << "Permission denied.";
+       return -EPERM;
+     }
+
+     if (!confirm &&
+         has_health_warnings({MDS_HEALTH_TRIM, MDS_HEALTH_CACHE_OVERSIZED})) {
+       ss << errmsg_for_unhealthy_mds;
+       return -EPERM;
+     }
+
+     r = fail_mds(fsmap, ss, who, &failed_info);
+     if (r == -EAGAIN) {
+       mon.osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
+       return -EAGAIN; // don't propose yet; wait for message to be retried
+     } else if (r == 0) {
+       // Only log if we really did something (not when was already gone)
+       if (failed_info.global_id != MDS_GID_NONE) {
+         mon.clog->info() << failed_info.human_name() << " marked failed by "
+                          << op->get_session()->entity_name;
+       }
+     }
+   } else if (prefix == "mds rm") {
+     mds_gid_t gid;
+     if (!cmd_getval(cmdmap, "gid", gid)) {
+       ss << "error parsing 'gid' value '"
+          << cmd_vartype_stringify(cmdmap.at("gid")) << "'";
+       return -EINVAL;
+     }
+     if (!fsmap.gid_exists(gid, op->get_session()->get_allowed_fs_names())) {
+       ss << "mds gid " << gid << " does not exist";
+       return 0;
+     }
+     string_view fs_name = fsmap.fs_name_from_gid(gid);
+     if (!op->get_session()->fs_name_capable(fs_name, MON_CAP_W)) {
+       ss << "Permission denied.";
+       return -EPERM;
+     }
+     const auto &info = fsmap.get_info_gid(gid);
+     MDSMap::DaemonState state = info.state;
+     if (state > 0) {
+       ss << "cannot remove active mds." << info.name
+          << " rank " << info.rank;
+       return -EBUSY;
+     } else {
+       fsmap.erase(gid, {});
+       ss << "removed mds gid " << gid;
+       return 0;
+     }
+   } else if (prefix == "mds rmfailed") {
+     bool confirm = false;
+     cmd_getval(cmdmap, "yes_i_really_mean_it", confirm);
+     if (!confirm) {
+       ss << "WARNING: this can make your filesystem inaccessible! "
+             "Add --yes-i-really-mean-it if you are sure you wish to continue.";
+       return -EPERM;
+     }
+
+     string role_str;
+     cmd_getval(cmdmap, "role", role_str);
+     mds_role_t role;
+     const auto fs_names = op->get_session()->get_allowed_fs_names();
+     // Uses the function-level r; the former inner declaration shadowed it.
+     r = fsmap.parse_role(role_str, &role, ss, fs_names);
+     if (r < 0) {
+       ss << "invalid role '" << role_str << "'";
+       return -EINVAL;
+     }
+     string_view fs_name = fsmap.get_filesystem(role.fscid).get_mds_map().get_fs_name();
+     if (!op->get_session()->fs_name_capable(fs_name, MON_CAP_W)) {
+       ss << "Permission denied.";
+       return -EPERM;
+     }
+
+     fsmap.modify_filesystem(
+         role.fscid,
+         [role](auto&& fs)
+         {
+           fs.get_mds_map().failed.erase(role.rank);
+         });
+
+     ss << "removed failed mds." << role;
+     return 0;
+     /* TODO: convert to fs commands to update defaults */
+   } else if (prefix == "mds compat rm_compat") {
+     int64_t f;
+     if (!cmd_getval(cmdmap, "feature", f)) {
+       ss << "error parsing feature value '"
+          << cmd_vartype_stringify(cmdmap.at("feature")) << "'";
+       return -EINVAL;
+     }
+     auto& default_compat = fsmap.get_default_compat();
+     if (default_compat.compat.contains(f)) {
+       ss << "removing compat feature " << f;
+       default_compat.compat.remove(f);
+     } else {
+       ss << "compat feature " << f << " not present in " << default_compat;
+     }
+     r = 0;
+   } else if (prefix == "mds compat rm_incompat") {
+     int64_t f;
+     if (!cmd_getval(cmdmap, "feature", f)) {
+       ss << "error parsing feature value '"
+          << cmd_vartype_stringify(cmdmap.at("feature")) << "'";
+       return -EINVAL;
+     }
+     auto& default_compat = fsmap.get_default_compat();
+     if (default_compat.incompat.contains(f)) {
+       ss << "removing incompat feature " << f;
+       default_compat.incompat.remove(f);
+     } else {
+       ss << "incompat feature " << f << " not present in " << default_compat;
+     }
+     r = 0;
+   } else if (prefix == "mds repaired") {
+     string role_str;
+     cmd_getval(cmdmap, "role", role_str);
+     mds_role_t role;
+     const auto fs_names = op->get_session()->get_allowed_fs_names();
+     r = fsmap.parse_role(role_str, &role, ss, fs_names);
+     if (r < 0) {
+       return r;
+     }
+     string_view fs_name = fsmap.get_filesystem(role.fscid).get_mds_map().get_fs_name();
+     if (!op->get_session()->fs_name_capable(fs_name, MON_CAP_W)) {
+       ss << "Permission denied.";
+       return -EPERM;
+     }
+
+     bool modified = fsmap.undamaged(role.fscid, role.rank);
+     if (modified) {
+       ss << "repaired: restoring rank " << role;
+     } else {
+       ss << "nothing to do: rank is not damaged";
+     }
+
+     r = 0;
+   } else if (prefix == "mds freeze") {
+     string who;
+     cmd_getval(cmdmap, "role_or_gid", who);
+     mds_gid_t gid = gid_from_arg(fsmap, who, ss);
+     if (gid == MDS_GID_NONE) {
+       return -EINVAL;
+     }
+
+     string_view fs_name = fsmap.fs_name_from_gid(gid);
+     if (!op->get_session()->fs_name_capable(fs_name, MON_CAP_W)) {
+       ss << "Permission denied.";
+       return -EPERM;
+     }
+
+     bool freeze = false;
+     {
+       string str;
+       cmd_getval(cmdmap, "val", str);
+       if ((r = parse_bool(str, &freeze, ss)) != 0) {
+         return r;
+       }
+     }
+
+     auto f = [freeze,gid,&ss](auto& info) {
+       if (freeze) {
+         ss << "freezing mds." << gid;
+         info.freeze();
+       } else {
+         ss << "unfreezing mds." << gid;
+         info.unfreeze();
+       }
+     };
+     fsmap.modify_daemon(gid, f);
+     r = 0;
+   } else {
+     // Not one of ours; the caller decides how to report it.
+     return -ENOSYS;
+   }
+
+   return r;
+ }
+
+ /**
+  * Run check_sub() on every subscription of a type this monitor services:
+  * "fsmap", "fsmap.user", "mdsmap" and one "mdsmap.<fscid>" per filesystem
+  * in the current FSMap.
+  */
+ void MDSMonitor::check_subs()
+ {
+   // Subscriptions may be to "mdsmap" (MDS and legacy clients),
+   // "mdsmap.<namespace>", or to "fsmap" for the full state of all
+   // filesystems. Collect every type name first.
+   vector<string> types = {
+     "fsmap",
+     "fsmap.user",
+     "mdsmap",
+   };
+
+   for (const auto& [fscid, fs] : get_fsmap()) {
+     CachedStackStringStream name;
+     *name << "mdsmap." << fscid;
+     types.push_back(string(name->strv()));
+   }
+
+   auto& subs = mon.session_map.subs;
+   for (const auto &type : types) {
+     const auto bucket = subs.find(type);
+     if (bucket == subs.end()) {
+       continue;
+     }
+     for (auto cursor = bucket->second->begin(); !cursor.end(); ) {
+       auto sub = *cursor;
+       ++cursor; // N.B. check_sub may remove sub, so advance first
+       check_sub(sub);
+     }
+   }
+ }
+
+
+ /**
+  * Service one subscription: if the subscriber is due a newer map, send it
+  * and either drop the subscription (one-time) or advance its `next` epoch.
+  *
+  * Handles the "fsmap", "fsmap.user" and "mdsmap"/"mdsmap.<fscid>" types; any
+  * other type is ignored. All lookups use a copy of the FSMap filtered by the
+  * filesystem names the session is allowed to see.
+  *
+  * @param sub  subscription to service; may be removed from the session map
+  */
+ void MDSMonitor::check_sub(Subscription *sub)
+ {
+   dout(20) << __func__ << ": " << sub->type << dendl;
+
+   // Filter a private copy up front so the rest can work on a const view.
+   FSMap filtered = get_fsmap();
+   filtered.filter(sub->session->get_allowed_fs_names());
+   const FSMap& fsmap = filtered;
+   if (sub->next > fsmap.get_epoch()) {
+     return;
+   }
+
+   if (sub->type == "fsmap") {
+     sub->session->con->send_message(new MFSMap(mon.monmap->fsid, fsmap));
+     if (!sub->onetime) {
+       sub->next = fsmap.get_epoch() + 1;
+     } else {
+       mon.session_map.remove_sub(sub);
+     }
+     return;
+   }
+
+   if (sub->type == "fsmap.user") {
+     FSMapUser user_map;
+     user_map.epoch = fsmap.get_epoch();
+     user_map.legacy_client_fscid = fsmap.get_legacy_client_fscid();
+     for (const auto& [fscid, fs] : fsmap) {
+       FSMapUser::fs_info_t& entry = user_map.filesystems[fscid];
+       entry.cid = fscid;
+       entry.name = fs.get_mds_map().fs_name;
+     }
+     sub->session->con->send_message(new MFSMapUser(mon.monmap->fsid, user_map));
+     if (!sub->onetime) {
+       sub->next = fsmap.get_epoch() + 1;
+     } else {
+       mon.session_map.remove_sub(sub);
+     }
+     return;
+   }
+
+   if (sub->type.compare(0, 6, "mdsmap") != 0) {
+     return;
+   }
+
+   const bool is_mds = sub->session->name.is_mds();
+   mds_gid_t mds_gid = MDS_GID_NONE;
+   fs_cluster_id_t fscid = FS_CLUSTER_ID_NONE;
+   if (is_mds) {
+     // What (if any) namespace are you assigned to? Matched by address; the
+     // scan does not stop early, so the last matching entry wins.
+     auto all_daemons = fsmap.get_mds_info();
+     for (const auto &entry : all_daemons) {
+       if (entry.second.addrs == sub->session->addrs) {
+         mds_gid = entry.first;
+         fscid = fsmap.fscid_from_gid(mds_gid);
+       }
+     }
+   } else {
+     // You're a client. Did you request a particular namespace?
+     if (sub->type.compare(0, 7, "mdsmap.") == 0) {
+       const auto ns_id = sub->type.substr(string("mdsmap.").size());
+       dout(10) << __func__ << ": namespace_id " << ns_id << dendl;
+       string err;
+       fscid = strict_strtoll(ns_id.c_str(), 10, &err);
+       if (!err.empty()) {
+         // The suffix is not a number: send them nothing
+         dout(1) << "Invalid client subscription '" << sub->type
+                 << "'" << dendl;
+         return;
+       }
+     } else {
+       // Unqualified request for "mdsmap": give it the one marked
+       // for use by legacy clients.
+       const auto legacy_fscid = fsmap.get_legacy_client_fscid();
+       if (legacy_fscid == FS_CLUSTER_ID_NONE) {
+         dout(1) << "Client subscribed for legacy filesystem but "
+                    "none is configured" << dendl;
+         return;
+       }
+       fscid = legacy_fscid;
+     }
+     if (!fsmap.filesystem_exists(fscid)) {
+       // Client asked for a non-existent namespace, send them nothing
+       // TODO: something more graceful for when a client has a filesystem
+       // mounted, and the filesystem is deleted. Add a "shut down you fool"
+       // flag to MMDSMap?
+       dout(1) << "Client subscribed to non-existent namespace '" <<
+         fscid << "'" << dendl;
+       return;
+     }
+   }
+   dout(10) << __func__ << ": is_mds=" << is_mds << ", fscid=" << fscid << dendl;
+
+   // Work out the effective latest epoch
+   const MDSMap *mds_map = nullptr;
+   MDSMap null_map = MDSMap::create_null_mdsmap();
+   if (fscid != FS_CLUSTER_ID_NONE) {
+     // Check the effective epoch
+     mds_map = &fsmap.get_filesystem(fscid).get_mds_map();
+   } else {
+     // For a client, we should have already dropped out
+     ceph_assert(is_mds);
+
+     auto& standbys = fsmap.get_standby_daemons();
+     const auto self = standbys.find(mds_gid);
+     if (self == standbys.end()) {
+       null_map.epoch = fsmap.get_epoch();
+     } else {
+       // For an MDS, we need to feed it an MDSMap with its own state in
+       null_map.mds_info[mds_gid] = self->second;
+       null_map.epoch = fsmap.get_standby_epochs().at(mds_gid);
+     }
+     mds_map = &null_map;
+   }
+
+   ceph_assert(mds_map != nullptr);
+   dout(10) << __func__ << " selected MDS map epoch " <<
+     mds_map->epoch << " for namespace " << fscid << " for subscriber "
+     << sub->session->name << " who wants epoch " << sub->next << dendl;
+
+   if (sub->next > mds_map->epoch) {
+     return;
+   }
+   auto msg = make_message<MMDSMap>(mon.monmap->fsid, *mds_map);
+
+   sub->session->con->send_message(msg.detach());
+   if (!sub->onetime) {
+     sub->next = mds_map->get_epoch() + 1;
+   } else {
+     mon.session_map.remove_sub(sub);
+   }
+ }
+
+
+ /**
+  * Record `metadata` for daemon `gid` in pending_metadata and queue the whole
+  * pending_metadata table into the pending paxos transaction under the
+  * "last_metadata" key. Empty metadata is logged and ignored.
+  *
+  * @param gid       daemon the metadata belongs to
+  * @param metadata  key/value metadata reported for that daemon
+  */
+ void MDSMonitor::update_metadata(mds_gid_t gid,
+                                  const map<string, string>& metadata)
+ {
+   dout(20) << __func__ << ": mds." << gid << ": " << metadata << dendl;
+   if (metadata.empty()) {
+     dout(5) << __func__ << ": mds." << gid << ": no metadata!" << dendl;
+     return;
+   }
+   pending_metadata[gid] = metadata;
+
+   MonitorDBStore::TransactionRef txn = paxos.get_pending_transaction();
+   bufferlist encoded;
+   encode(pending_metadata, encoded);
+   txn->put(MDS_METADATA_PREFIX, "last_metadata", encoded);
+ }
+
+ /**
+  * Drop pending_metadata entries whose gid is no longer present in `fsmap`.
+  * If anything was dropped, the updated table is written to transaction `t`
+  * under the "last_metadata" key; otherwise `t` is left untouched.
+  *
+  * @param fsmap  map used to decide which gids still exist
+  * @param t      transaction that receives the updated table
+  */
+ void MDSMonitor::remove_from_metadata(const FSMap &fsmap, MonitorDBStore::TransactionRef t)
+ {
+   bool pruned = false;
+   auto it = pending_metadata.begin();
+   while (it != pending_metadata.end()) {
+     if (fsmap.gid_exists(it->first)) {
+       ++it;
+       continue;
+     }
+     it = pending_metadata.erase(it);
+     pruned = true;
+   }
+
+   if (pruned) {
+     bufferlist encoded;
+     encode(pending_metadata, encoded);
+     t->put(MDS_METADATA_PREFIX, "last_metadata", encoded);
+   }
+ }
+
+ /**
+  * Read the "last_metadata" record from the monitor store and decode it
+  * into `m`.
+  *
+  * @param m  receives the decoded gid -> metadata table
+  * @return 0 on success, otherwise the non-zero result of the store read
+  *         (in which case `m` is not modified)
+  */
+ int MDSMonitor::load_metadata(map<mds_gid_t, Metadata>& m)
+ {
+   bufferlist stored;
+   if (int r = mon.store->get(MDS_METADATA_PREFIX, "last_metadata", stored); r) {
+     dout(5) << "Unable to load 'last_metadata'" << dendl;
+     return r;
+   }
+
+   auto cursor = stored.cbegin();
+   ceph::decode(m, cursor);
+   return 0;
+ }
+
+ /**
+  * Tally the stored daemon metadata by the value of `field`.
+  *
+  * Each daemon adds one to the bucket named after its value for `field`;
+  * daemons lacking the field are counted under "unknown". The result of
+  * load_metadata() is not checked, so a failed load simply adds nothing.
+  *
+  * @param field  metadata key to group by
+  * @param out    counters to increment (existing entries are kept)
+  */
+ void MDSMonitor::count_metadata(const string &field, map<string,int> *out)
+ {
+   map<mds_gid_t,Metadata> stored;
+   load_metadata(stored);
+   for (auto& [gid, md] : stored) {
+     const auto hit = md.find(field);
+     if (hit != md.end()) {
+       ++(*out)[hit->second];
+     } else {
+       ++(*out)["unknown"];
+     }
+   }
+ }
+
+ /**
+  * Emit the per-value tally of metadata `field` as an object section named
+  * after the field, with one integer entry per distinct value.
+  *
+  * @param field  metadata key to group by
+  * @param f      formatter to write into
+  */
+ void MDSMonitor::count_metadata(const string &field, Formatter *f)
+ {
+   map<string,int> tally;
+   count_metadata(field, &tally);
+
+   f->open_object_section(field.c_str());
+   for (auto& [value, count] : tally) {
+     f->dump_int(value.c_str(), count);
+   }
+   f->close_section();
+ }
+
+ /**
+  * Group MDS daemons by their "ceph_version_short" metadata value.
+  *
+  * For every stored metadata record carrying that key, "mds.<name>" is
+  * appended to versions[<version>]. Records without the key are skipped.
+  *
+  * @param versions  version string -> list of daemon labels (appended to)
+  */
+ void MDSMonitor::get_versions(map<string, list<string> > &versions)
+ {
+   map<mds_gid_t,Metadata> meta;
+   load_metadata(meta);
+   const auto &fsmap = get_fsmap();
+   // Local copy; operator[] below may insert into it for a gid that has
+   // metadata but no entry here, and the label then uses that inserted
+   // entry's name.
+   map<mds_gid_t, mds_info_t> daemons = fsmap.get_mds_info();
+   dout(10) << __func__ << " mds meta=" << meta << dendl;
+   for (auto& [gid, md] : meta) {
+     const auto ver = md.find("ceph_version_short");
+     if (ver == md.end()) {
+       continue;
+     }
+     versions[ver->second].push_back(string("mds.") + daemons[gid].name);
+   }
+ }
+
+ /**
+  * Dump the stored metadata of one MDS as string fields into `f`.
+  *
+  * @param fsmap  map used to resolve `who`
+  * @param who    role, GID or daemon name
+  * @param f      formatter to write into; must be non-null (asserted)
+  * @param err    receives error text from resolution or a failed load
+  * @return 0 on success, -EINVAL when `who` does not resolve, -ENOENT when
+  *         no metadata is stored for the daemon, or the load_metadata() error
+  */
+ int MDSMonitor::dump_metadata(const FSMap& fsmap, const string &who,
+                               Formatter *f, ostream& err)
+ {
+   ceph_assert(f);
+
+   const mds_gid_t gid = gid_from_arg(fsmap, who, err);
+   if (gid == MDS_GID_NONE) {
+     return -EINVAL;
+   }
+
+   map<mds_gid_t, Metadata> metadata;
+   if (int r = load_metadata(metadata)) {
+     err << "Unable to load 'last_metadata'";
+     return r;
+   }
+
+   const auto record = metadata.find(gid);
+   if (record == metadata.end()) {
+     return -ENOENT;
+   }
+   for (const auto& [key, value] : record->second) {
+     f->dump_string(key.c_str(), value);
+   }
+   return 0;
+ }
+
+ /**
+  * Dump MDS daemon names grouped by the "hostname" from their stored
+  * metadata, via dump_services() under the "mds" service type.
+  *
+  * Daemons without a hostname entry, or whose gid is absent from the
+  * current FSMap, are left out.
+  *
+  * @param f  formatter to write into; must be non-null (asserted)
+  * @return 0 on success, or the load_metadata() error
+  */
+ int MDSMonitor::print_nodes(Formatter *f)
+ {
+   ceph_assert(f);
+
+   const auto &fsmap = get_fsmap();
+
+   map<mds_gid_t, Metadata> metadata;
+   if (int r = load_metadata(metadata)) {
+     return r;
+   }
+
+   map<string, list<string> > by_host; // hostname => mds names
+   for (const auto& [gid, md] : metadata) {
+     const auto host = md.find("hostname");
+     if (host == md.end()) {
+       // not likely though
+       continue;
+     }
+     if (!fsmap.gid_exists(gid)) {
+       dout(5) << __func__ << ": GID " << gid << " not existent" << dendl;
+       continue;
+     }
+     by_host[host->second].push_back(fsmap.get_info_gid(gid).name);
+   }
+
+   dump_services(f, by_host, "mds");
+   return 0;
+ }
+
+/**
+ * If a cluster is undersized (with respect to max_mds), then
+ * attempt to find daemons to grow it. If the cluster is oversized
+ * (with respect to max_mds) then shrink it by stopping its highest rank.
+ */
+bool MDSMonitor::maybe_resize_cluster(FSMap &fsmap, const Filesystem& fs)
+{
+ auto fscid = fs.get_fscid();
+ auto& mds_map = fs.get_mds_map();
+
+ int in = mds_map.get_num_in_mds();
+ int max = mds_map.get_max_mds();
+
+ dout(20) << __func__ << " in " << in << " max " << max << dendl;
+
+ /* Check that both the current epoch mds_map is resizeable as well as the
+ * current batch of changes in pending. This is important if an MDS is
+ * becoming active in the next epoch.
+ */
+ if (!get_fsmap().filesystem_exists(fscid) ||
+ !get_fsmap().get_filesystem(fscid).get_mds_map().is_resizeable() ||
+ !mds_map.is_resizeable()) {
+ dout(5) << __func__ << " mds_map is not currently resizeable" << dendl;
+ return false;
+ }
+
+ if (in < max && !mds_map.test_flag(CEPH_MDSMAP_NOT_JOINABLE)) {
+ mds_rank_t mds = mds_rank_t(0);
+ while (mds_map.is_in(mds)) {
+ mds++;
+ }
+ auto info = fsmap.find_replacement_for({fscid, mds});
+ if (!info) {
+ return false;
+ }
+
+ dout(1) << "assigned standby " << info->addrs
+ << " as mds." << mds << dendl;
+ mon.clog->info() << info->human_name() << " assigned to "
+ "filesystem " << mds_map.fs_name << " as rank "
+ << mds << " (now has " << mds_map.get_num_in_mds() + 1
+ << " ranks)";
+ fsmap.promote(info->global_id, fscid, mds);
+ return true;
+ } else if (in > max) {
+ mds_rank_t target = in - 1;
+ const auto &info = mds_map.get_info(target);
+ if (mds_map.is_active(target)) {
+ dout(1) << "stopping " << target << dendl;
+ mon.clog->info() << "stopping " << info.human_name();
+ auto f = [](auto& info) {
+ info.state = MDSMap::STATE_STOPPING;
+ };
+ fsmap.modify_daemon(info.global_id, f);
+ return true;
+ } else {
+ dout(20) << "skipping stop of " << target << dendl;
+ return false;
+ }
+ }
+
+ return false;
+}
+
+
+/**
+ * Fail a daemon and replace it with a suitable standby.
+ */
+bool MDSMonitor::drop_mds(FSMap &fsmap, mds_gid_t gid, const mds_info_t* rep_info, bool *osd_propose)
+{
+ ceph_assert(osd_propose != nullptr);
+
+ const auto fscid = fsmap.fscid_from_gid(gid);
+ const auto& info = fsmap.get_info_gid(gid);
+ const auto rank = info.rank;
+ const auto state = info.state;
+
+ if (info.is_frozen()) {
+ return false;
+ } else if (state == MDSMap::STATE_STANDBY_REPLAY ||
+ state == MDSMap::STATE_STANDBY) {
+ dout(1) << " failing and removing standby " << gid << " " << info.addrs
+ << " mds." << rank
+ << "." << info.inc << " " << ceph_mds_state_name(state)
+ << dendl;
+ *osd_propose |= fail_mds_gid(fsmap, gid);
+ return true;
+ } else if (rank >= 0 && rep_info) {
+ auto& fs = fsmap.get_filesystem(fscid);
+ if (fs.get_mds_map().test_flag(CEPH_MDSMAP_NOT_JOINABLE)) {
+ return false;
+ }
+ // are we in?
+ // and is there a non-laggy standby that can take over for us?
+ dout(1) << " replacing " << gid << " " << info.addrs
+ << " mds." << rank << "." << info.inc
+ << " " << ceph_mds_state_name(state)
+ << " with " << rep_info->global_id << "/" << rep_info->name << " " << rep_info->addrs
+ << dendl;
+
+ mon.clog->warn() << "Replacing " << info.human_name()
+ << " as rank " << rank
+ << " with standby " << rep_info->human_name();
+
+ // Remove the old one
+ *osd_propose |= fail_mds_gid(fsmap, gid);
+
+ // Promote the replacement
+ fsmap.promote(rep_info->global_id, fscid, rank);
+
+ return true;
+ }
+ return false;
+}
+
+bool MDSMonitor::check_health(FSMap& fsmap, bool* propose_osdmap)
+{
+ bool do_propose = false;
+ const auto now = mono_clock::now();
+ const bool osdmap_writeable = mon.osdmon()->is_writeable();
+ const auto mds_beacon_grace = g_conf().get_val<double>("mds_beacon_grace");
+ const auto mds_beacon_interval = g_conf().get_val<double>("mds_beacon_interval");
+
+ if (mono_clock::is_zero(last_tick)) {
+ last_tick = now;
+ }
+
+ {
+ auto since_last = std::chrono::duration<double>(now-last_tick);
+
+ if (since_last.count() > (mds_beacon_grace-mds_beacon_interval)) {
+ // This case handles either local slowness (calls being delayed
+ // for whatever reason) or cluster election slowness (a long gap
+ // between calls while an election happened)
+ dout(1) << __func__ << ": resetting beacon timeouts due to mon delay "
+ "(slow election?) of " << since_last.count() << " seconds" << dendl;
+ for (auto& p : last_beacon) {
+ p.second.stamp = now;
+ }
+ }
+ }
+
+ // make sure last_beacon is fully populated
+ for ([[maybe_unused]] const auto& [gid, fscid] : fsmap.get_mds_roles()) {
+ last_beacon.emplace(std::piecewise_construct,
+ std::forward_as_tuple(gid),
+ std::forward_as_tuple(now, 0));
+ }
+
+ // We will only take decisive action (replacing/removing a daemon)
+ // if we have some indication that some other daemon(s) are successfully
+ // getting beacons through recently.
+ mono_time latest_beacon = mono_clock::zero();
+ for (const auto& p : last_beacon) {
+ latest_beacon = std::max(p.second.stamp, latest_beacon);
+ }
+ auto since = std::chrono::duration<double>(now-latest_beacon);
+ const bool may_replace = since.count() <
+ std::max(g_conf()->mds_beacon_interval, g_conf()->mds_beacon_grace * 0.5);
+
+ // check beacon timestamps
+ vector<mds_gid_t> to_remove;
+ const bool mon_down = mon.is_mon_down();
+ const auto mds_beacon_mon_down_grace =
+ g_conf().get_val<std::chrono::seconds>("mds_beacon_mon_down_grace");
+ const auto quorum_age = std::chrono::seconds(mon.quorum_age());
+ const bool new_quorum = quorum_age < mds_beacon_mon_down_grace;
+ for (auto it = last_beacon.begin(); it != last_beacon.end(); ) {
+ auto& [gid, beacon_info] = *it;
+ auto since_last = std::chrono::duration<double>(now-beacon_info.stamp);
+
+ if (!fsmap.gid_exists(gid)) {
+ // gid no longer exists, remove from tracked beacons
+ it = last_beacon.erase(it);
+ continue;
+ }
+
+ if (since_last.count() >= g_conf()->mds_beacon_grace) {
+ auto& info = fsmap.get_info_gid(gid);
+ dout(1) << "no beacon from mds." << info.rank << "." << info.inc
+ << " (gid: " << gid << " addr: " << info.addrs
+ << " state: " << ceph_mds_state_name(info.state) << ")"
+ << " since " << since_last.count() << dendl;
+ if ((mon_down || new_quorum) && since_last < mds_beacon_mon_down_grace) {
+ /* The MDS may be sending beacons to a monitor not yet in quorum or
+ * temporarily partitioned. Hold off on removal for a little longer...
+ */
+ dout(10) << "deferring removal for mds_beacon_mon_down_grace during MON_DOWN" << dendl;
+ ++it;
+ continue;
+ }
+ // If the OSDMap is writeable, we can blocklist things, so we can
+ // try failing any laggy MDS daemons. Consider each one for failure.
+ if (!info.laggy()) {
+ dout(1) << " marking " << gid << " " << info.addrs
+ << " mds." << info.rank << "." << info.inc
+ << " " << ceph_mds_state_name(info.state)
+ << " laggy" << dendl;
+ fsmap.modify_daemon(info.global_id, [](auto& info) {
+ info.laggy_since = ceph_clock_now();
+ });
+ do_propose = true;
+ }
+ if (osdmap_writeable && may_replace) {
+ to_remove.push_back(gid); // drop_mds may invalidate iterator
+ }
+ }
+
+ ++it;
+ }
+
+ for (const auto& gid : to_remove) {
+ auto info = fsmap.get_info_gid(gid);
+ const mds_info_t* rep_info = nullptr;
+ if (info.rank >= 0) {
+ auto fscid = fsmap.fscid_from_gid(gid);
+ rep_info = fsmap.find_replacement_for({fscid, info.rank});
+ }
+ bool dropped = drop_mds(fsmap, gid, rep_info, propose_osdmap);
+ if (dropped) {
+ mon.clog->info() << "MDS " << info.human_name()
+ << " is removed because it is dead or otherwise unavailable.";
+ do_propose = true;
+ }
+ }
+
+ if (osdmap_writeable) {
+ for (auto& [fscid, fs] : std::as_const(fsmap)) {
+ if (!fs.get_mds_map().test_flag(CEPH_MDSMAP_NOT_JOINABLE) &&
+ fs.get_mds_map().is_resizeable()) {
+ // Check if a rank or standby-replay should be replaced with a stronger
+ // affinity standby. This looks at ranks and standby-replay:
+ for (const auto& [gid, info] : fs.get_mds_map().get_mds_info()) {
+ const auto join_fscid = info.join_fscid;
+ if (join_fscid == fscid)
+ continue;
+ const auto rank = info.rank;
+ const auto state = info.state;
+ const mds_info_t* rep_info = nullptr;
+ if (state == MDSMap::STATE_STANDBY_REPLAY) {
+ rep_info = fsmap.get_available_standby(fs);
+ } else if (state == MDSMap::STATE_ACTIVE) {
+ rep_info = fsmap.find_replacement_for({fscid, rank});
+ } else {
+ /* N.B. !is_degraded() */
+ ceph_abort_msg("invalid state in MDSMap");
+ }
+ if (!rep_info) {
+ break;
+ }
+ bool better_affinity = false;
+ if (join_fscid == FS_CLUSTER_ID_NONE) {
+ better_affinity = (rep_info->join_fscid == fscid);
+ } else {
+ better_affinity = (rep_info->join_fscid == fscid) ||
+ (rep_info->join_fscid == FS_CLUSTER_ID_NONE);
+ }
+ if (better_affinity) {
+ if (state == MDSMap::STATE_STANDBY_REPLAY) {
+ mon.clog->info() << "Dropping low affinity standby-replay "
+ << info.human_name()
+ << " in favor of higher affinity standby.";
+ *propose_osdmap |= fail_mds_gid(fsmap, gid);
+ /* Now let maybe_promote_standby do the promotion. */
+ } else {
+ mon.clog->info() << "Dropping low affinity active "
+ << info.human_name()
+ << " in favor of higher affinity standby.";
+ do_propose |= drop_mds(fsmap, gid, rep_info, propose_osdmap);
+ }
+ break; /* don't replace more than one per tick per fs */
+ }
+ }
+ }
+ }
+ }
+ return do_propose;
+}
+
+/**
+ * Fill vacancies in one filesystem from the available standby daemons.
+ *
+ * Failed ranks are handed to replacement daemons first. Afterwards, when the
+ * MDSMap is resizeable and allows standby-replay, remaining standbys are
+ * attached as standby-replay followers of followable ranks.
+ *
+ * @param fsmap  FSMap that receives the promotions/assignments
+ * @param fs     filesystem being examined
+ * @return true if fsmap was changed and should be proposed
+ */
+bool MDSMonitor::maybe_promote_standby(FSMap &fsmap, const Filesystem& fs)
+{
+  const auto& mdsmap = fs.get_mds_map();
+  if (mdsmap.test_flag(CEPH_MDSMAP_NOT_JOINABLE)) {
+    return false;
+  }
+
+  bool modified = false;
+
+  // Step 1: have a standby take over each failed rank, when one exists.
+  set<mds_rank_t> failed_ranks;
+  mdsmap.get_failed_mds_set(failed_ranks);
+  for (const auto& rank : failed_ranks) {
+    auto replacement = fsmap.find_replacement_for({fs.get_fscid(), rank});
+    if (!replacement) {
+      continue;
+    }
+    dout(1) << " taking over failed mds." << rank << " with " << replacement->global_id
+            << "/" << replacement->name << " " << replacement->addrs << dendl;
+    mon.clog->info() << "Standby " << replacement->human_name()
+                     << " assigned to filesystem " << mdsmap.fs_name
+                     << " as rank " << rank;
+
+    fsmap.promote(replacement->global_id, fs.get_fscid(), rank);
+    modified = true;
+  }
+
+  // Step 2: use any available standbys as standby-replay daemons. This is
+  // not done when the cluster is degraded, as a standby-replay daemon may
+  // try to read a journal being migrated.
+  if (!mdsmap.is_resizeable() || !mdsmap.allows_standby_replay()) {
+    return modified;
+  }
+
+  while (true) {
+    auto standby = fsmap.get_available_standby(fs);
+    if (!standby) {
+      break;
+    }
+    dout(20) << "standby available mds." << standby->global_id << dendl;
+
+    // Attach this standby to the first followable rank; stop the outer
+    // loop once no rank can take another follower.
+    bool assigned = false;
+    for (const auto& rank : mdsmap.in) {
+      dout(20) << "examining " << rank << dendl;
+      if (!mdsmap.is_followable(rank)) {
+        continue;
+      }
+      dout(1) << " setting mds." << standby->global_id
+              << " to follow mds rank " << rank << dendl;
+      fsmap.assign_standby_replay(standby->global_id, fs.get_fscid(), rank);
+      modified = true;
+      assigned = true;
+      break;
+    }
+    if (!assigned) {
+      break;
+    }
+  }
+
+  return modified;
+}
+
+/*
+ * Periodic maintenance entry point.
+ *
+ * First refreshes the in-memory fsmap history: prune it, then reload older
+ * stored epochs that are still within mon_fsmap_prune_threshold. The rest
+ * only runs when is_active() and is_leader() both hold: optionally check
+ * whether the oldest committed FSMap still uses an old struct encoding, run
+ * the health/affinity checks, resize clusters, promote standbys, and finally
+ * propose the pending fsmap (and request an OSDMap proposal) if anything
+ * asked for it.
+ */
+void MDSMonitor::tick()
+{
+ {
+ auto _history_prune_time = g_conf().get_val<std::chrono::seconds>("mon_fsmap_prune_threshold");
+ set_fsmap_history_threshold(_history_prune_time);
+ dout(20) << _history_prune_time << dendl;
+ prune_fsmap_history();
+ auto& history = get_fsmap_history();
+ auto now = real_clock::now();
+ if (auto it = history.begin(); it != history.end()) {
+ auto start = it->second.get_epoch();
+ dout(20) << "oldest epoch in history is " << start << dendl;
+ // Walk backwards from the first history entry, loading earlier versions
+ // until one is missing (-ENOENT) or older than the prune threshold.
+ for (;;) {
+ --start;
+ bufferlist bl;
+ FSMap fsmaph;
+ int err = get_version(start, bl);
+ if (err == -ENOENT) {
+ break;
+ }
+ ceph_assert(err == 0);
+ ceph_assert(bl.length());
+ fsmaph.decode(bl);
+ auto btime = fsmaph.get_btime();
+ auto since = std::chrono::duration_cast<std::chrono::milliseconds>(now - btime);
+ dout(20) << "loaded epoch " << fsmaph.get_epoch() << " which is " << since << " old" << dendl;
+ if (since <= _history_prune_time) {
+ put_fsmap_history(fsmaph);
+ } else {
+ break;
+ }
+ }
+ }
+ }
+
+ // Everything below works on the writeable pending fsmap.
+ if (!is_active() || !is_leader()) return;
+
+ auto &pending = get_pending_fsmap_writeable();
+
+ /* batch any changes to pending with any changes to osdmap */
+ paxos.plug();
+
+ bool do_propose = false;
+ bool propose_osdmap = false;
+
+ if (check_fsmap_struct_version) {
+ /* Allow time for trimming otherwise PaxosService::is_writeable will always
+ * be false.
+ */
+
+ auto now = clock::now();
+ auto elapsed = now - last_fsmap_struct_flush;
+ // Re-examine the first committed version at most once every 30 seconds.
+ if (elapsed > std::chrono::seconds(30)) {
+ FSMap fsmap;
+ bufferlist bl;
+ auto v = get_first_committed();
+ int err = get_version(v, bl);
+ if (err) {
+ derr << "could not get version " << v << dendl;
+ ceph_abort();
+ }
+ try {
+ fsmap.decode(bl);
+ } catch (const ceph::buffer::malformed_input& e) {
+ dout(5) << "flushing old fsmap struct because unable to decode FSMap: " << e.what() << dendl;
+ }
+ /* N.B. FSMap::is_struct_old is also true for undecoded (failed to decode) FSMap */
+ if (fsmap.is_struct_old()) {
+ dout(5) << "fsmap struct is too old; proposing to flush out old versions" << dendl;
+ do_propose = true;
+ last_fsmap_struct_flush = now;
+ } else {
+ dout(20) << "struct is recent" << dendl;
+ // Stop checking on later ticks once the oldest version is recent.
+ check_fsmap_struct_version = false;
+ }
+ }
+ }
+
+ do_propose |= pending.check_health();
+
+ /* Check health and affinity of ranks */
+ do_propose |= check_health(pending, &propose_osdmap);
+
+ /* Resize the cluster according to max_mds. */
+ for ([[maybe_unused]] const auto& [fscid, fs] : std::as_const(pending)) {
+ do_propose |= maybe_resize_cluster(pending, fs);
+ }
+
+ /* Replace any failed ranks. */
+ for ([[maybe_unused]] auto& [fscid, fs] : std::as_const(pending)) {
+ do_propose |= maybe_promote_standby(pending, fs);
+ }
+
+ if (propose_osdmap) {
+ request_proposal(mon.osdmon());
+ }
+
+ /* allow MDSMonitor::propose_pending() to push the proposal through */
+ paxos.unplug();
+
+ if (do_propose) {
+ propose_pending();
+ }
+
+ // Only reached on the active-leader path (see the early return above).
+ last_tick = mono_clock::now();
+}
+
+// Forward the monitor, paxos instance and service name to PaxosService, then
+// fill `handlers` from FileSystemCommandHandler::load().
+MDSMonitor::MDSMonitor(Monitor &mn, Paxos &p, string service_name)
+ : PaxosService(mn, p, service_name)
+{
+ handlers = FileSystemCommandHandler::load(&p);
+}
+
+void MDSMonitor::on_restart()
+{
+  // Drop the leader-specific bookkeeping: forget every tracked beacon and
+  // restart the tick clock from the present moment.
+  last_beacon.clear();
+  last_tick = mono_clock::now();
+}
+
diff --git a/src/mon/MDSMonitor.h b/src/mon/MDSMonitor.h
new file mode 100644
index 000000000..b0f88cd31
--- /dev/null
+++ b/src/mon/MDSMonitor.h
@@ -0,0 +1,162 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+/* Metadata Server Monitor
+ */
+
+#ifndef CEPH_MDSMONITOR_H
+#define CEPH_MDSMONITOR_H
+
+#include <map>
+#include <set>
+#include <vector>
+
+#include "include/types.h"
+#include "PaxosFSMap.h"
+#include "PaxosService.h"
+#include "msg/Messenger.h"
+#include "messages/MMDSBeacon.h"
+#include "CommandHandler.h"
+
+class FileSystemCommandHandler;
+
+/*
+ * Metadata Server monitor service.
+ *
+ * Derives from PaxosService and PaxosFSMap, plus (protected) CommandHandler.
+ * Besides the PaxosService overrides it keeps per-gid beacon records, pending
+ * per-daemon health and metadata, and the bookkeeping that tick() relies on.
+ */
+class MDSMonitor : public PaxosService, public PaxosFSMap, protected CommandHandler {
+ public:
+ // Clock/time-point types used for last_fsmap_struct_flush below.
+ using clock = ceph::coarse_mono_clock;
+ using time = ceph::coarse_mono_time;
+
+ MDSMonitor(Monitor &mn, Paxos &p, std::string service_name);
+
+ // service methods
+ void create_initial() override;
+ void get_store_prefixes(std::set<std::string>& s) const override;
+ void update_from_paxos(bool *need_bootstrap) override;
+ void init() override;
+ void create_pending() override;
+ void encode_pending(MonitorDBStore::TransactionRef t) override;
+ // we don't require full versions; don't encode any.
+ void encode_full(MonitorDBStore::TransactionRef t) override { }
+ version_t get_trim_to() const override;
+
+ bool preprocess_query(MonOpRequestRef op) override; // true if processed.
+ bool prepare_update(MonOpRequestRef op) override;
+ bool should_propose(double& delay) override;
+ bool has_health_warnings(std::vector<mds_metric_t> warnings);
+
+ // True when the current FSMap holds at least one filesystem or at least
+ // one standby daemon.
+ bool should_print_status() const {
+ auto& fs = get_fsmap();
+ auto fs_count = fs.filesystem_count();
+ auto standby_count = fs.get_num_standby();
+ return fs_count > 0 || standby_count > 0;
+ }
+
+ void on_active() override;
+ void on_restart() override;
+
+ void check_subs();
+ void check_sub(Subscription *sub);
+
+ void dump_info(ceph::Formatter *f);
+ int print_nodes(ceph::Formatter *f);
+
+ /**
+ * Return true if a blocklist was done (i.e. OSD propose needed)
+ */
+ bool fail_mds_gid(FSMap &fsmap, mds_gid_t gid);
+
+ // Leadership is delegated to the owning Monitor.
+ bool is_leader() const override { return mon.is_leader(); }
+
+ protected:
+ using mds_info_t = MDSMap::mds_info_t;
+
+ // my helpers
+ template<int dblV = 7>
+ void print_map(const FSMap &m);
+
+ void _updated(MonOpRequestRef op);
+
+ void _note_beacon(class MMDSBeacon *m);
+ bool preprocess_beacon(MonOpRequestRef op);
+ bool prepare_beacon(MonOpRequestRef op);
+
+ bool preprocess_offload_targets(MonOpRequestRef op);
+ bool prepare_offload_targets(MonOpRequestRef op);
+
+ int fail_mds(FSMap &fsmap, std::ostream &ss,
+ const std::string &arg, mds_info_t *failed_info);
+
+ bool preprocess_command(MonOpRequestRef op);
+ bool prepare_command(MonOpRequestRef op);
+
+ int filesystem_command(
+ FSMap &fsmap,
+ MonOpRequestRef op,
+ std::string const &prefix,
+ const cmdmap_t& cmdmap,
+ std::stringstream &ss);
+
+ // beacons
+ // One record per MDS gid: a monotonic timestamp plus a sequence number
+ // (presumably the seq of the last beacon seen; confirm in _note_beacon).
+ struct beacon_info_t {
+ ceph::mono_time stamp = ceph::mono_clock::zero();
+ uint64_t seq = 0;
+ beacon_info_t() {}
+ beacon_info_t(ceph::mono_time stamp, uint64_t seq) : stamp(stamp), seq(seq) {}
+ };
+ std::map<mds_gid_t, beacon_info_t> last_beacon;
+
+ // Filled by the constructor from FileSystemCommandHandler::load().
+ std::list<std::shared_ptr<FileSystemCommandHandler> > handlers;
+
+ // The bool-returning helpers below feed tick()'s decision to propose the
+ // pending fsmap; osd_propose is an out-flag for requesting an OSDMap
+ // proposal.
+ bool maybe_promote_standby(FSMap& fsmap, const Filesystem& fs);
+ bool maybe_resize_cluster(FSMap &fsmap, const Filesystem& fs);
+ bool drop_mds(FSMap &fsmap, mds_gid_t gid, const mds_info_t* rep_info, bool* osd_propose);
+ bool check_health(FSMap &fsmap, bool* osd_propose);
+ void tick() override; // check state, take actions
+
+ int dump_metadata(const FSMap &fsmap, const std::string &who, ceph::Formatter *f,
+ std::ostream& err);
+
+ void update_metadata(mds_gid_t gid, const Metadata& metadata);
+ void remove_from_metadata(const FSMap &fsmap, MonitorDBStore::TransactionRef t);
+ int load_metadata(std::map<mds_gid_t, Metadata>& m);
+ void count_metadata(const std::string& field, ceph::Formatter *f);
+
+ void assign_quiesce_db_leader(FSMap &fsmap);
+
+public:
+ // Forwards to the current FSMap's print_fs_summary().
+ void print_fs_summary(std::ostream& out) {
+ get_fsmap().print_fs_summary(out);
+ }
+ void count_metadata(const std::string& field, std::map<std::string,int> *out);
+ void get_versions(std::map<std::string, std::list<std::string>> &versions);
+
+protected:
+ // MDS daemon GID to latest health state from that GID
+ std::map<uint64_t, MDSHealth> pending_daemon_health;
+ std::set<uint64_t> pending_daemon_health_rm;
+
+ std::map<mds_gid_t, Metadata> pending_metadata;
+
+ mds_gid_t gid_from_arg(const FSMap &fsmap, const std::string &arg, std::ostream& err);
+
+ // When did the mon last call into our tick() method? Used for detecting
+ // when the mon was not updating us for some period (e.g. during slow
+ // election) to reset last_beacon timeouts
+ ceph::mono_time last_tick = ceph::mono_clock::zero();
+
+private:
+ // Set by tick() when it decides to propose a flush of old fsmap struct
+ // versions; tick() re-checks no more often than every 30 seconds.
+ time last_fsmap_struct_flush = clock::zero();
+ // Cleared by tick() once the first committed FSMap is found to be recent.
+ bool check_fsmap_struct_version = true;
+};
+
+#endif
diff --git a/src/mon/MgrMap.h b/src/mon/MgrMap.h
new file mode 100644
index 000000000..82f6ea880
--- /dev/null
+++ b/src/mon/MgrMap.h
@@ -0,0 +1,693 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 John Spray <john.spray@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+#ifndef MGR_MAP_H_
+#define MGR_MAP_H_
+
+#include <sstream>
+#include <set>
+
+#include "msg/msg_types.h"
+#include "include/encoding.h"
+#include "include/utime.h"
+#include "common/ceph_json.h"
+#include "common/Formatter.h"
+#include "common/ceph_releases.h"
+#include "common/version.h"
+#include "common/options.h"
+#include "common/Clock.h"
+
+
+class MgrMap
+{
+public:
+ // Description of a single option of a mgr module. type/level/flags hold
+ // raw integer values of the Option enums named in the trailing comments.
+ struct ModuleOption {
+ std::string name;
+ uint8_t type = Option::TYPE_STR; // Option::type_t TYPE_*
+ uint8_t level = Option::LEVEL_ADVANCED; // Option::level_t LEVEL_*
+ uint32_t flags = 0; // Option::flag_t FLAG_*
+ std::string default_value;
+ std::string min, max;
+ std::set<std::string> enum_allowed;
+ std::string desc, long_desc;
+ std::set<std::string> tags;
+ std::set<std::string> see_also;
+
+ // Serialize as version 1 (compat 1). The field order here is the wire
+ // format and must stay in step with decode().
+ void encode(ceph::buffer::list& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(name, bl);
+ encode(type, bl);
+ encode(level, bl);
+ encode(flags, bl);
+ encode(default_value, bl);
+ encode(min, bl);
+ encode(max, bl);
+ encode(enum_allowed, bl);
+ encode(desc, bl);
+ encode(long_desc, bl);
+ encode(tags, bl);
+ encode(see_also, bl);
+ ENCODE_FINISH(bl);
+ }
+ // Read the fields back in exactly the order encode() wrote them.
+ void decode(ceph::buffer::list::const_iterator& p) {
+ DECODE_START(1, p);
+ decode(name, p);
+ decode(type, p);
+ decode(level, p);
+ decode(flags, p);
+ decode(default_value, p);
+ decode(min, p);
+ decode(max, p);
+ decode(enum_allowed, p);
+ decode(desc, p);
+ decode(long_desc, p);
+ decode(tags, p);
+ decode(see_also, p);
+ DECODE_FINISH(p);
+ }
+ // Emit every field to the formatter; type and level are rendered as
+ // strings via Option::type_to_str / Option::level_to_str, and the three
+ // sets become arrays.
+ void dump(ceph::Formatter *f) const
+ {
+ f->dump_string("name", name);
+ f->dump_string("type", Option::type_to_str(
+ static_cast<Option::type_t>(type)));
+ f->dump_string("level", Option::level_to_str(
+ static_cast<Option::level_t>(level)));
+ f->dump_unsigned("flags", flags);
+ f->dump_string("default_value", default_value);
+ f->dump_string("min", min);
+ f->dump_string("max", max);
+ f->open_array_section("enum_allowed");
+ for (auto& i : enum_allowed) {
+ f->dump_string("value", i);
+ }
+ f->close_section();
+ f->dump_string("desc", desc);
+ f->dump_string("long_desc", long_desc);
+ f->open_array_section("tags");
+ for (auto& i : tags) {
+ f->dump_string("tag", i);
+ }
+ f->close_section();
+ f->open_array_section("see_also");
+ for (auto& i : see_also) {
+ f->dump_string("option", i);
+ }
+ f->close_section();
+ }
+ // Append two heap-allocated instances to ls: a default-constructed one
+ // and one with every field populated.
+ static void generate_test_instances(std::list<ModuleOption*>& ls)
+ {
+ ls.push_back(new ModuleOption);
+ ls.push_back(new ModuleOption);
+ ls.back()->name = "name";
+ ls.back()->type = Option::TYPE_STR;
+ ls.back()->level = Option::LEVEL_ADVANCED;
+ ls.back()->flags = Option::FLAG_RUNTIME;
+ ls.back()->default_value = "default_value";
+ ls.back()->min = "min";
+ ls.back()->max = "max";
+ ls.back()->enum_allowed.insert("enum_allowed");
+ ls.back()->desc = "desc";
+ ls.back()->long_desc = "long_desc";
+ ls.back()->tags.insert("tag");
+ ls.back()->see_also.insert("see_also");
+ }
+ };
+
+  /**
+   * Description of one mgr module: its name, whether it can run (with an
+   * explanatory error string), and the options it exposes.
+   */
+  class ModuleInfo
+  {
+    public:
+    std::string name;
+    bool can_run = true;
+    std::string error_string;
+    std::map<std::string,ModuleOption> module_options;
+
+    // We do not include the module's `failed` field in the beacon,
+    // because it is exposed via health checks.
+    /// Serialize as version 2 (compat 1); module_options was added in v2.
+    void encode(ceph::buffer::list &bl) const {
+      ENCODE_START(2, 1, bl);
+      encode(name, bl);
+      encode(can_run, bl);
+      encode(error_string, bl);
+      encode(module_options, bl);
+      ENCODE_FINISH(bl);
+    }
+
+    /// Decode v1 or v2 payloads; module_options is only present from v2 on.
+    void decode(ceph::buffer::list::const_iterator &bl) {
+      // Declare v2 as the newest layout this decoder understands (it reads
+      // the v2 field below), matching ENCODE_START(2, 1, ...) in encode().
+      DECODE_START(2, bl);
+      decode(name, bl);
+      decode(can_run, bl);
+      decode(error_string, bl);
+      if (struct_v >= 2) {
+        decode(module_options, bl);
+      }
+      DECODE_FINISH(bl);
+    }
+
+    /// Equality considers only name and can_run; error_string and
+    /// module_options are not compared.
+    bool operator==(const ModuleInfo &rhs) const
+    {
+      return (name == rhs.name) && (can_run == rhs.can_run);
+    }
+
+    /// Emit this module as a "module" object with a nested "module_options"
+    /// object keyed by option name.
+    void dump(ceph::Formatter *f) const
+    {
+      f->open_object_section("module");
+      f->dump_string("name", name);
+      f->dump_bool("can_run", can_run);
+      f->dump_string("error_string", error_string);
+      f->open_object_section("module_options");
+      for (auto& i : module_options) {
+        f->dump_object(i.first.c_str(), i.second);
+      }
+      f->close_section();
+      f->close_section();
+    }
+
+    /// Append a default-constructed instance and a populated one to ls.
+    static void generate_test_instances(std::list<ModuleInfo*>& ls)
+    {
+      ls.push_back(new ModuleInfo);
+      ls.push_back(new ModuleInfo);
+      ls.back()->name = "name";
+      ls.back()->can_run = true;
+      ls.back()->error_string = "error_string";
+      ls.back()->module_options["module_option"] = ModuleOption();
+    }
+  };
+
+  /**
+   * Record for one standby mgr daemon: gid, name, the modules it reports as
+   * available, and its feature bits.
+   */
+  class StandbyInfo
+  {
+  public:
+    uint64_t gid = 0;
+    std::string name;
+    std::vector<ModuleInfo> available_modules;
+    uint64_t mgr_features = 0;
+
+    StandbyInfo(uint64_t gid_, const std::string &name_,
+                const std::vector<ModuleInfo>& am,
+                uint64_t feat)
+      : gid(gid_), name(name_), available_modules(am),
+        mgr_features(feat)
+    {}
+
+    StandbyInfo() {}
+
+    /// Serialize as version 4 (compat 1). The name-only module set is still
+    /// written ahead of the full ModuleInfo list so that the field layout
+    /// introduced in version 2 is preserved.
+    void encode(ceph::buffer::list& bl) const
+    {
+      ENCODE_START(4, 1, bl);
+      encode(gid, bl);
+      encode(name, bl);
+      std::set<std::string> old_available_modules;
+      for (const auto &i : available_modules) {
+        old_available_modules.insert(i.name);
+      }
+      encode(old_available_modules, bl);  // version 2
+      encode(available_modules, bl);      // version 3
+      encode(mgr_features, bl);           // v4
+      ENCODE_FINISH(bl);
+    }
+
+    /// Decode versions 1 through 4. For version 2 payloads the ModuleInfo
+    /// entries are rebuilt from the module names alone.
+    void decode(ceph::buffer::list::const_iterator& p)
+    {
+      DECODE_START(4, p);
+      decode(gid, p);
+      decode(name, p);
+      if (struct_v >= 2) {
+        std::set<std::string> old_available_modules;
+        decode(old_available_modules, p);
+        if (struct_v < 3) {
+          // Loop variable deliberately not called `name`, which would
+          // shadow the member decoded just above.
+          for (const auto &module_name : old_available_modules) {
+            MgrMap::ModuleInfo info;
+            info.name = module_name;
+            available_modules.push_back(std::move(info));
+          }
+        }
+      }
+      if (struct_v >= 3) {
+        decode(available_modules, p);
+      }
+      if (struct_v >= 4) {
+        decode(mgr_features, p);
+      }
+      DECODE_FINISH(p);
+    }
+
+    /// Emit gid, name, available_modules and mgr_features to the formatter.
+    void dump(ceph::Formatter *f) const
+    {
+      f->dump_unsigned("gid", gid);
+      f->dump_string("name", name);
+      encode_json("available_modules", available_modules, f);
+      f->dump_unsigned("mgr_features", mgr_features);
+    }
+
+    /// Append three simple instances (gids 1-3, names "a"-"c") to ls.
+    static void generate_test_instances(std::list<StandbyInfo*>& ls)
+    {
+      ls.push_back(new StandbyInfo(1, "a", {}, 0));
+      ls.push_back(new StandbyInfo(2, "b", {}, 0));
+      ls.push_back(new StandbyInfo(3, "c", {}, 0));
+    }
+
+    /// @return true if a module with this name is in available_modules.
+    bool have_module(const std::string &module_name) const
+    {
+      // Capture by reference: the lambda never outlives this call, so
+      // copying the string into the closure is unnecessary.
+      auto it = std::find_if(available_modules.begin(),
+                             available_modules.end(),
+                             [&module_name](const ModuleInfo &m) -> bool {
+                               return m.name == module_name;
+                             });
+
+      return it != available_modules.end();
+    }
+  };
+
+ // Map version, returned by get_epoch(); create_null_mgrmap() sets it to the
+ // maximum representable value.
+ epoch_t epoch = 0;
+ epoch_t last_failure_osd_epoch = 0;
+
+
+ // Bit values for `flags`.
+ static const uint64_t FLAG_DOWN = (1<<0);
+ uint64_t flags = 0;
+
+ /// global_id of the ceph-mgr instance selected as a leader
+ uint64_t active_gid = 0;
+ /// server address reported by the leader once it is active
+ entity_addrvec_t active_addrs;
+ /// whether the nominated leader is active (i.e. has initialized its server)
+ bool available = false;
+ /// the name (foo in mgr.<foo>) of the active daemon
+ std::string active_name;
+ /// when the active mgr became active, or we lost the active mgr
+ utime_t active_change;
+ /// features
+ uint64_t active_mgr_features = 0;
+
+ // Client name -> addresses. On the wire this is carried as two parallel
+ // vectors (see encode()/decode()).
+ std::multimap<std::string, entity_addrvec_t> clients; // for blocklist
+
+ // Standby daemons; the key is presumably the daemon's gid (TODO confirm
+ // against the code that inserts entries).
+ std::map<uint64_t, StandbyInfo> standbys;
+
+ // Modules which are enabled
+ std::set<std::string> modules;
+
+ // Modules which should always be enabled. A manager daemon will enable
+ // modules from the union of this set and the `modules` set above, latest
+ // active version.
+ std::map<uint32_t, std::set<std::string>> always_on_modules;
+
+ // Modules which are reported to exist
+ std::vector<ModuleInfo> available_modules;
+
+ // Map of module name to URI, indicating services exposed by
+ // running modules on the active mgr daemon.
+ std::map<std::string, std::string> services;
+
+  /// Build an otherwise default map whose epoch is the maximum
+  /// representable value.
+  static MgrMap create_null_mgrmap() {
+    MgrMap placeholder;
+    // The largest epoch is always bigger than whatever the mgr has.
+    placeholder.epoch = std::numeric_limits<epoch_t>::max();
+    return placeholder;
+  }
+
+ // Read-only accessors for the corresponding data members.
+ epoch_t get_epoch() const { return epoch; }
+ epoch_t get_last_failure_osd_epoch() const { return last_failure_osd_epoch; }
+ const entity_addrvec_t& get_active_addrs() const { return active_addrs; }
+ uint64_t get_active_gid() const { return active_gid; }
+ bool get_available() const { return available; }
+ const std::string &get_active_name() const { return active_name; }
+ const utime_t& get_active_change() const { return active_change; }
+ // Number of entries in `standbys` (converted to int).
+ int get_num_standby() const { return standbys.size(); }
+
+  /**
+   * Check whether the module is listed both in this map's available_modules
+   * and in every standby's available_modules.
+   *
+   * Declared const (like any_supports_module) because it only reads state,
+   * so it can also be called on a const MgrMap.
+   *
+   * @param module  module name to look for
+   * @return false as soon as one list lacks the module, true otherwise
+   */
+  bool all_support_module(const std::string& module) const {
+    if (!have_module(module)) {
+      return false;
+    }
+    for (const auto& p : standbys) {
+      if (!p.second.have_module(module)) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  /// @return true if available_modules contains a module with this name.
+  bool have_module(const std::string &module_name) const
+  {
+    return std::any_of(available_modules.begin(), available_modules.end(),
+                       [&module_name](const ModuleInfo &candidate) {
+                         return candidate.name == module_name;
+                       });
+  }
+
+  /// @return pointer to the first available module with this name, or
+  ///         nullptr when there is none.
+  const ModuleInfo *get_module_info(const std::string &module_name) const {
+    const auto match = std::find_if(
+      available_modules.begin(), available_modules.end(),
+      [&module_name](const ModuleInfo &candidate) {
+        return candidate.name == module_name;
+      });
+    if (match == available_modules.end()) {
+      return nullptr;
+    }
+    return &*match;
+  }
+
+  /**
+   * Report whether the named module can run.
+   *
+   * @param module_name  module to look up in available_modules
+   * @param error        receives the module's error_string when it is found
+   * @return the module's can_run flag
+   * @throws std::logic_error if no such module exists
+   */
+  bool can_run_module(const std::string &module_name, std::string *error) const
+  {
+    const ModuleInfo *found = get_module_info(module_name);
+    if (found == nullptr) {
+      std::ostringstream msg;
+      msg << "Module '" << module_name << "' does not exist";
+      throw std::logic_error(msg.str());
+    }
+
+    *error = found->error_string;
+    return found->can_run;
+  }
+
+  /// @return true if the module is a member of the enabled `modules` set.
+  bool module_enabled(const std::string& module_name) const
+  {
+    return modules.count(module_name) != 0;
+  }
+
+  /// @return true if the module appears in this map's available_modules or
+  ///         in at least one standby's available_modules.
+  bool any_supports_module(const std::string& module) const {
+    if (have_module(module)) {
+      return true;
+    }
+    return std::any_of(standbys.begin(), standbys.end(),
+                       [&module](const auto& entry) {
+                         return entry.second.have_module(module);
+                       });
+  }
+
+  /// @return true if `name` is the active daemon's name or the name of any
+  ///         standby.
+  bool have_name(const std::string& name) const {
+    if (active_name == name) {
+      return true;
+    }
+    return std::any_of(standbys.begin(), standbys.end(),
+                       [&name](const auto& entry) {
+                         return entry.second.name == name;
+                       });
+  }
+
+  /// Collect the active daemon's name (when non-empty) together with the
+  /// names of all standbys.
+  std::set<std::string> get_all_names() const {
+    std::set<std::string> names;
+    if (!active_name.empty()) {
+      names.insert(active_name);
+    }
+    for (const auto& entry : standbys) {
+      names.insert(entry.second.name);
+    }
+    return names;
+  }
+
+  /**
+   * Look up the always-on module set for the running release.
+   *
+   * An exact entry for ceph_release() wins. Otherwise the entry with the
+   * highest key is used, provided that key is lower than the running
+   * release; in every other case the result is empty.
+   */
+  std::set<std::string> get_always_on_modules() const {
+    unsigned rnum = to_integer<uint32_t>(ceph_release());
+
+    auto exact = always_on_modules.find(rnum);
+    if (exact != always_on_modules.end()) {
+      return exact->second;
+    }
+
+    // ok, try the most recent release
+    if (always_on_modules.empty()) {
+      return {}; // ugh
+    }
+    const auto& newest = *always_on_modules.rbegin();
+    if (newest.first < rnum) {
+      return newest.second;
+    }
+    return {}; // wth
+  }
+
+ // Serialize the map. Peers without the SERVER_NAUTILUS feature get the
+ // version 5 layout (a single legacy address plus a name-only module set);
+ // everyone else gets version 13 (compat 6). The order of the encode()
+ // calls is the wire format and must stay in step with decode().
+ void encode(ceph::buffer::list& bl, uint64_t features) const
+ {
+ if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
+ ENCODE_START(5, 1, bl);
+ encode(epoch, bl);
+ encode(active_addrs.legacy_addr(), bl, features);
+ encode(active_gid, bl);
+ encode(available, bl);
+ encode(active_name, bl);
+ encode(standbys, bl);
+ encode(modules, bl);
+
+ // Pre-version 4 string list of available modules
+ // (replaced by direct encode of ModuleInfo below)
+ std::set<std::string> old_available_modules;
+ for (const auto &i : available_modules) {
+ old_available_modules.insert(i.name);
+ }
+ encode(old_available_modules, bl);
+
+ encode(services, bl);
+ encode(available_modules, bl);
+ ENCODE_FINISH(bl);
+ return;
+ }
+ ENCODE_START(13, 6, bl);
+ encode(epoch, bl);
+ encode(active_addrs, bl, features);
+ encode(active_gid, bl);
+ encode(available, bl);
+ encode(active_name, bl);
+ encode(standbys, bl);
+ encode(modules, bl);
+ encode(services, bl);
+ encode(available_modules, bl);
+ encode(active_change, bl);
+ encode(always_on_modules, bl);
+ encode(active_mgr_features, bl);
+ encode(last_failure_osd_epoch, bl);
+ // Split the clients multimap into two parallel vectors (names, addrs).
+ std::vector<std::string> clients_names;
+ std::vector<entity_addrvec_t> clients_addrs;
+ for (const auto& i : clients) {
+ clients_names.push_back(i.first);
+ clients_addrs.push_back(i.second);
+ }
+ // The address vector needs to be encoded first to produce a
+ // backwards compatible message for older monitors.
+ encode(clients_addrs, bl, features);
+ encode(clients_names, bl, features);
+ encode(flags, bl);
+ ENCODE_FINISH(bl);
+ return;
+ }
+
+ // Deserialize any struct version up to 13. Fields are read in encoding
+ // order; each `struct_v >= N` guard marks the version from which that
+ // field is present in the payload.
+ // Throws ceph::buffer::malformed_input when the client name and address
+ // vectors differ in length.
+ void decode(ceph::buffer::list::const_iterator& p)
+ {
+ DECODE_START(13, p);
+ decode(epoch, p);
+ decode(active_addrs, p);
+ decode(active_gid, p);
+ decode(available, p);
+ decode(active_name, p);
+ decode(standbys, p);
+ if (struct_v >= 2) {
+ decode(modules, p);
+
+ if (struct_v < 6) {
+ // Reconstitute ModuleInfos from names
+ std::set<std::string> module_name_list;
+ decode(module_name_list, p);
+ // Only need to unpack this field if we won't have the full
+ // MgrMap::ModuleInfo structures added in v4
+ if (struct_v < 4) {
+ for (const auto &i : module_name_list) {
+ MgrMap::ModuleInfo info;
+ info.name = i;
+ available_modules.push_back(std::move(info));
+ }
+ }
+ }
+ }
+ if (struct_v >= 3) {
+ decode(services, p);
+ }
+ if (struct_v >= 4) {
+ decode(available_modules, p);
+ }
+ if (struct_v >= 7) {
+ decode(active_change, p);
+ } else {
+ active_change = {};
+ }
+ if (struct_v >= 8) {
+ decode(always_on_modules, p);
+ }
+ if (struct_v >= 9) {
+ decode(active_mgr_features, p);
+ }
+ if (struct_v >= 10) {
+ decode(last_failure_osd_epoch, p);
+ }
+ if (struct_v >= 11) {
+ std::vector<entity_addrvec_t> clients_addrs;
+ decode(clients_addrs, p);
+ clients.clear();
+ if (struct_v >= 12) {
+ // v12 adds a parallel vector of names; zip the two back into the
+ // multimap.
+ std::vector<std::string> clients_names;
+ decode(clients_names, p);
+ if (clients_names.size() != clients_addrs.size()) {
+ throw ceph::buffer::malformed_input(
+ "clients_names.size() != clients_addrs.size()");
+ }
+ auto cn = clients_names.begin();
+ auto ca = clients_addrs.begin();
+ for(; cn != clients_names.end(); ++cn, ++ca) {
+ clients.emplace(*cn, *ca);
+ }
+ } else {
+ // v11 carries addresses only; store them under an empty name.
+ for (const auto& i : clients_addrs) {
+ clients.emplace("", i);
+ }
+ }
+ }
+ if (struct_v >= 13) {
+ decode(flags, p);
+ }
+ DECODE_FINISH(p);
+ }
+
+ // Emit the full map to the formatter: scalar fields first, then the
+ // standbys, enabled modules, available modules, services, always-on
+ // modules (one array per release name) and active clients.
+ void dump(ceph::Formatter *f) const
+ {
+ f->dump_int("epoch", epoch);
+ f->dump_int("flags", flags);
+ f->dump_int("active_gid", get_active_gid());
+ f->dump_string("active_name", get_active_name());
+ f->dump_object("active_addrs", active_addrs);
+ // Legacy string form of the same addresses, emitted next to the object.
+ f->dump_stream("active_addr") << active_addrs.get_legacy_str();
+ f->dump_stream("active_change") << active_change;
+ f->dump_unsigned("active_mgr_features", active_mgr_features);
+ f->dump_bool("available", available);
+ f->open_array_section("standbys");
+ for (const auto &i : standbys) {
+ f->open_object_section("standby");
+ f->dump_int("gid", i.second.gid);
+ f->dump_string("name", i.second.name);
+ f->dump_unsigned("mgr_features", i.second.mgr_features);
+ f->open_array_section("available_modules");
+ for (const auto& j : i.second.available_modules) {
+ j.dump(f);
+ }
+ f->close_section();
+ f->close_section();
+ }
+ f->close_section();
+ f->open_array_section("modules");
+ for (auto& i : modules) {
+ f->dump_string("module", i);
+ }
+ f->close_section();
+ f->open_array_section("available_modules");
+ for (const auto& j : available_modules) {
+ j.dump(f);
+ }
+ f->close_section();
+
+ f->open_object_section("services");
+ for (const auto &i : services) {
+ f->dump_string(i.first.c_str(), i.second);
+ }
+ f->close_section();
+
+ f->open_object_section("always_on_modules");
+ for (auto& v : always_on_modules) {
+ f->open_array_section(ceph_release_name(v.first));
+ for (auto& m : v.second) {
+ f->dump_string("module", m);
+ }
+ f->close_section();
+ }
+ f->close_section(); // always_on_modules
+ f->dump_int("last_failure_osd_epoch", last_failure_osd_epoch);
+ f->open_array_section("active_clients");
+ for (const auto& i : clients) {
+ f->open_object_section("client");
+ f->dump_string("name", i.first);
+ i.second.dump(f);
+ f->close_section();
+ }
+ f->close_section(); // active_clients
+ }
+
+ // Append a single default-constructed, heap-allocated MgrMap to l.
+ static void generate_test_instances(std::list<MgrMap*> &l)
+ {
+ l.push_back(new MgrMap);
+ }
+
+  /**
+   * Write a short status summary to exactly one of the two sinks.
+   *
+   * @param f   structured output target, or nullptr
+   * @param ss  plain-text output target, or nullptr
+   *
+   * Asserts that exactly one of f and ss is non-null.
+   */
+  void print_summary(ceph::Formatter *f, std::ostream *ss) const
+  {
+    // One or the other, not both
+    ceph_assert((ss != nullptr) != (f != nullptr));
+
+    if (f != nullptr) {
+      f->dump_bool("available", available);
+      f->dump_int("num_standbys", standbys.size());
+      f->open_array_section("modules");
+      for (auto& enabled : modules) {
+        f->dump_string("module", enabled);
+      }
+      f->close_section();
+      f->open_object_section("services");
+      for (const auto &service : services) {
+        f->dump_string(service.first.c_str(), service.second);
+      }
+      f->close_section();
+      return;
+    }
+
+    // Plain-text form: "<name>(active[, starting][, since ...])" or
+    // "no daemons active [...]", followed by the standby names.
+    utime_t now = ceph_clock_now();
+    if (get_active_gid() == 0) {
+      *ss << "no daemons active";
+      if (active_change) {
+        *ss << " (since " << utimespan_str(now - active_change) << ")";
+      }
+    } else {
+      *ss << get_active_name();
+      // If the daemon hasn't gone active yet, indicate that.
+      *ss << (available ? "(active" : "(active, starting");
+      if (active_change) {
+        *ss << ", since " << utimespan_str(now - active_change);
+      }
+      *ss << ")";
+    }
+
+    if (!standbys.empty()) {
+      *ss << ", standbys: ";
+      const char *separator = "";
+      for (const auto &entry : standbys) {
+        *ss << separator << entry.second.name;
+        separator = ", ";
+      }
+    }
+  }
+
+ // Stream the plain-text summary produced by print_summary().
+ friend std::ostream& operator<<(std::ostream& out, const MgrMap& m) {
+ std::ostringstream ss;
+ m.print_summary(nullptr, &ss);
+ return out << ss.str();
+ }
+
+ // Stream the module names, each followed by a single space (so the output
+ // ends with a trailing space when the vector is non-empty).
+ friend std::ostream& operator<<(std::ostream& out, const std::vector<ModuleInfo>& mi) {
+ for (const auto &i : mi) {
+ out << i.name << " ";
+ }
+ return out;
+ }
+};
+
+WRITE_CLASS_ENCODER_FEATURES(MgrMap)
+WRITE_CLASS_ENCODER(MgrMap::StandbyInfo)
+WRITE_CLASS_ENCODER(MgrMap::ModuleInfo);
+WRITE_CLASS_ENCODER(MgrMap::ModuleOption);
+
+#endif
+
diff --git a/src/mon/MgrMonitor.cc b/src/mon/MgrMonitor.cc
new file mode 100644
index 000000000..3c3786572
--- /dev/null
+++ b/src/mon/MgrMonitor.cc
@@ -0,0 +1,1499 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 John Spray <john.spray@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+#include <boost/tokenizer.hpp>
+
+#include "messages/MMgrBeacon.h"
+#include "messages/MMgrMap.h"
+#include "messages/MMgrDigest.h"
+
+#include "include/stringify.h"
+#include "mgr/MgrContext.h"
+#include "mgr/mgr_commands.h"
+#include "OSDMonitor.h"
+#include "ConfigMonitor.h"
+#include "HealthMonitor.h"
+
+#include "common/TextTable.h"
+#include "include/stringify.h"
+
+#include "MgrMonitor.h"
+
+#define MGR_METADATA_PREFIX "mgr_metadata"
+
+#define dout_subsys ceph_subsys_mon
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, mon, map)
+using namespace TOPNSPC::common;
+
+using std::dec;
+using std::hex;
+using std::list;
+using std::map;
+using std::make_pair;
+using std::ostream;
+using std::ostringstream;
+using std::pair;
+using std::set;
+using std::string;
+using std::stringstream;
+using std::to_string;
+using std::vector;
+
+using ceph::bufferlist;
+using ceph::decode;
+using ceph::encode;
+using ceph::ErasureCodeInterfaceRef;
+using ceph::ErasureCodeProfile;
+using ceph::Formatter;
+using ceph::JSONFormatter;
+using ceph::make_message;
+using ceph::mono_clock;
+using ceph::mono_time;
+
+// Log prefix hooked in through the dout_prefix macro above: writes
+// "mon.<name>@<rank>(<state>).mgr e<epoch> " to *_dout and returns it.
+static ostream& _prefix(std::ostream *_dout, Monitor &mon,
+                        const MgrMap& mgrmap) {
+  ostream& out = *_dout;
+  out << "mon." << mon.name << "@" << mon.rank;
+  out << "(" << mon.get_state_name() << ").mgr e" << mgrmap.get_epoch() << " ";
+  return out;
+}
+
+// the system treats always_on_modules as if they provide built-in functionality
+// by ensuring that they are always enabled.
+//
+// The table is keyed by release constant (CEPH_RELEASE_*); each value is the
+// set of module names that are always on for that release. All five entries
+// currently list the same ten modules.
+//
+// It is copied into pending_map.always_on_modules by create_initial(), and
+// on_active() re-proposes the map whenever the pending copy differs from it.
+const static std::map<uint32_t, std::set<std::string>> always_on_modules = {
+ {
+ CEPH_RELEASE_OCTOPUS, {
+ "crash",
+ "status",
+ "progress",
+ "balancer",
+ "devicehealth",
+ "orchestrator",
+ "rbd_support",
+ "volumes",
+ "pg_autoscaler",
+ "telemetry",
+ }
+ },
+ {
+ CEPH_RELEASE_PACIFIC, {
+ "crash",
+ "status",
+ "progress",
+ "balancer",
+ "devicehealth",
+ "orchestrator",
+ "rbd_support",
+ "volumes",
+ "pg_autoscaler",
+ "telemetry",
+ }
+ },
+ {
+ CEPH_RELEASE_QUINCY, {
+ "crash",
+ "status",
+ "progress",
+ "balancer",
+ "devicehealth",
+ "orchestrator",
+ "rbd_support",
+ "volumes",
+ "pg_autoscaler",
+ "telemetry",
+ }
+ },
+ {
+ CEPH_RELEASE_REEF, {
+ "crash",
+ "status",
+ "progress",
+ "balancer",
+ "devicehealth",
+ "orchestrator",
+ "rbd_support",
+ "volumes",
+ "pg_autoscaler",
+ "telemetry",
+ }
+ },
+ {
+ CEPH_RELEASE_SQUID, {
+ "crash",
+ "status",
+ "progress",
+ "balancer",
+ "devicehealth",
+ "orchestrator",
+ "rbd_support",
+ "volumes",
+ "pg_autoscaler",
+ "telemetry",
+ }
+ },
+};
+
+// Prefix for mon store of active mgr's command descriptions.
+// The encoded list is written under (command_descs_prefix, "") by
+// encode_pending() and read back with the same key by update_from_paxos().
+const static std::string command_descs_prefix = "mgr_command_descs";
+
+// Look up the Option registered for a mgr module option name.
+//
+// Two spellings are accepted:
+//   "mgr/$module/$option"             (global)
+//   "mgr/$module/$instance/$option"   (localized)
+// The localized form is normalized to the global one by removing the
+// $instance component before the lookup in mgr_module_options.
+//
+// Returns nullptr if the name does not start with "mgr/", has no second
+// slash, or no such option is registered.
+const Option *MgrMonitor::find_module_option(const string& name)
+{
+  if (name.compare(0, 4, "mgr/") != 0) {
+    return nullptr;
+  }
+
+  const auto module_end = name.find('/', 5);
+  if (module_end == std::string::npos) {
+    return nullptr;
+  }
+
+  // A third slash means there is an $instance between the second and third.
+  const auto instance_end = name.find('/', module_end + 1);
+  const string lookup_key = (instance_end == std::string::npos)
+    ? name
+    : name.substr(0, module_end) + name.substr(instance_end);
+
+  auto found = mgr_module_options.find(lookup_key);
+  if (found == mgr_module_options.end()) {
+    return nullptr;
+  }
+  return &found->second;
+}
+
+// Version up to which old mgrmap epochs may be trimmed: keep the newest
+// mon_max_mgrmap_epochs epochs, or return 0 while there are not yet more
+// than that many.
+version_t MgrMonitor::get_trim_to() const
+{
+  const int64_t keep = g_conf().get_val<int64_t>("mon_max_mgrmap_epochs");
+  if (!(map.epoch > keep)) {
+    return 0;
+  }
+  return map.epoch - keep;
+}
+
+// Seed pending_map for a brand-new cluster: enabled modules come from the
+// mgr_initial_modules option, always-on modules from the static table, and
+// the pending command descriptions from the built-in mgr_commands list.
+void MgrMonitor::create_initial()
+{
+  // The tokenizer iterates over its argument, so keep our own copy of the
+  // option value alive for the duration of the loop.
+  auto module_list = g_conf().get_val<std::string>("mgr_initial_modules");
+  boost::tokenizer<> tokens(module_list);
+  for (auto it = tokens.begin(); it != tokens.end(); ++it) {
+    pending_map.modules.insert(*it);
+  }
+
+  pending_map.always_on_modules = always_on_modules;
+  pending_command_descs = mgr_commands;
+
+  dout(10) << __func__ << " initial modules " << pending_map.modules
+           << ", always on modules " << pending_map.get_always_on_modules()
+           << ", " << pending_command_descs.size() << " commands"
+           << dendl;
+}
+
+// Add every mon-store prefix this service writes under: the service's own
+// prefix, the command-description prefix and the mgr metadata prefix.
+void MgrMonitor::get_store_prefixes(std::set<string>& s) const
+{
+  s.emplace(service_name);
+  s.emplace(command_descs_prefix);
+  s.emplace(MGR_METADATA_PREFIX);
+}
+
+// Refresh in-memory state after a paxos commit.
+//
+// If the last committed version differs from map.epoch, the map is re-decoded
+// and the state derived from it is refreshed (ever_had_active_mgr, health,
+// first_seen_inactive, subscriptions, and possibly command_descs).
+// Independently of that, the mgr module option table is rebuilt from
+// map.available_modules on every call, ConfigMonitor is told to reload, and
+// the local MgrClient is fed the map unless the monitor is still initializing.
+//
+// need_bootstrap is not touched by this implementation.
+void MgrMonitor::update_from_paxos(bool *need_bootstrap)
+{
+ version_t version = get_last_committed();
+ if (version != map.epoch) {
+ dout(4) << "loading version " << version << dendl;
+
+ bufferlist bl;
+ int err = get_version(version, bl);
+ ceph_assert(err == 0);
+
+ // Remember the pre-update state so we can detect a mgr becoming
+ // available or the active gid changing.
+ bool old_available = map.get_available();
+ uint64_t old_gid = map.get_active_gid();
+
+ auto p = bl.cbegin();
+ map.decode(p);
+
+ dout(4) << "active server: " << map.active_addrs
+ << "(" << map.active_gid << ")" << dendl;
+
+ ever_had_active_mgr = get_value("ever_had_active_mgr");
+
+ load_health();
+
+ // first_seen_inactive is cleared while a mgr is available and (re)stamped
+ // with the current time on every update where none is.
+ if (map.available) {
+ first_seen_inactive = utime_t();
+ } else {
+ first_seen_inactive = ceph_clock_now();
+ }
+
+ check_subs();
+
+ // Reload command descriptions on the first version, when we have none
+ // cached, or when the (possibly new) active mgr has just become available.
+ if (version == 1
+ || command_descs.empty()
+ || (map.get_available()
+ && (!old_available || old_gid != map.get_active_gid()))) {
+ dout(4) << "mkfs or daemon transitioned to available, loading commands"
+ << dendl;
+ bufferlist loaded_commands;
+ int r = mon.store->get(command_descs_prefix, "", loaded_commands);
+ if (r < 0) {
+ derr << "Failed to load mgr commands: " << cpp_strerror(r) << dendl;
+ } else {
+ // note: this `p` shadows the map iterator declared above
+ auto p = loaded_commands.cbegin();
+ decode(command_descs, p);
+ }
+ }
+ }
+
+ // populate module options
+ mgr_module_options.clear();
+ misc_option_strings.clear();
+ for (auto& i : map.available_modules) {
+ for (auto& j : i.module_options) {
+ // options are registered under the global name "mgr/<module>/<option>"
+ string name = string("mgr/") + i.name + "/" + j.second.name;
+ auto p = mgr_module_options.emplace(
+ name,
+ Option(name, static_cast<Option::type_t>(j.second.type),
+ static_cast<Option::level_t>(j.second.level)));
+ Option& opt = p.first->second;
+ opt.set_flags(static_cast<Option::flag_t>(j.second.flags));
+ opt.set_flag(Option::FLAG_MGR);
+ opt.set_description(j.second.desc.c_str());
+ opt.set_long_description(j.second.long_desc.c_str());
+ for (auto& k : j.second.tags) {
+ opt.add_tag(k.c_str());
+ }
+ for (auto& k : j.second.see_also) {
+ if (i.module_options.count(k)) {
+ // it's another module option
+ // NOTE(review): the qualified name is parked in misc_option_strings,
+ // presumably because Option keeps the raw pointer -- confirm.
+ misc_option_strings.push_back(string("mgr/") + i.name + "/" + k);
+ opt.add_see_also(misc_option_strings.back().c_str());
+ } else {
+ // it's a native option
+ opt.add_see_also(k.c_str());
+ }
+ }
+ // NOTE(review): a zero/false result from parse_value() is treated as
+ // success here; values that fail to parse are silently skipped.
+ Option::value_t v, v2;
+ std::string err;
+ if (j.second.default_value.size() &&
+ !opt.parse_value(j.second.default_value, &v, &err)) {
+ opt.set_default(v);
+ }
+ // min/max are only applied when both are present and both parse
+ if (j.second.min.size() &&
+ j.second.max.size() &&
+ !opt.parse_value(j.second.min, &v, &err) &&
+ !opt.parse_value(j.second.max, &v2, &err)) {
+ opt.set_min_max(v, v2);
+ }
+ std::vector<const char *> enum_allowed;
+ for (auto& k : j.second.enum_allowed) {
+ enum_allowed.push_back(k.c_str());
+ }
+ opt.set_enum_allowed(enum_allowed);
+ }
+ }
+ // force ConfigMonitor to refresh, since it uses const Option *
+ // pointers into our mgr_module_options (which we just rebuilt).
+ mon.configmon()->load_config();
+
+ if (!mon.is_init()) {
+ // feed our pet MgrClient, unless we are in Monitor::[pre]init()
+ prime_mgr_client();
+ }
+}
+
+// Hand the current committed map to the monitor's own MgrClient by
+// dispatching a locally built MMgrMap message to it.
+void MgrMonitor::prime_mgr_client()
+{
+  dout(10) << __func__ << dendl;
+  auto mgrmap_msg = make_message<MMgrMap>(map);
+  mon.mgr_client.ms_dispatch2(mgrmap_msg);
+}
+
+// Start a new pending map as a copy of the committed one, one epoch later.
+void MgrMonitor::create_pending()
+{
+  pending_map = map;
+  ++pending_map.epoch;
+}
+
+// Decide how loudly to complain about the absence of an active mgr.
+//
+// Returns HEALTH_OK when no warning is due, HEALTH_WARN once the cluster has
+// OSDs and mon_mgr_mkfs_grace has elapsed since the monmap was created, and
+// HEALTH_ERR if additionally the mgr has been seen inactive for longer than
+// mon_mgr_inactive_grace.
+health_status_t MgrMonitor::should_warn_about_mgr_down()
+{
+  utime_t now = ceph_clock_now();
+
+  // We warn only if we have OSDs AND we've exceeded the grace period, which
+  // means a new mon cluster can be HEALTH_OK indefinitely as long as no OSDs
+  // are ever created.
+  if (!(mon.osdmon()->osdmap.get_num_osds() > 0)) {
+    return HEALTH_OK;
+  }
+  if (!(now > mon.monmap->created + g_conf().get_val<int64_t>("mon_mgr_mkfs_grace"))) {
+    return HEALTH_OK;
+  }
+
+  // A default-constructed first_seen_inactive means "not currently inactive".
+  const bool inactive_too_long =
+    first_seen_inactive != utime_t() &&
+    now - first_seen_inactive > g_conf().get_val<int64_t>("mon_mgr_inactive_grace");
+  return inactive_too_long ? HEALTH_ERR : HEALTH_WARN;
+}
+
+// After any paxos update: if a digest timer is armed (i.e. we are serving
+// digest subscribers), compare every paxos service's health checks with the
+// snapshot taken last time and push fresh digests when anything changed or
+// when no snapshot existed yet.
+void MgrMonitor::post_paxos_update()
+{
+  // are we handling digest subscribers?
+  if (!digest_event) {
+    return;
+  }
+
+  bool changed = false;
+  if (prev_health_checks.empty()) {
+    // no snapshot yet: size it and force a send
+    prev_health_checks.resize(mon.paxos_service.size());
+    changed = true;
+  }
+  ceph_assert(prev_health_checks.size() == mon.paxos_service.size());
+
+  for (auto idx = 0u; idx < prev_health_checks.size(); idx++) {
+    const auto& current = mon.paxos_service[idx]->get_health_checks();
+    changed = changed || (current != prev_health_checks[idx]);
+    prev_health_checks[idx] = current;
+  }
+
+  if (!changed) {
+    return;
+  }
+  if (is_active()) {
+    send_digests();
+    return;
+  }
+  // Not active yet: drop the timer and send once we become active.
+  cancel_timer();
+  wait_for_active_ctx(new C_MonContext{&mon, [this](int) {
+    send_digests();
+  }});
+}
+
+// Serialize all pending state into transaction `t`:
+//  - the pending map, stored under its epoch (also recorded as last committed)
+//  - queued metadata writes and removals (both queues are then cleared)
+//  - the health checks (MGR_DOWN when there is no active mgr and a warning
+//    is due) or, when there is an active mgr, the ever_had_active_mgr marker
+//  - the pending command descriptions, if any (then cleared)
+void MgrMonitor::encode_pending(MonitorDBStore::TransactionRef t)
+{
+  dout(10) << __func__ << " " << pending_map << dendl;
+
+  bufferlist map_bl;
+  pending_map.encode(map_bl, mon.get_quorum_con_features());
+  put_version(t, pending_map.epoch, map_bl);
+  put_last_committed(t, pending_map.epoch);
+
+  for (auto it = pending_metadata.begin(); it != pending_metadata.end(); ++it) {
+    dout(10) << __func__ << " set metadata for " << it->first << dendl;
+    t->put(MGR_METADATA_PREFIX, it->first, it->second);
+  }
+  for (auto it = pending_metadata_rm.begin(); it != pending_metadata_rm.end(); ++it) {
+    dout(10) << __func__ << " rm metadata for " << *it << dendl;
+    t->erase(MGR_METADATA_PREFIX, *it);
+  }
+  pending_metadata.clear();
+  pending_metadata_rm.clear();
+
+  health_check_map_t next;
+  if (pending_map.active_gid != 0) {
+    put_value(t, "ever_had_active_mgr", 1);
+  } else {
+    auto level = should_warn_about_mgr_down();
+    if (level == HEALTH_OK) {
+      dout(10) << __func__ << " no health warning (never active and new cluster)"
+               << dendl;
+    } else {
+      next.add("MGR_DOWN", level, "no active mgr", 0);
+    }
+  }
+  encode_health(next, t);
+
+  if (!pending_command_descs.empty()) {
+    dout(4) << __func__ << " encoding " << pending_command_descs.size()
+            << " command_descs" << dendl;
+    // every stored description is tagged as a mgr command
+    for (auto& desc : pending_command_descs) {
+      desc.set_flag(MonCommand::FLAG_MGR);
+    }
+    bufferlist descs_bl;
+    encode(pending_command_descs, descs_bl);
+    t->put(command_descs_prefix, "", descs_bl);
+    pending_command_descs.clear();
+  }
+}
+
+// Authorize an op: it must have a session holding the "mgr" execute
+// capability, and `fsid` must match this cluster's fsid.
+// Returns true only when all of these hold; failures other than a missing
+// session are logged at level 1.
+bool MgrMonitor::check_caps(MonOpRequestRef op, const uuid_d& fsid)
+{
+  MonSession *session = op->get_session();
+  if (session == nullptr) {
+    return false;
+  }
+
+  const bool may_execute = session->is_capable("mgr", MON_CAP_X);
+  if (!may_execute) {
+    dout(1) << __func__ << " insufficient caps " << session->caps << dendl;
+    return false;
+  }
+
+  if (fsid != mon.monmap->fsid) {
+    dout(1) << __func__ << " op fsid " << fsid
+            << " != " << mon.monmap->fsid << dendl;
+    return false;
+  }
+
+  return true;
+}
+
+// Dispatch an incoming message to its preprocess handler by message type.
+// A bad_cmd_get thrown while preprocessing a command is answered with
+// -EINVAL. Unknown message types are logged and reported as handled (true).
+bool MgrMonitor::preprocess_query(MonOpRequestRef op)
+{
+  auto m = op->get_req<PaxosServiceMessage>();
+  const auto msg_type = m->get_type();
+
+  if (msg_type == MSG_MGR_BEACON) {
+    return preprocess_beacon(op);
+  }
+
+  if (msg_type == MSG_MON_COMMAND) {
+    try {
+      return preprocess_command(op);
+    } catch (const bad_cmd_get& e) {
+      bufferlist empty_bl;
+      mon.reply_command(op, -EINVAL, e.what(), empty_bl, get_last_committed());
+      return true;
+    }
+  }
+
+  mon.no_reply(op);
+  derr << "Unhandled message type " << msg_type << dendl;
+  return true;
+}
+
+// Dispatch an incoming message to its prepare handler by message type.
+// The result tells the caller whether there is something to propose; a
+// bad_cmd_get from a command (answered with -EINVAL) and unknown message
+// types both yield false.
+bool MgrMonitor::prepare_update(MonOpRequestRef op)
+{
+  auto m = op->get_req<PaxosServiceMessage>();
+  const auto msg_type = m->get_type();
+
+  if (msg_type == MSG_MGR_BEACON) {
+    return prepare_beacon(op);
+  }
+
+  if (msg_type == MSG_MON_COMMAND) {
+    try {
+      return prepare_command(op);
+    } catch (const bad_cmd_get& e) {
+      bufferlist empty_bl;
+      mon.reply_command(op, -EINVAL, e.what(), empty_bl, get_last_committed());
+      return false; /* nothing to propose! */
+    }
+  }
+
+  mon.no_reply(op);
+  derr << "Unhandled message type " << msg_type << dendl;
+  return false; /* nothing to propose! */
+}
+
+
+
+// Completion for a beacon-triggered proposal (see prepare_beacon()).
+//   r >= 0          : nothing further to do
+//   r == -ECANCELED : mark the op as needing no reply
+//   any other error : push the op through MgrMonitor::dispatch() again
+class C_Updated : public Context {
+  MgrMonitor *mm;
+  MonOpRequestRef op;
+public:
+  C_Updated(MgrMonitor *monitor, MonOpRequestRef request) :
+    mm(monitor), op(request) {}
+
+  void finish(int r) override {
+    if (r >= 0) {
+      return; // Success
+    }
+    if (r == -ECANCELED) {
+      mm->mon.no_reply(op);
+      return;
+    }
+    mm->dispatch(op); // try again
+  }
+};
+
+// First-stage beacon handling. Beacons never get a reply. A beacon that
+// fails the capability/fsid check is dropped (returns true = handled);
+// every other beacon returns false so it continues on to prepare_beacon().
+bool MgrMonitor::preprocess_beacon(MonOpRequestRef op)
+{
+  auto m = op->get_req<MMgrBeacon>();
+  mon.no_reply(op); // we never reply to beacons
+  dout(4) << "beacon from " << m->get_gid() << dendl;
+
+  // Unauthorized: drop it on the floor. Otherwise always send this to the
+  // leader's prepare_beacon().
+  const bool authorized = check_caps(op, m->get_fsid());
+  return !authorized;
+}
+
+// Apply a mgr beacon to the pending map.
+//
+// Handles, in order: restart detection (same name, new gid) for the active
+// daemon and for standbys; recording the beacon time; and then one of
+//  - an update from the current active (services, addrs, availability,
+//    available modules, RADOS clients),
+//  - rejection of a daemon that claims to be available but is not the active,
+//  - selection of this daemon as the new active when there is none,
+//  - creation or refresh of a standby entry.
+//
+// Returns true if pending_map was modified (i.e. there is something to
+// propose), false otherwise.
+bool MgrMonitor::prepare_beacon(MonOpRequestRef op)
+{
+ auto m = op->get_req<MMgrBeacon>();
+ dout(4) << "beacon from " << m->get_gid() << dendl;
+
+ // Track whether we modified pending_map
+ bool updated = false;
+ // set if drop_active() plugged paxos for us; we then unplug at `out`
+ bool plugged = false;
+
+ // See if we are seeing same name, new GID for the active daemon
+ if (m->get_name() == pending_map.active_name
+ && m->get_gid() != pending_map.active_gid)
+ {
+ dout(4) << "Active daemon restart (mgr." << m->get_name() << ")" << dendl;
+ mon.clog->info() << "Active manager daemon " << m->get_name()
+ << " restarted";
+ // drop_active() asserts that osdmon is writeable, so retry this beacon
+ // later if it is not.
+ if (!mon.osdmon()->is_writeable()) {
+ dout(1) << __func__ << ": waiting for osdmon writeable to"
+ " blocklist old instance." << dendl;
+ mon.osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
+ return false;
+ }
+ plugged |= drop_active();
+ updated = true;
+ }
+
+ // See if we are seeing same name, new GID for any standbys
+ for (const auto &i : pending_map.standbys) {
+ const MgrMap::StandbyInfo &s = i.second;
+ if (s.name == m->get_name() && s.gid != m->get_gid()) {
+ dout(4) << "Standby daemon restart (mgr." << m->get_name() << ")" << dendl;
+ mon.clog->debug() << "Standby manager daemon " << m->get_name()
+ << " restarted";
+ drop_standby(i.first);
+ updated = true;
+ // drop_standby() erased from the map we are iterating; stop here
+ break;
+ }
+ }
+
+ last_beacon[m->get_gid()] = ceph::coarse_mono_clock::now();
+
+ if (pending_map.active_gid == m->get_gid()) {
+ if (pending_map.services != m->get_services()) {
+ dout(4) << "updated services from mgr." << m->get_name()
+ << ": " << m->get_services() << dendl;
+ pending_map.services = m->get_services();
+ updated = true;
+ }
+
+ // A beacon from the currently active daemon
+ if (pending_map.active_addrs != m->get_server_addrs()) {
+ dout(4) << "learned address " << m->get_server_addrs()
+ << " (was " << pending_map.active_addrs << ")" << dendl;
+ pending_map.active_addrs = m->get_server_addrs();
+ updated = true;
+ }
+
+ // availability flipped relative to what the pending map records
+ if (pending_map.get_available() != m->get_available()) {
+ dout(4) << "available " << m->get_gid() << dendl;
+ mon.clog->info() << "Manager daemon " << pending_map.active_name
+ << " is now available";
+
+ // This beacon should include command descriptions
+ pending_command_descs = m->get_command_descs();
+ if (pending_command_descs.empty()) {
+ // This should not happen, but it also isn't fatal: we just
+ // won't successfully update our list of commands.
+ dout(4) << "First available beacon from " << pending_map.active_name
+ << "(" << m->get_gid() << ") does not include command descs"
+ << dendl;
+ } else {
+ dout(4) << "First available beacon from " << pending_map.active_name
+ << "(" << m->get_gid() << ") includes "
+ << pending_command_descs.size() << " command descs" << dendl;
+ }
+
+ pending_map.available = m->get_available();
+ updated = true;
+ }
+ if (pending_map.available_modules != m->get_available_modules()) {
+ dout(4) << "available_modules " << m->get_available_modules()
+ << " (was " << pending_map.available_modules << ")" << dendl;
+ pending_map.available_modules = m->get_available_modules();
+ updated = true;
+ }
+ const auto& clients = m->get_clients();
+ if (pending_map.clients != clients) {
+ dout(4) << "active's RADOS clients " << clients
+ << " (was " << pending_map.clients << ")" << dendl;
+ pending_map.clients = clients;
+ updated = true;
+ }
+ } else if (m->get_available()) {
+ // Sender is not the active in pending_map but claims to be available:
+ // reply with a null mgrmap. Note this inner `m` shadows the beacon.
+ // NOTE(review): `goto out` skips wait_for_finished_proposal() yet still
+ // returns `updated`, which may be true from the restart handling above.
+ dout(4) << "mgr thinks it is active but it is not, dropping!" << dendl;
+ auto m = make_message<MMgrMap>(MgrMap::create_null_mgrmap());
+ mon.send_reply(op, m.detach());
+ goto out;
+ } else if (pending_map.active_gid == 0) {
+ // There is no currently active daemon, select this one.
+ if (pending_map.standbys.count(m->get_gid())) {
+ // remove the standby entry but keep its metadata (drop_meta = false)
+ drop_standby(m->get_gid(), false);
+ }
+ // with FLAG_DOWN set, no daemon is selected as active
+ if (!(pending_map.flags & MgrMap::FLAG_DOWN)) {
+ dout(4) << "selecting new active " << m->get_gid()
+ << " " << m->get_name()
+ << " (was " << pending_map.active_gid << " "
+ << pending_map.active_name << ")" << dendl;
+ pending_map.active_gid = m->get_gid();
+ pending_map.active_name = m->get_name();
+ pending_map.active_change = ceph_clock_now();
+ pending_map.active_mgr_features = m->get_mgr_features();
+ pending_map.available_modules = m->get_available_modules();
+ // queue the daemon's metadata for writing and cancel any queued removal
+ encode(m->get_metadata(), pending_metadata[m->get_name()]);
+ pending_metadata_rm.erase(m->get_name());
+
+ mon.clog->info() << "Activating manager daemon "
+ << pending_map.active_name;
+ updated = true;
+ }
+ } else {
+ // There is an active daemon and it is not the sender: treat as a standby
+ if (pending_map.standbys.count(m->get_gid()) > 0) {
+ dout(10) << "from existing standby " << m->get_gid() << dendl;
+ if (pending_map.standbys[m->get_gid()].available_modules !=
+ m->get_available_modules()) {
+ dout(10) << "existing standby " << m->get_gid() << " available_modules "
+ << m->get_available_modules() << " (was "
+ << pending_map.standbys[m->get_gid()].available_modules << ")"
+ << dendl;
+ pending_map.standbys[m->get_gid()].available_modules =
+ m->get_available_modules();
+ updated = true;
+ }
+ } else {
+ dout(10) << "new standby " << m->get_gid() << dendl;
+ mon.clog->debug() << "Standby manager daemon " << m->get_name()
+ << " started";
+ pending_map.standbys[m->get_gid()] = {m->get_gid(), m->get_name(),
+ m->get_available_modules(),
+ m->get_mgr_features()};
+ encode(m->get_metadata(), pending_metadata[m->get_name()]);
+ pending_metadata_rm.erase(m->get_name());
+ updated = true;
+ }
+ }
+
+ if (updated) {
+ dout(4) << "updating map" << dendl;
+ // C_Updated handles the outcome of the proposal for this op
+ wait_for_finished_proposal(op, new C_Updated(this, op));
+ } else {
+ dout(10) << "no change" << dendl;
+ }
+
+out:
+
+ // undo the plug taken by drop_active(), if any
+ if (plugged) {
+ paxos.unplug();
+ }
+
+ return updated;
+}
+
+// Run check_sub() over every "mgrmap" subscription, if there are any.
+void MgrMonitor::check_subs()
+{
+  const std::string type = "mgrmap";
+  auto& all_subs = mon.session_map.subs;
+  auto entry = all_subs.find(type);
+  if (entry == all_subs.end()) {
+    return;
+  }
+  for (auto sub : *(entry->second)) {
+    check_sub(sub);
+  }
+}
+
+// Service a single subscription.
+//  - "mgrmap": send the current map if the subscriber has not seen this
+//    epoch yet, then either drop a one-time subscription or advance `next`.
+//  - "mgrdigest" (the only other type accepted): digests are timer driven;
+//    a fresh registration (next == 0) cancels the timer, and a digest round
+//    is started whenever no timer is armed.
+void MgrMonitor::check_sub(Subscription *sub)
+{
+  if (sub->type != "mgrmap") {
+    ceph_assert(sub->type == "mgrdigest");
+    if (sub->next == 0) {
+      // new registration; cancel previous timer
+      cancel_timer();
+    }
+    if (!digest_event) {
+      send_digests();
+    }
+    return;
+  }
+
+  if (sub->next > map.get_epoch()) {
+    // subscriber is already up to date
+    return;
+  }
+
+  dout(20) << "Sending map to subscriber " << sub->session->con
+           << " " << sub->session->con->get_peer_addr() << dendl;
+  sub->session->con->send_message2(make_message<MMgrMap>(map));
+  if (sub->onetime) {
+    mon.session_map.remove_sub(sub);
+  } else {
+    sub->next = map.get_epoch() + 1;
+  }
+}
+
+/**
+ * Send a digest (health status + mon status, both as JSON) to every
+ * "mgrdigest" subscriber and re-arm the periodic timer.
+ *
+ * Digests are handled outside of check_sub because they are periodic
+ * rather than version-driven. With no subscribers the cached health
+ * snapshot is cleared and the timer is left cancelled; while paxos is not
+ * active nothing is sent but the timer is still re-armed.
+ */
+void MgrMonitor::send_digests()
+{
+  cancel_timer();
+
+  const std::string type = "mgrdigest";
+  if (mon.session_map.subs.count(type) == 0) {
+    prev_health_checks.clear();
+    return;
+  }
+
+  // if paxos is currently not active, don't send a digest but reenable timer
+  if (is_active()) {
+    dout(10) << __func__ << dendl;
+
+    for (auto sub : *(mon.session_map.subs[type])) {
+      dout(10) << __func__ << " sending digest to subscriber " << sub->session->con
+               << " " << sub->session->con->get_peer_addr() << dendl;
+      auto mdigest = make_message<MMgrDigest>();
+
+      // one formatter, reused for both payloads
+      JSONFormatter f;
+      mon.healthmon()->get_health_status(true, &f, nullptr, nullptr, nullptr);
+      f.flush(mdigest->health_json);
+      f.reset();
+
+      mon.get_mon_status(&f);
+      f.flush(mdigest->mon_status_json);
+      f.reset();
+
+      sub->session->con->send_message2(mdigest);
+    }
+  }
+
+  digest_event = mon.timer.add_event_after(
+    g_conf().get_val<int64_t>("mon_mgr_digest_period"),
+    new C_MonContext{&mon, [this](int) {
+      send_digests();
+    }});
+}
+
+// Cancel the pending digest timer event, if one is armed, and forget it.
+void MgrMonitor::cancel_timer()
+{
+  if (!digest_event) {
+    return;
+  }
+  mon.timer.cancel_event(digest_event);
+  digest_event = nullptr;
+}
+
+// Called when the service becomes active. Only the leader acts: it logs the
+// current map to the cluster log and, if the pending map's always-on module
+// table differs from the compiled-in one, installs the compiled-in table and
+// proposes.
+void MgrMonitor::on_active()
+{
+  if (!mon.is_leader()) {
+    return;
+  }
+
+  mon.clog->debug() << "mgrmap e" << map.epoch << ": " << map;
+  assert(HAVE_FEATURE(mon.get_quorum_con_features(), SERVER_NAUTILUS));
+
+  if (pending_map.always_on_modules != always_on_modules) {
+    dout(4) << "always on modules changed, pending "
+            << pending_map.always_on_modules << " != wanted "
+            << always_on_modules << dendl;
+    pending_map.always_on_modules = always_on_modules;
+    propose_pending();
+  }
+}
+
+/**
+ * Periodic housekeeping; a no-op unless this service is active and we are
+ * the leader.
+ *
+ * - Resets all beacon timestamps if our own tick was delayed by more than
+ *   (mon_mgr_beacon_grace - mgr_tick_period), so daemons are not blamed for
+ *   monitor-side slowness.
+ * - Back-fills a beacon timestamp for any daemon in the pending map that has
+ *   none yet.
+ * - Drops standbys, then the active, whose last beacon is older than
+ *   mon_mgr_beacon_grace, promoting a standby when possible.
+ * - Proposes when the MGR_DOWN grace period has just been exceeded on a
+ *   cluster that never had an active mgr, and when the obsolete
+ *   'orchestrator_cli' module is still enabled.
+ */
+void MgrMonitor::tick()
+{
+  if (!is_active() || !mon.is_leader())
+    return;
+
+  const auto now = ceph::coarse_mono_clock::now();
+
+  const auto mgr_beacon_grace =
+    g_conf().get_val<std::chrono::seconds>("mon_mgr_beacon_grace");
+
+  // Note that this is the mgr daemon's tick period, not ours (the
+  // beacon is sent with this period).
+  const auto mgr_tick_period =
+    g_conf().get_val<std::chrono::seconds>("mgr_tick_period");
+
+  if (last_tick != ceph::coarse_mono_clock::zero()
+      && (now - last_tick > (mgr_beacon_grace - mgr_tick_period))) {
+    // This case handles either local slowness (calls being delayed
+    // for whatever reason) or cluster election slowness (a long gap
+    // between calls while an election happened)
+    dout(4) << __func__ << ": resetting beacon timeouts due to mon delay "
+      "(slow election?) of " << now - last_tick << " seconds" << dendl;
+    for (auto &i : last_beacon) {
+      i.second = now;
+    }
+  }
+
+  last_tick = now;
+
+  // Populate any missing beacons (i.e. no beacon since MgrMonitor
+  // instantiation) with the current time, so that they will
+  // eventually look laggy if they fail to give us a beacon.
+  if (pending_map.active_gid != 0
+      && last_beacon.count(pending_map.active_gid) == 0) {
+    last_beacon[pending_map.active_gid] = now;
+  }
+  // iterate by reference: only the gid is needed, no need to copy each entry
+  for (const auto &s : pending_map.standbys) {
+    if (last_beacon.count(s.first) == 0) {
+      last_beacon[s.first] = now;
+    }
+  }
+
+  // Cull standbys first so that any remaining standbys
+  // will be eligible to take over from the active if we cull him.
+  // Collect first, drop afterwards: drop_standby() erases from the map
+  // being iterated here.
+  std::list<uint64_t> dead_standbys;
+  const auto cutoff = now - mgr_beacon_grace;
+  for (const auto &i : pending_map.standbys) {
+    auto last_beacon_time = last_beacon.at(i.first);
+    if (last_beacon_time < cutoff) {
+      dead_standbys.push_back(i.first);
+    }
+  }
+
+  bool propose = false;
+  bool plugged = false;
+
+  for (auto i : dead_standbys) {
+    dout(4) << "Dropping laggy standby " << i << dendl;
+    drop_standby(i);
+    propose = true;
+  }
+
+  if (pending_map.active_gid != 0
+      && last_beacon.at(pending_map.active_gid) < cutoff
+      && mon.osdmon()->is_writeable()) {
+    const std::string old_active_name = pending_map.active_name;
+    // Log before dropping: drop_active() resets pending_map.active_gid to 0,
+    // so logging afterwards would always report gid 0.
+    dout(4) << "Dropping active " << pending_map.active_gid << dendl;
+    plugged |= drop_active();
+    propose = true;
+    if (promote_standby()) {
+      dout(4) << "Promoted standby " << pending_map.active_gid << dendl;
+      mon.clog->info() << "Manager daemon " << old_active_name
+                       << " is unresponsive, replacing it with standby"
+                       << " daemon " << pending_map.active_name;
+    } else {
+      dout(4) << "Active is laggy but have no standbys to replace it" << dendl;
+      mon.clog->info() << "Manager daemon " << old_active_name
+                       << " is unresponsive.  No standby daemons available.";
+    }
+  } else if (pending_map.active_gid == 0) {
+    if (promote_standby()) {
+      dout(4) << "Promoted standby " << pending_map.active_gid << dendl;
+      mon.clog->info() << "Activating manager daemon "
+                       << pending_map.active_name;
+      propose = true;
+    }
+  }
+
+  if (!pending_map.available &&
+      !ever_had_active_mgr &&
+      should_warn_about_mgr_down() != HEALTH_OK) {
+    dout(10) << " exceeded mon_mgr_mkfs_grace "
+             << g_conf().get_val<int64_t>("mon_mgr_mkfs_grace")
+             << " seconds" << dendl;
+    propose = true;
+  }
+
+  // obsolete modules?
+  if (mon.monmap->min_mon_release >= ceph_release_t::octopus &&
+      pending_map.module_enabled("orchestrator_cli")) {
+    dout(10) << " disabling obsolete/renamed 'orchestrator_cli'" << dendl;
+    // we don't need to enable 'orchestrator' because it's now always-on
+    pending_map.modules.erase("orchestrator_cli");
+    propose = true;
+  }
+
+  if (propose) {
+    propose_pending();
+  }
+  if (plugged) {
+    // drop_active() plugged paxos; release it and kick the proposal
+    paxos.unplug();
+    ceph_assert(propose);
+    paxos.trigger_propose();
+  }
+}
+
+// Reset the leader-specific bookkeeping: restart the tick clock from now
+// and forget every recorded beacon time.
+void MgrMonitor::on_restart()
+{
+  last_tick = ceph::coarse_mono_clock::now();
+  last_beacon.clear();
+}
+
+
+// Promote a standby to active in the pending map.
+//
+// Precondition (asserted): there is currently no active daemon.
+// Returns false without changes if MgrMap::FLAG_DOWN is set or there are no
+// standbys. Otherwise the first standby in the map becomes the (not yet
+// available, address-less) active, its standby entry is removed while its
+// metadata is kept, and true is returned.
+bool MgrMonitor::promote_standby()
+{
+  ceph_assert(pending_map.active_gid == 0);
+
+  const bool down_flag_set = (pending_map.flags & MgrMap::FLAG_DOWN) != 0;
+  if (down_flag_set || pending_map.standbys.empty()) {
+    return false;
+  }
+
+  // Promote a replacement (arbitrary choice of standby)
+  const auto chosen = pending_map.standbys.begin();
+  const auto replacement_gid = chosen->first;
+  const auto& standby = chosen->second;
+
+  pending_map.active_gid = replacement_gid;
+  pending_map.active_name = standby.name;
+  pending_map.available_modules = standby.available_modules;
+  pending_map.active_mgr_features = standby.mgr_features;
+  pending_map.available = false;
+  pending_map.active_addrs = entity_addrvec_t();
+  pending_map.active_change = ceph_clock_now();
+
+  mon.clog->info() << "Activating manager daemon "
+                   << pending_map.active_name;
+
+  // `standby` dangles after this call; it is not used again.
+  drop_standby(replacement_gid, false);
+  return true;
+}
+
+// Remove the active mgr from the pending map and blocklist it.
+//
+// Preconditions (asserted): osdmon is writeable and there is an active gid.
+// The active's addresses and its RADOS client addresses are blocklisted
+// until now + mon_mgr_blocklist_interval, an osdmon proposal is requested,
+// the active's metadata is queued for removal, all active-related fields of
+// pending_map are reset, and an immediate propose is forced.
+//
+// Returns true if this call plugged paxos; the caller must then unplug it.
+bool MgrMonitor::drop_active()
+{
+ ceph_assert(mon.osdmon()->is_writeable());
+
+ // plug paxos unless someone already did; report back whether we did
+ bool plugged = false;
+ if (!paxos.is_plugged()) {
+ paxos.plug();
+ plugged = true;
+ }
+
+ if (last_beacon.count(pending_map.active_gid) > 0) {
+ last_beacon.erase(pending_map.active_gid);
+ }
+
+ ceph_assert(pending_map.active_gid > 0);
+ auto until = ceph_clock_now();
+ until += g_conf().get_val<double>("mon_mgr_blocklist_interval");
+ dout(5) << "blocklisting previous mgr." << pending_map.active_name << "."
+ << pending_map.active_gid << " ("
+ << pending_map.active_addrs << ")" << dendl;
+ // the value returned here is recorded below as last_failure_osd_epoch
+ auto blocklist_epoch = mon.osdmon()->blocklist(pending_map.active_addrs, until);
+
+ /* blocklist RADOS clients in use by the mgr */
+ for (const auto& a : pending_map.clients) {
+ mon.osdmon()->blocklist(a.second, until);
+ }
+ request_proposal(mon.osdmon());
+
+ // queue removal of the old active's metadata and cancel any pending write
+ pending_metadata_rm.insert(pending_map.active_name);
+ pending_metadata.erase(pending_map.active_name);
+ pending_map.active_name = "";
+ pending_map.active_gid = 0;
+ pending_map.active_change = ceph_clock_now();
+ pending_map.active_mgr_features = 0;
+ pending_map.available = false;
+ pending_map.active_addrs = entity_addrvec_t();
+ pending_map.services.clear();
+ pending_map.clients.clear();
+ pending_map.last_failure_osd_epoch = blocklist_epoch;
+
+ /* If we are dropping the active, we need to notify clients immediately.
+ * Additionally, avoid logical races with ::prepare_beacon which cannot
+ * accurately determine if a mgr is a standby or an old active.
+ */
+ force_immediate_propose();
+
+ // So that when new active mgr subscribes to mgrdigest, it will
+ // get an immediate response instead of waiting for next timer
+ cancel_timer();
+ return plugged;
+}
+
+// Remove standby `gid` from the pending map and forget its beacon time.
+// When drop_meta is true its metadata is also queued for removal (and any
+// queued metadata write for that name is discarded).
+void MgrMonitor::drop_standby(uint64_t gid, bool drop_meta)
+{
+  if (drop_meta) {
+    // operator[] on purpose: same lookup semantics as before, including for
+    // a gid that is not present.
+    const auto& standby_name = pending_map.standbys[gid].name;
+    pending_metadata_rm.insert(standby_name);
+    pending_metadata.erase(standby_name);
+  }
+  pending_map.standbys.erase(gid);
+  last_beacon.erase(gid);
+}
+
+// Handle the read-only "mgr ..." commands against the committed map:
+//   mgr stat, mgr dump, mgr module ls, mgr services, mgr metadata,
+//   mgr versions, mgr count-metadata.
+//
+// Returns true when a reply has been sent (including error replies), and
+// false for any other prefix so that the caller can process it further.
+// Only the first line of the accumulated status text in `ss` is sent back.
+bool MgrMonitor::preprocess_command(MonOpRequestRef op)
+{
+ auto m = op->get_req<MMonCommand>();
+ std::stringstream ss;
+ bufferlist rdata;
+
+ cmdmap_t cmdmap;
+ if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
+ string rs = ss.str();
+ mon.reply_command(op, -EINVAL, rs, rdata, get_last_committed());
+ return true;
+ }
+
+ MonSession *session = op->get_session();
+ if (!session) {
+ mon.reply_command(op, -EACCES, "access denied", rdata,
+ get_last_committed());
+ return true;
+ }
+
+ // `f` may be null here (the branches below check for that); most commands
+ // then fall back to a json-pretty formatter, while "mgr module ls" prints
+ // a text table instead.
+ string format = cmd_getval_or<string>(cmdmap, "format", "plain");
+ boost::scoped_ptr<Formatter> f(Formatter::create(format));
+
+ string prefix;
+ cmd_getval(cmdmap, "prefix", prefix);
+ int r = 0;
+
+ if (prefix == "mgr stat") {
+ if (!f) {
+ f.reset(Formatter::create(format, "json-pretty", "json-pretty"));
+ }
+ f->open_object_section("stat");
+ f->dump_unsigned("epoch", map.get_epoch());
+ f->dump_bool("available", map.get_available());
+ f->dump_string("active_name", map.get_active_name());
+ f->dump_unsigned("num_standby", map.get_num_standby());
+ f->close_section();
+ f->flush(rdata);
+ } else if (prefix == "mgr dump") {
+ if (!f) {
+ f.reset(Formatter::create(format, "json-pretty", "json-pretty"));
+ }
+ // default to the current epoch; older epochs are read from the store
+ int64_t epoch = cmd_getval_or<int64_t>(cmdmap, "epoch", map.get_epoch());
+ if (epoch == (int64_t)map.get_epoch()) {
+ f->dump_object("mgrmap", map);
+ } else {
+ bufferlist bl;
+ int err = get_version(epoch, bl);
+ // NOTE(review): only -ENOENT is handled; any other error from
+ // get_version() falls through to decoding `bl`.
+ if (err == -ENOENT) {
+ r = -ENOENT;
+ ss << "there is no map for epoch " << epoch;
+ goto reply;
+ }
+ // this local MgrMap shadows the request message `m` declared above
+ MgrMap m;
+ auto p = bl.cbegin();
+ m.decode(p);
+ f->dump_object("mgrmap", m);
+ }
+ f->flush(rdata);
+ } else if (prefix == "mgr module ls") {
+ // Three disjoint groups: always-on, enabled (minus always-on), and
+ // available but neither enabled nor always-on.
+ if (f) {
+ f->open_object_section("modules");
+ {
+ f->open_array_section("always_on_modules");
+ for (auto& p : map.get_always_on_modules()) {
+ f->dump_string("module", p);
+ }
+ f->close_section();
+ f->open_array_section("enabled_modules");
+ for (auto& p : map.modules) {
+ if (map.get_always_on_modules().count(p) > 0)
+ continue;
+ // We only show the name for enabled modules. Any errors
+ // etc. will show up as health checks.
+ f->dump_string("module", p);
+ }
+ f->close_section();
+ f->open_array_section("disabled_modules");
+ for (auto& p : map.available_modules) {
+ if (map.modules.count(p.name) == 0 &&
+ map.get_always_on_modules().count(p.name) == 0) {
+ // For disabled modules, we show the full info if the detail
+ // parameter is enabled, to give a hint about whether enabling it will work
+ // NOTE(review): no "detail" parameter is read in this function; the
+ // full info is dumped unconditionally here.
+ p.dump(f.get());
+ }
+ }
+ f->close_section();
+ }
+ f->close_section();
+ f->flush(rdata);
+ } else {
+ // no formatter: render a two-column text table
+ TextTable tbl;
+ tbl.define_column("MODULE", TextTable::LEFT, TextTable::LEFT);
+ tbl.define_column(" ", TextTable::LEFT, TextTable::LEFT);
+
+ for (auto& p : map.get_always_on_modules()) {
+ tbl << p;
+ tbl << "on (always on)";
+ tbl << TextTable::endrow;
+ }
+ for (auto& p : map.modules) {
+ if (map.get_always_on_modules().count(p) > 0)
+ continue;
+ tbl << p;
+ tbl << "on";
+ tbl << TextTable::endrow;
+ }
+ for (auto& p : map.available_modules) {
+ if (map.modules.count(p.name) == 0 &&
+ map.get_always_on_modules().count(p.name) == 0) {
+ tbl << p.name;
+ tbl << "-";
+ tbl << TextTable::endrow;
+ }
+ }
+ rdata.append(stringify(tbl));
+ }
+ } else if (prefix == "mgr services") {
+ if (!f) {
+ f.reset(Formatter::create(format, "json-pretty", "json-pretty"));
+ }
+ f->open_object_section("services");
+ for (const auto &i : map.services) {
+ f->dump_string(i.first.c_str(), i.second);
+ }
+ f->close_section();
+ f->flush(rdata);
+ } else if (prefix == "mgr metadata") {
+ if (!f) {
+ f.reset(Formatter::create(format, "json-pretty", "json-pretty"));
+ }
+ // with "who": metadata of that one daemon; without: an array for all
+ string name;
+ cmd_getval(cmdmap, "who", name);
+ if (name.size() > 0 && !map.have_name(name)) {
+ ss << "mgr." << name << " does not exist";
+ r = -ENOENT;
+ goto reply;
+ }
+ if (name.size()) {
+ f->open_object_section("mgr_metadata");
+ f->dump_string("name", name);
+ r = dump_metadata(name, f.get(), &ss);
+ // on error we reply without flushing, so rdata stays empty
+ if (r < 0)
+ goto reply;
+ f->close_section();
+ } else {
+ r = 0;
+ f->open_array_section("mgr_metadata");
+ for (auto& i : map.get_all_names()) {
+ f->open_object_section("mgr");
+ f->dump_string("name", i);
+ r = dump_metadata(i, f.get(), NULL);
+ if (r == -EINVAL || r == -ENOENT) {
+ // Drop error, continue to get other daemons' metadata
+ dout(4) << "No metadata for mgr." << i << dendl;
+ r = 0;
+ } else if (r < 0) {
+ // Unexpected error
+ goto reply;
+ }
+ f->close_section();
+ }
+ f->close_section();
+ }
+ f->flush(rdata);
+ } else if (prefix == "mgr versions") {
+ if (!f) {
+ f.reset(Formatter::create(format, "json-pretty", "json-pretty"));
+ }
+ count_metadata("ceph_version", f.get());
+ f->flush(rdata);
+ r = 0;
+ } else if (prefix == "mgr count-metadata") {
+ if (!f) {
+ f.reset(Formatter::create(format, "json-pretty", "json-pretty"));
+ }
+ string field;
+ cmd_getval(cmdmap, "property", field);
+ count_metadata(field, f.get());
+ f->flush(rdata);
+ r = 0;
+ } else {
+ // not one of ours
+ return false;
+ }
+
+reply:
+ // getline() extracts only the first line of whatever was written to ss
+ string rs;
+ getline(ss, rs);
+ mon.reply_command(op, r, rs, rdata, get_last_committed());
+ return true;
+}
+
+/**
+ * Handle the mutating "mgr ..." commands ("mgr set", "mgr fail",
+ * "mgr module enable", "mgr module disable") by editing pending_map.
+ *
+ * After argument parsing, every failure path sets `r` and jumps to `out`,
+ * so the client always receives a reply and any paxos plug taken by
+ * drop_active() is released.  The only paths that bypass `out` are the
+ * early replies for an unparsable command / missing session and the
+ * wait_for_writeable() retries.
+ *
+ * @param op the MMonCommand request being processed
+ * @return `r >= 0` for commands that reached `out`; false when the op was
+ *         queued for retry on the OSDMonitor; true after the early
+ *         error replies.  (Presumably the caller uses this to decide
+ *         whether to propose pending state -- confirm against PaxosService.)
+ */
+bool MgrMonitor::prepare_command(MonOpRequestRef op)
+{
+  auto m = op->get_req<MMonCommand>();
+
+  std::stringstream ss;
+  bufferlist rdata;
+
+  cmdmap_t cmdmap;
+  if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
+    string rs = ss.str();
+    mon.reply_command(op, -EINVAL, rs, rdata, get_last_committed());
+    return true;
+  }
+
+  MonSession *session = op->get_session();
+  if (!session) {
+    mon.reply_command(op, -EACCES, "access denied", rdata, get_last_committed());
+    return true;
+  }
+
+  string format = cmd_getval_or<string>(cmdmap, "format", "plain");
+  boost::scoped_ptr<Formatter> f(Formatter::create(format));
+
+  const auto prefix = cmd_getval_or<string>(cmdmap, "prefix", string{});
+  int r = 0;
+  bool plugged = false;
+
+  if (prefix == "mgr set") {
+    std::string var;
+    if (!cmd_getval(cmdmap, "var", var) || var.empty()) {
+      ss << "Invalid variable";
+      // Do not `return -EINVAL` here: in a bool function that converts to
+      // true and the client would never get a reply.
+      r = -EINVAL;
+      goto out;
+    }
+    string val;
+    if (!cmd_getval(cmdmap, "val", val)) {
+      r = -EINVAL;
+      goto out;
+    }
+
+    if (var == "down") {
+      bool enable_down = false;
+      // Use the function-level `r` (no shadowing) so the error reaches `out`.
+      r = parse_bool(val, &enable_down, ss);
+      if (r != 0) {
+        goto out;
+      }
+      if (enable_down) {
+        if (!mon.osdmon()->is_writeable()) {
+          mon.osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
+          return false;
+        }
+        pending_map.flags |= MgrMap::FLAG_DOWN;
+        plugged |= drop_active();
+      } else {
+        pending_map.flags &= ~(MgrMap::FLAG_DOWN);
+      }
+    } else {
+      r = -EINVAL;
+      goto out;
+    }
+  } else if (prefix == "mgr fail") {
+    string who;
+    if (!cmd_getval(cmdmap, "who", who)) {
+      // No target given: default to the currently active mgr, if any.
+      if (!map.active_gid) {
+        ss << "Currently no active mgr";
+        goto out;
+      }
+      who = map.active_name;
+    }
+
+    std::string err;
+    uint64_t gid = strict_strtol(who.c_str(), 10, &err);
+    bool changed = false;
+    if (!err.empty()) {
+      // Does not parse as a gid, treat it as a name
+      if (pending_map.active_name == who) {
+        if (!mon.osdmon()->is_writeable()) {
+          mon.osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
+          return false;
+        }
+        plugged |= drop_active();
+        changed = true;
+      } else {
+        gid = 0;
+        for (const auto &i : pending_map.standbys) {
+          if (i.second.name == who) {
+            gid = i.first;
+            break;
+          }
+        }
+        if (gid != 0) {
+          drop_standby(gid);
+          changed = true;
+        } else {
+          ss << "Daemon not found '" << who << "', already failed?";
+        }
+      }
+    } else {
+      if (pending_map.active_gid == gid) {
+        if (!mon.osdmon()->is_writeable()) {
+          mon.osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
+          return false;
+        }
+        plugged |= drop_active();
+        changed = true;
+      } else if (pending_map.standbys.count(gid) > 0) {
+        drop_standby(gid);
+        changed = true;
+      } else {
+        ss << "Daemon not found '" << gid << "', already failed?";
+      }
+    }
+
+    // If we removed the active daemon, try to replace it right away.
+    if (changed && pending_map.active_gid == 0) {
+      promote_standby();
+    }
+  } else if (prefix == "mgr module enable") {
+    string module;
+    cmd_getval(cmdmap, "module", module);
+    if (module.empty()) {
+      r = -EINVAL;
+      goto out;
+    }
+    if (pending_map.get_always_on_modules().count(module) > 0) {
+      ss << "module '" << module << "' is already enabled (always-on)";
+      goto out;
+    }
+    bool force = false;
+    cmd_getval_compat_cephbool(cmdmap, "force", force);
+    if (!pending_map.all_support_module(module) &&
+        !force) {
+      ss << "all mgr daemons do not support module '" << module << "', pass "
+         << "--force to force enablement";
+      r = -ENOENT;
+      goto out;
+    }
+
+    std::string can_run_error;
+    if (!force && !pending_map.can_run_module(module, &can_run_error)) {
+      ss << "module '" << module << "' reports that it cannot run on the active "
+            "manager daemon: " << can_run_error << " (pass --force to force "
+            "enablement)";
+      r = -ENOENT;
+      goto out;
+    }
+
+    if (pending_map.module_enabled(module)) {
+      ss << "module '" << module << "' is already enabled";
+      r = 0;
+      goto out;
+    }
+    pending_map.modules.insert(module);
+  } else if (prefix == "mgr module disable") {
+    string module;
+    cmd_getval(cmdmap, "module", module);
+    if (module.empty()) {
+      r = -EINVAL;
+      goto out;
+    }
+    if (pending_map.get_always_on_modules().count(module) > 0) {
+      ss << "module '" << module << "' cannot be disabled (always-on)";
+      r = -EINVAL;
+      goto out;
+    }
+    if (!pending_map.module_enabled(module)) {
+      ss << "module '" << module << "' is already disabled";
+      r = 0;
+      goto out;
+    }
+    if (!pending_map.modules.count(module)) {
+      ss << "module '" << module << "' is not enabled";
+    }
+    pending_map.modules.erase(module);
+  } else {
+    ss << "Command '" << prefix << "' not implemented!";
+    r = -ENOSYS;
+  }
+
+out:
+  dout(4) << __func__ << " done, r=" << r << dendl;
+  /* Compose response */
+  string rs;
+  getline(ss, rs);
+
+  if (r >= 0) {
+    // success.. delay reply
+    wait_for_commit(op, new Monitor::C_Command(mon, op, r, rs,
+                                               get_last_committed() + 1));
+  } else {
+    // reply immediately
+    mon.reply_command(op, r, rs, rdata, get_last_committed());
+  }
+
+  if (plugged) {
+    paxos.unplug();
+  }
+
+  return r >= 0;
+}
+
+// Start the digest cycle if no digest event is pending yet.
+void MgrMonitor::init()
+{
+  if (!digest_event) {
+    // One call is enough: send_digests() schedules its own follow-up event.
+    send_digests();
+  }
+}
+
+// Shutdown hook: the only work done here is cancelling the timer.
+void MgrMonitor::on_shutdown()
+{
+ cancel_timer();
+}
+
+/**
+ * Read and decode the stored metadata for one mgr daemon.
+ *
+ * @param name daemon name, used as the key under MGR_METADATA_PREFIX
+ * @param m    receives the decoded key/value pairs
+ * @param err  optional stream for a human-readable decode error
+ * @return 0 on success, the (negative) store error if the read failed,
+ *         or -EIO if the stored blob could not be decoded
+ */
+int MgrMonitor::load_metadata(const string& name, std::map<string, string>& m,
+                              ostream *err) const
+{
+  bufferlist raw;
+  const int rc = mon.store->get(MGR_METADATA_PREFIX, name, raw);
+  if (rc < 0) {
+    return rc;
+  }
+
+  try {
+    auto cursor = raw.cbegin();
+    decode(m, cursor);
+    return 0;
+  } catch (ceph::buffer::error& e) {
+    if (err != nullptr) {
+      *err << "mgr." << name << " metadata is corrupt";
+    }
+  }
+  return -EIO;
+}
+
+/**
+ * Tally how many known mgr daemons report each value of a metadata field.
+ *
+ * Daemons whose metadata lacks the field (including those whose metadata
+ * failed to load, leaving the map empty) are counted under "unknown".
+ *
+ * @param field metadata key to group by
+ * @param out   value -> count map that is incremented in place
+ */
+void MgrMonitor::count_metadata(const string& field, std::map<string,int> *out)
+{
+  const std::set<string> names = map.get_all_names();
+  for (const auto& daemon : names) {
+    std::map<string,string> meta;
+    // Best effort: a failed load leaves `meta` empty.
+    load_metadata(daemon, meta, nullptr);
+    const auto hit = meta.find(field);
+    if (hit != meta.end()) {
+      ++(*out)[hit->second];
+    } else {
+      ++(*out)["unknown"];
+    }
+  }
+}
+
+/**
+ * Formatter flavour of count_metadata(): emits one object section named
+ * after `field`, containing a `value: count` integer entry per value.
+ */
+void MgrMonitor::count_metadata(const string& field, Formatter *f)
+{
+  std::map<string,int> tally;
+  count_metadata(field, &tally);
+
+  f->open_object_section(field.c_str());
+  for (auto it = tally.begin(); it != tally.end(); ++it) {
+    f->dump_int(it->first.c_str(), it->second);
+  }
+  f->close_section();
+}
+
+/**
+ * Append "mgr.<name>" to versions[<ceph_version_short>] for every known
+ * mgr daemon whose metadata carries a "ceph_version_short" entry.
+ * Daemons without that entry are left out.
+ */
+void MgrMonitor::get_versions(std::map<string, list<string> > &versions)
+{
+  const std::set<string> names = map.get_all_names();
+  for (const auto& daemon : names) {
+    std::map<string,string> meta;
+    load_metadata(daemon, meta, nullptr);
+    const auto ver = meta.find("ceph_version_short");
+    if (ver != meta.end()) {
+      versions[ver->second].push_back(string("mgr.") + daemon);
+    }
+  }
+}
+
+/**
+ * Load one daemon's metadata and dump every key/value pair as a string
+ * into the currently open formatter section.
+ *
+ * @return 0 on success, otherwise the non-zero result of load_metadata()
+ *         (nothing is dumped in that case)
+ */
+int MgrMonitor::dump_metadata(const string& name, Formatter *f, ostream *err)
+{
+  std::map<string,string> meta;
+  const int rc = load_metadata(name, meta, err);
+  if (rc != 0) {
+    return rc;
+  }
+  for (const auto& kv : meta) {
+    f->dump_string(kv.first.c_str(), kv.second);
+  }
+  return 0;
+}
+
+/**
+ * Group the known mgr daemons by the "hostname" entry of their metadata
+ * and hand the grouping to dump_services() under the "mgr" type.
+ * Daemons whose metadata cannot be loaded or has no hostname are skipped.
+ */
+void MgrMonitor::print_nodes(Formatter *f) const
+{
+  ceph_assert(f);
+
+  std::map<string, list<string> > by_host; // hostname => mgr
+  const auto names = map.get_all_names();
+  for (const auto& daemon : names) {
+    std::map<string,string> meta;
+    if (load_metadata(daemon, meta, nullptr) != 0) {
+      continue;
+    }
+    const auto host = meta.find("hostname");
+    // A missing hostname is unlikely, but tolerate it.
+    if (host != meta.end()) {
+      by_host[host->second].push_back(daemon);
+    }
+  }
+
+  dump_services(f, by_host, "mgr");
+}
+
+/**
+ * Return the command descriptions to advertise: the learned
+ * command_descs when there are any, otherwise the static mgr_commands.
+ */
+const std::vector<MonCommand> &MgrMonitor::get_command_descs() const
+{
+  if (!command_descs.empty()) {
+    return command_descs;
+  }
+  // must have just upgraded; fallback to static commands
+  return mgr_commands;
+}
diff --git a/src/mon/MgrMonitor.h b/src/mon/MgrMonitor.h
new file mode 100644
index 000000000..a2a84c141
--- /dev/null
+++ b/src/mon/MgrMonitor.h
@@ -0,0 +1,147 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 John Spray <john.spray@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+#ifndef CEPH_MGRMONITOR_H
+#define CEPH_MGRMONITOR_H
+
+#include <map>
+#include <set>
+
+#include "include/Context.h"
+#include "MgrMap.h"
+#include "PaxosService.h"
+#include "MonCommand.h"
+#include "CommandHandler.h"
+
+// Monitor-side service for the MgrMap: it is a PaxosService and a
+// CommandHandler, and exposes beacon, command, subscription and metadata
+// entry points (see the public section below).
+class MgrMonitor: public PaxosService, public CommandHandler
+{
+ // `map` is what get_map() hands out; `pending_map` is the copy that
+ // prepare_command() mutates.  (Presumably committed vs. proposed state --
+ // confirm against create_pending()/encode_pending().)
+ MgrMap map;
+ MgrMap pending_map;
+ bool ever_had_active_mgr = false;
+
+ // NOTE(review): by their names these hold metadata writes/removals queued
+ // for the next commit -- confirm in encode_pending().
+ std::map<std::string, ceph::buffer::list> pending_metadata;
+ std::set<std::string> pending_metadata_rm;
+
+ // Exposed read-only through get_mgr_module_options().
+ std::map<std::string,Option> mgr_module_options;
+ std::list<std::string> misc_option_strings;
+
+ utime_t first_seen_inactive;
+
+ // Keyed by uint64_t (presumably the daemon gid); see the note on
+ // last_tick at the end of the class for how the timeouts are reset.
+ std::map<uint64_t, ceph::coarse_mono_clock::time_point> last_beacon;
+
+ /**
+ * If a standby is available, make it active, given that
+ * there is currently no active daemon.
+ *
+ * @return true if a standby was promoted
+ */
+ bool promote_standby();
+
+ /**
+ * Drop the active daemon from the MgrMap. No promotion is performed.
+ *
+ * @return whether PAXOS was plugged by this method
+ */
+ bool drop_active();
+
+ /**
+ * Remove this gid from the list of standbys. By default,
+ * also remove metadata (i.e. forget the daemon entirely).
+ *
+ * Set `drop_meta` to false if you would like to keep
+ * the daemon's metadata, for example if you're dropping
+ * it as a standby before reinstating it as the active daemon.
+ */
+ void drop_standby(uint64_t gid, bool drop_meta=true);
+
+ // init() calls send_digests() when this is still null; on_shutdown()
+ // calls cancel_timer().
+ Context *digest_event = nullptr;
+ void cancel_timer();
+
+ std::vector<health_check_map_t> prev_health_checks;
+
+ bool check_caps(MonOpRequestRef op, const uuid_d& fsid);
+
+ health_status_t should_warn_about_mgr_down();
+
+ // Command descriptions we've learned from the active mgr
+ std::vector<MonCommand> command_descs;
+ std::vector<MonCommand> pending_command_descs;
+
+public:
+ MgrMonitor(Monitor &mn, Paxos &p, const std::string& service_name)
+ : PaxosService(mn, p, service_name)
+ {}
+ ~MgrMonitor() override {}
+
+ void init() override;
+ void on_shutdown() override;
+
+ const MgrMap &get_map() const { return map; }
+
+ const std::map<std::string,Option>& get_mgr_module_options() {
+ return mgr_module_options;
+ }
+ const Option *find_module_option(const std::string& name);
+
+ // True once the map has a non-zero epoch.
+ bool in_use() const { return map.epoch > 0; }
+
+ version_t get_trim_to() const override;
+
+ void prime_mgr_client();
+
+ void create_initial() override;
+ void get_store_prefixes(std::set<std::string>& s) const override;
+ void update_from_paxos(bool *need_bootstrap) override;
+ void post_paxos_update() override;
+ void create_pending() override;
+ void encode_pending(MonitorDBStore::TransactionRef t) override;
+
+ bool preprocess_query(MonOpRequestRef op) override;
+ bool prepare_update(MonOpRequestRef op) override;
+
+ // preprocess_command() answers the read-only "mgr ..." commands directly;
+ // prepare_command() handles the ones that modify pending_map.
+ bool preprocess_command(MonOpRequestRef op);
+ bool prepare_command(MonOpRequestRef op);
+
+ // Intentionally a no-op for this service.
+ void encode_full(MonitorDBStore::TransactionRef t) override { }
+
+ bool preprocess_beacon(MonOpRequestRef op);
+ bool prepare_beacon(MonOpRequestRef op);
+
+ void check_sub(Subscription *sub);
+ void check_subs();
+ void send_digests();
+
+ void on_active() override;
+ void on_restart() override;
+
+ void tick() override;
+
+ void print_summary(ceph::Formatter *f, std::ostream *ss) const;
+
+ // Returns command_descs, or the static command table while
+ // command_descs is still empty.
+ const std::vector<MonCommand> &get_command_descs() const;
+
+ // Metadata helpers.  load_metadata() returns 0, a negative store error,
+ // or -EIO when the stored blob cannot be decoded; the others build on it.
+ int load_metadata(const std::string& name, std::map<std::string, std::string>& m,
+ std::ostream *err) const;
+ int dump_metadata(const std::string& name, ceph::Formatter *f, std::ostream *err);
+ void print_nodes(ceph::Formatter *f) const;
+ void count_metadata(const std::string& field, ceph::Formatter *f);
+ void count_metadata(const std::string& field, std::map<std::string,int> *out);
+ void get_versions(std::map<std::string, std::list<std::string>> &versions);
+
+ // When did the mon last call into our tick() method? Used for detecting
+ // when the mon was not updating us for some period (e.g. during slow
+ // election) to reset last_beacon timeouts
+ ceph::coarse_mono_clock::time_point last_tick;
+};
+
+#endif
diff --git a/src/mon/MgrStatMonitor.cc b/src/mon/MgrStatMonitor.cc
new file mode 100644
index 000000000..9da4c50da
--- /dev/null
+++ b/src/mon/MgrStatMonitor.cc
@@ -0,0 +1,367 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "MgrStatMonitor.h"
+#include "mon/OSDMonitor.h"
+#include "mon/MgrMonitor.h"
+#include "mon/PGMap.h"
+#include "messages/MGetPoolStats.h"
+#include "messages/MGetPoolStatsReply.h"
+#include "messages/MMonMgrReport.h"
+#include "messages/MStatfs.h"
+#include "messages/MStatfsReply.h"
+#include "messages/MServiceMap.h"
+
+#include "include/ceph_assert.h" // re-clobber assert
+
+#define dout_subsys ceph_subsys_mon
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, mon)
+
+using std::dec;
+using std::hex;
+using std::list;
+using std::map;
+using std::make_pair;
+using std::ostream;
+using std::ostringstream;
+using std::pair;
+using std::set;
+using std::string;
+using std::stringstream;
+using std::to_string;
+using std::vector;
+
+using ceph::bufferlist;
+using ceph::decode;
+using ceph::encode;
+using ceph::ErasureCodeInterfaceRef;
+using ceph::ErasureCodeProfile;
+using ceph::Formatter;
+using ceph::JSONFormatter;
+using ceph::make_message;
+using ceph::mono_clock;
+using ceph::mono_time;
+
+// Log-line prefix used by dout_prefix in this file:
+// "mon.<name>@<rank>(<state>).mgrstat ".
+static ostream& _prefix(std::ostream *_dout, Monitor &mon) {
+  std::ostream& out = *_dout;
+  out << "mon." << mon.name << "@" << mon.rank;
+  out << "(" << mon.get_state_name() << ").mgrstat ";
+  return out;
+}
+
+// Forwards the monitor, paxos instance and service name to PaxosService;
+// no further setup happens in the constructor body.
+MgrStatMonitor::MgrStatMonitor(Monitor &mn, Paxos &p, const string& service_name)
+ : PaxosService(mn, p, service_name)
+{
+}
+
+// Defaulted out of line, i.e. in the .cc file rather than in the header.
+MgrStatMonitor::~MgrStatMonitor() = default;
+
+// Set up the very first state: version 0 and a service map at epoch 1
+// stamped with the current time, pre-encoded into pending_service_map_bl.
+void MgrStatMonitor::create_initial()
+{
+ dout(10) << __func__ << dendl;
+ version = 0;
+ service_map.epoch = 1;
+ service_map.modified = ceph_clock_now();
+ pending_service_map_bl.clear();
+ // Encoded with CEPH_FEATURES_ALL here, unlike create_pending(), which
+ // uses the quorum's connection features.
+ encode(service_map, pending_service_map_bl, CEPH_FEATURES_ALL);
+}
+
+// Reload the live state (digest, service map, progress events) from the
+// last committed version, then refresh subscribers, perf counters and
+// notify the OSDMonitor of the new PG digest.
+// `need_bootstrap` is not touched by this implementation.
+void MgrStatMonitor::update_from_paxos(bool *need_bootstrap)
+{
+ version = get_last_committed();
+ dout(10) << " " << version << dendl;
+ load_health();
+ bufferlist bl;
+ get_version(version, bl);
+ // Version 0 means nothing has been committed yet, so there is nothing
+ // to decode.
+ if (version) {
+ ceph_assert(bl.length());
+ try {
+ auto p = bl.cbegin();
+ // Decode order mirrors encode_pending(): digest, service map, then
+ // progress events.
+ decode(digest, p);
+ decode(service_map, p);
+ // Progress events are only decoded when bytes remain in the blob.
+ if (!p.end()) {
+ decode(progress_events, p);
+ }
+ dout(10) << __func__ << " v" << version
+ << " service_map e" << service_map.epoch
+ << " " << progress_events.size() << " progress events"
+ << dendl;
+ }
+ catch (ceph::buffer::error& e) {
+ // A decode failure is logged and otherwise ignored; whatever was
+ // decoded before the error stays in place.
+ derr << "failed to decode mgrstat state; luminous dev version? "
+ << e.what() << dendl;
+ }
+ }
+ check_subs();
+ update_logger();
+ mon.osdmon()->notify_new_pg_digest();
+}
+
+// Publish the current digest's totals (OSD capacity, pool/PG counts,
+// PG state counts, object and byte counts) to the cluster perf counters.
+void MgrStatMonitor::update_logger()
+{
+  dout(20) << __func__ << dendl;
+
+  auto& logger = mon.cluster_logger;
+
+  // Raw capacity across all OSDs.
+  const auto& osd_statfs = digest.osd_sum.statfs;
+  logger->set(l_cluster_osd_bytes, osd_statfs.total);
+  logger->set(l_cluster_osd_bytes_used, osd_statfs.get_used_raw());
+  logger->set(l_cluster_osd_bytes_avail, osd_statfs.available);
+
+  // Pool count and total PG count (summed over pools).
+  logger->set(l_cluster_num_pool, digest.pg_pool_sum.size());
+  uint64_t total_pgs = 0;
+  for (const auto& per_pool : digest.num_pg_by_pool) {
+    total_pgs += per_pool.second;
+  }
+  logger->set(l_cluster_num_pg, total_pgs);
+
+  // PG counts by state bit; "active+clean" is only counted for PGs that
+  // are also active.
+  unsigned n_active = 0, n_active_clean = 0, n_peering = 0;
+  for (const auto& per_state : digest.num_pg_by_state) {
+    if (per_state.first & PG_STATE_PEERING) {
+      n_peering += per_state.second;
+    }
+    if (!(per_state.first & PG_STATE_ACTIVE)) {
+      continue;
+    }
+    n_active += per_state.second;
+    if (per_state.first & PG_STATE_CLEAN) {
+      n_active_clean += per_state.second;
+    }
+  }
+  logger->set(l_cluster_num_pg_active_clean, n_active_clean);
+  logger->set(l_cluster_num_pg_active, n_active);
+  logger->set(l_cluster_num_pg_peering, n_peering);
+
+  // Object and byte totals.
+  const auto& totals = digest.pg_sum.stats.sum;
+  logger->set(l_cluster_num_object, totals.num_objects);
+  logger->set(l_cluster_num_object_degraded, totals.num_objects_degraded);
+  logger->set(l_cluster_num_object_misplaced, totals.num_objects_misplaced);
+  logger->set(l_cluster_num_object_unfound, totals.num_objects_unfound);
+  logger->set(l_cluster_num_bytes, totals.num_bytes);
+}
+
+// Seed the pending state from the live state: copy the digest and health
+// checks, and re-encode the current service map for the quorum's features.
+void MgrStatMonitor::create_pending()
+{
+ dout(10) << " " << version << dendl;
+ pending_digest = digest;
+ pending_health_checks = get_health_checks();
+ pending_service_map_bl.clear();
+ encode(service_map, pending_service_map_bl, mon.get_quorum_con_features());
+}
+
+// Write the pending state into transaction `t` as the next version.
+// Blob layout: encoded digest, then the already-encoded service map bytes,
+// then the progress events -- the same order update_from_paxos() decodes.
+void MgrStatMonitor::encode_pending(MonitorDBStore::TransactionRef t)
+{
+ ++version;
+ dout(10) << " " << version << dendl;
+ bufferlist bl;
+ encode(pending_digest, bl, mon.get_quorum_con_features());
+ // The service map must already be encoded (create_initial(),
+ // create_pending() and prepare_report() all fill this buffer).
+ ceph_assert(pending_service_map_bl.length());
+ bl.append(pending_service_map_bl);
+ encode(pending_progress_events, bl);
+ put_version(t, version, bl);
+ put_last_committed(t, version);
+
+ encode_health(pending_health_checks, t);
+}
+
+// Trim target: keep only the last five versions; 0 until more than five
+// versions exist.
+version_t MgrStatMonitor::get_trim_to() const
+{
+  // we don't actually need *any* old states, but keep a few.
+  return (version > 5) ? version - 5 : 0;
+}
+
+// on_active() hook: refresh the cluster perf counters from the current digest.
+void MgrStatMonitor::on_active()
+{
+ update_logger();
+}
+
+// Intentionally empty: this service does no periodic work.
+void MgrStatMonitor::tick()
+{
+}
+
+// Route a message to its read-only handler by type (statfs, mgr report,
+// pool stats).  Unknown types are logged, marked as needing no reply and
+// reported as handled (true).
+bool MgrStatMonitor::preprocess_query(MonOpRequestRef op)
+{
+  auto m = op->get_req<PaxosServiceMessage>();
+  const auto type = m->get_type();
+
+  if (type == CEPH_MSG_STATFS) {
+    return preprocess_statfs(op);
+  }
+  if (type == MSG_MON_MGR_REPORT) {
+    return preprocess_report(op);
+  }
+  if (type == MSG_GETPOOLSTATS) {
+    return preprocess_getpoolstats(op);
+  }
+
+  mon.no_reply(op);
+  derr << "Unhandled message type " << type << dendl;
+  return true;
+}
+
+// Only mgr reports modify state here; any other message type is logged,
+// marked as needing no reply, and true is returned.
+bool MgrStatMonitor::prepare_update(MonOpRequestRef op)
+{
+  auto m = op->get_req<PaxosServiceMessage>();
+  const auto type = m->get_type();
+
+  if (type == MSG_MON_MGR_REPORT) {
+    return prepare_report(op);
+  }
+
+  mon.no_reply(op);
+  derr << "Unhandled message type " << type << dendl;
+  return true;
+}
+
+// Filter mgr reports: a report carrying a non-zero gid that differs from
+// the active mgr's gid is dropped (returns true); everything else goes on
+// to prepare_report() (returns false).  Reports never get a reply.
+bool MgrStatMonitor::preprocess_report(MonOpRequestRef op)
+{
+  auto m = op->get_req<MMonMgrReport>();
+  mon.no_reply(op);
+
+  // A zero gid is accepted without checking the active mgr.
+  if (!m->gid) {
+    return false;
+  }
+  if (m->gid == mon.mgrmon()->get_map().get_active_gid()) {
+    return false;
+  }
+
+  dout(10) << "ignoring report from non-active mgr " << m->gid
+           << dendl;
+  return true;
+}
+
+/**
+ * Absorb a mgr report into the pending state: decode the PG digest from
+ * the message payload and take over the health checks, the service map
+ * blob (only when the report carries one) and the progress events.
+ *
+ * The data is swapped out of the message rather than copied, so the
+ * message is left holding the previous pending values.
+ *
+ * @return always true
+ */
+bool MgrStatMonitor::prepare_report(MonOpRequestRef op)
+{
+  auto m = op->get_req<MMonMgrReport>();
+  bufferlist bl = m->get_data();
+  auto p = bl.cbegin();
+  decode(pending_digest, p);
+  pending_health_checks.swap(m->health_checks);
+  if (m->service_map_bl.length()) {
+    pending_service_map_bl.swap(m->service_map_bl);
+  }
+  pending_progress_events.swap(m->progress_events);
+  // Log the *pending* progress events we just received, not the committed
+  // progress_events map, so the count matches the rest of this line.
+  dout(10) << __func__ << " " << pending_digest << ", "
+           << pending_health_checks.checks.size() << " health checks, "
+           << pending_progress_events.size() << " progress events" << dendl;
+  // NOTE(review): the three debug dumps below each declare `jf`; this
+  // presumably relies on dout()/dendl opening and closing a scope (macros
+  // defined elsewhere) -- confirm before restructuring.
+  dout(20) << "pending_digest:\n";
+  JSONFormatter jf(true);
+  jf.open_object_section("pending_digest");
+  pending_digest.dump(&jf);
+  jf.close_section();
+  jf.flush(*_dout);
+  *_dout << dendl;
+  dout(20) << "health checks:\n";
+  JSONFormatter jf(true);
+  jf.open_object_section("health_checks");
+  pending_health_checks.dump(&jf);
+  jf.close_section();
+  jf.flush(*_dout);
+  *_dout << dendl;
+  dout(20) << "progress events:\n";
+  JSONFormatter jf(true);
+  jf.open_object_section("progress_events");
+  for (auto& i : pending_progress_events) {
+    jf.dump_object(i.first.c_str(), i.second);
+  }
+  jf.close_section();
+  jf.flush(*_dout);
+  *_dout << dendl;
+  return true;
+}
+
+/**
+ * Answer an MGetPoolStats request from the current digest.
+ *
+ * The request is silently dropped (no reply) when there is no session,
+ * the session lacks read capability on "pg", or the fsid does not match.
+ * Pools that are unknown to the osdmap or have no stats are simply left
+ * out of the reply.
+ *
+ * @return always true (fully handled here)
+ */
+bool MgrStatMonitor::preprocess_getpoolstats(MonOpRequestRef op)
+{
+  op->mark_pgmon_event(__func__);
+  auto req = op->get_req<MGetPoolStats>();
+  auto sess = op->get_session();
+  if (!sess) {
+    return true;
+  }
+  if (!sess->is_capable("pg", MON_CAP_R)) {
+    dout(0) << "MGetPoolStats received from entity with insufficient caps "
+            << sess->caps << dendl;
+    return true;
+  }
+  if (req->fsid != mon.monmap->fsid) {
+    dout(0) << __func__ << " on fsid "
+            << req->fsid << " != " << mon.monmap->fsid << dendl;
+    return true;
+  }
+
+  epoch_t committed = get_last_committed();
+  auto reply = new MGetPoolStatsReply(req->fsid, req->get_tid(), committed);
+  reply->per_pool = digest.use_per_pool_stats();
+  for (const auto& pool_name : req->pools) {
+    const auto pool_id = mon.osdmon()->osdmap.lookup_pg_pool_name(pool_name);
+    if (pool_id != -ENOENT) {
+      auto stats = get_pool_stat(pool_id);
+      if (stats) {
+        reply->pool_stats[pool_name] = *stats;
+      }
+    }
+  }
+  mon.send_reply(op, reply);
+  return true;
+}
+
+/**
+ * Answer an MStatfs request from the current digest.
+ *
+ * The request is dropped without a reply when there is no session, the
+ * session lacks read capability on "pg", the fsid does not match, or the
+ * request names a data pool that is not in the osdmap.
+ *
+ * @return always true (fully handled here)
+ */
+bool MgrStatMonitor::preprocess_statfs(MonOpRequestRef op)
+{
+  op->mark_pgmon_event(__func__);
+  auto req = op->get_req<MStatfs>();
+  auto sess = op->get_session();
+
+  if (!sess) {
+    return true;
+  }
+  if (!sess->is_capable("pg", MON_CAP_R)) {
+    dout(0) << "MStatfs received from entity with insufficient privileges "
+            << sess->caps << dendl;
+    return true;
+  }
+  if (req->fsid != mon.monmap->fsid) {
+    dout(0) << __func__ << " on fsid " << req->fsid
+            << " != " << mon.monmap->fsid << dendl;
+    return true;
+  }
+
+  const auto& data_pool = req->data_pool;
+  if (data_pool && !mon.osdmon()->osdmap.have_pg_pool(*data_pool)) {
+    // There's no error field for MStatfsReply so just ignore the request.
+    // This is known to happen when a client is still accessing a removed fs.
+    dout(1) << __func__ << " on removed pool " << *data_pool << dendl;
+    return true;
+  }
+
+  dout(10) << __func__ << " " << *req
+           << " from " << req->get_orig_source() << dendl;
+  epoch_t committed = get_last_committed();
+  auto reply = new MStatfsReply(req->fsid, req->get_tid(), committed);
+  reply->h.st = get_statfs(mon.osdmon()->osdmap, data_pool);
+  mon.send_reply(op, reply);
+  return true;
+}
+
+// Send the current service map to one subscriber if it is waiting for an
+// epoch we already have.  One-time subscriptions are removed afterwards;
+// persistent ones are advanced to the next epoch.
+void MgrStatMonitor::check_sub(Subscription *sub)
+{
+  dout(10) << __func__
+           << " next " << sub->next
+           << " vs service_map.epoch " << service_map.epoch << dendl;
+  if (sub->next > service_map.epoch) {
+    return;  // subscriber is already up to date
+  }
+
+  sub->session->con->send_message(new MServiceMap(service_map));
+
+  if (!sub->onetime) {
+    sub->next = service_map.epoch + 1;
+    return;
+  }
+  mon.with_session_map([sub](MonSessionMap& session_map) {
+    session_map.remove_sub(sub);
+  });
+}
+
+// Run check_sub() over every "servicemap" subscription.  Nothing happens
+// while the service map epoch is still 0 or nobody is subscribed.
+void MgrStatMonitor::check_subs()
+{
+  dout(10) << __func__ << dendl;
+  if (!service_map.epoch) {
+    return;
+  }
+  const auto found = mon.session_map.subs.find("servicemap");
+  if (found == mon.session_map.subs.end()) {
+    return;
+  }
+  for (auto it = found->second->begin(); !it.end(); ) {
+    auto sub = *it;
+    // Advance before the call: check_sub() may remove a one-time sub.
+    ++it;
+    check_sub(sub);
+  }
+}
diff --git a/src/mon/MgrStatMonitor.h b/src/mon/MgrStatMonitor.h
new file mode 100644
index 000000000..8f27a9878
--- /dev/null
+++ b/src/mon/MgrStatMonitor.h
@@ -0,0 +1,109 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "include/Context.h"
+#include "PaxosService.h"
+#include "mon/PGMap.h"
+#include "mgr/ServiceMap.h"
+
+// PaxosService that stores what the mgr reports to the monitors: the PG
+// map digest, the service map, health checks and progress events.  It also
+// answers statfs and pool-stats queries from the current digest.
+class MgrStatMonitor : public PaxosService {
+ // live version
+ version_t version = 0;
+ PGMapDigest digest;
+ ServiceMap service_map;
+ std::map<std::string,ProgressEvent> progress_events;
+
+ // pending commit
+ PGMapDigest pending_digest;
+ health_check_map_t pending_health_checks;
+ std::map<std::string,ProgressEvent> pending_progress_events;
+ // Service map kept in already-encoded form; encode_pending() appends
+ // these bytes as-is.
+ ceph::buffer::list pending_service_map_bl;
+
+public:
+ MgrStatMonitor(Monitor &mn, Paxos &p, const std::string& service_name);
+ ~MgrStatMonitor() override;
+
+ // No work is needed at init or shutdown.
+ void init() override {}
+ void on_shutdown() override {}
+
+ void create_initial() override;
+ void update_from_paxos(bool *need_bootstrap) override;
+ void create_pending() override;
+ void encode_pending(MonitorDBStore::TransactionRef t) override;
+ version_t get_trim_to() const override;
+
+ // Thin forwarder to the current digest.
+ bool definitely_converted_snapsets() const {
+ return digest.definitely_converted_snapsets();
+ }
+
+ bool preprocess_query(MonOpRequestRef op) override;
+ bool prepare_update(MonOpRequestRef op) override;
+
+ // Intentionally a no-op for this service.
+ void encode_full(MonitorDBStore::TransactionRef t) override { }
+
+ bool preprocess_report(MonOpRequestRef op);
+ bool prepare_report(MonOpRequestRef op);
+
+ bool preprocess_getpoolstats(MonOpRequestRef op);
+ bool preprocess_statfs(MonOpRequestRef op);
+
+ void check_sub(Subscription *sub);
+ void check_subs();
+ // NOTE(review): no definition of send_digests() is visible in the
+ // MgrStatMonitor.cc portion of this diff -- confirm it is still needed.
+ void send_digests();
+
+ void on_active() override;
+ void tick() override;
+
+ // Thin forwarder to the current digest.
+ uint64_t get_last_osd_stat_seq(int osd) {
+ return digest.get_last_osd_stat_seq(osd);
+ }
+
+ void update_logger();
+
+ const ServiceMap& get_service_map() const {
+ return service_map;
+ }
+
+ const std::map<std::string,ProgressEvent>& get_progress_events() {
+ return progress_events;
+ }
+
+ // pg stat access
+ // Returns a pointer into the digest's per-pool stats, or nullptr when
+ // the pool has no entry.
+ const pool_stat_t* get_pool_stat(int64_t poolid) const {
+ auto i = digest.pg_pool_sum.find(poolid);
+ if (i != digest.pg_pool_sum.end()) {
+ return &i->second;
+ }
+ return nullptr;
+ }
+
+ const PGMapDigest& get_digest() {
+ return digest;
+ }
+
+ // The remaining members forward to the current digest.
+ ceph_statfs get_statfs(OSDMap& osdmap,
+ std::optional<int64_t> data_pool) const {
+ return digest.get_statfs(osdmap, data_pool);
+ }
+
+ void print_summary(ceph::Formatter *f, std::ostream *out) const {
+ digest.print_summary(f, out);
+ }
+ // Dumps the digest, the service map and this service's first/last
+ // committed versions.
+ void dump_info(ceph::Formatter *f) const {
+ digest.dump(f);
+ f->dump_object("servicemap", get_service_map());
+ f->dump_unsigned("mgrstat_first_committed", get_first_committed());
+ f->dump_unsigned("mgrstat_last_committed", get_last_committed());
+ }
+ void dump_cluster_stats(std::stringstream *ss,
+ ceph::Formatter *f,
+ bool verbose) const {
+ digest.dump_cluster_stats(ss, f, verbose);
+ }
+ void dump_pool_stats(const OSDMap& osdm, std::stringstream *ss, ceph::Formatter *f,
+ bool verbose) const {
+ digest.dump_pool_stats_full(osdm, ss, f, verbose);
+ }
+};
diff --git a/src/mon/MonCap.cc b/src/mon/MonCap.cc
new file mode 100644
index 000000000..0d941f2c2
--- /dev/null
+++ b/src/mon/MonCap.cc
@@ -0,0 +1,751 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <boost/config/warning_disable.hpp>
+#include <boost/spirit/include/qi_uint.hpp>
+#include <boost/spirit/include/qi.hpp>
+#include <boost/fusion/include/std_pair.hpp>
+#include <boost/phoenix.hpp>
+#include <boost/fusion/adapted/struct/adapt_struct.hpp>
+#include <boost/fusion/include/adapt_struct.hpp>
+#include <boost/algorithm/string/predicate.hpp>
+
+#include "MonCap.h"
+#include "include/stringify.h"
+#include "include/ipaddr.h"
+#include "common/debug.h"
+#include "common/Formatter.h"
+
+#include <algorithm>
+#include <regex>
+
+#include "include/ceph_assert.h"
+
+#define dout_subsys ceph_subsys_mon
+
+#undef dout_prefix
+#define dout_prefix *_dout << "MonCap "
+
+using std::list;
+using std::map;
+using std::ostream;
+using std::pair;
+using std::string;
+using std::vector;
+
+using ceph::bufferlist;
+using ceph::Formatter;
+
+ // Predicate used by maybe_quote_string(): true when `c` is NOT one of the
+ // characters that may appear in an unquoted word (letters, digits, '-',
+ // '_').  Despite the name, a space is not in the allowed set, so ' ' also
+ // yields true.
+ static inline bool is_not_alnum_space(char c)
+ {
+   // The <cctype> classifiers require a value representable as unsigned
+   // char (or EOF); handing them a negative plain char is undefined
+   // behavior, so convert before classifying.
+   const unsigned char uc = static_cast<unsigned char>(c);
+   return !(isalpha(uc) || isdigit(uc) || (c == '-') || (c == '_'));
+ }
+
+ // Return `str` unchanged when every character is accepted by the
+ // unquoted-word predicate (is_not_alnum_space() is false for all of them);
+ // otherwise return it wrapped in double quotes.  Embedded quote characters
+ // are not escaped.
+ static std::string maybe_quote_string(const std::string& str)
+ {
+   const bool needs_quotes =
+     find_if(str.begin(), str.end(), is_not_alnum_space) != str.end();
+   if (!needs_quotes)
+     return str;
+   return string("\"") + str + string("\"");
+ }
+
+#define dout_subsys ceph_subsys_mon
+
+ // Print a permission mask: "*" for MON_CAP_ANY, otherwise the letters
+ // r, w, x (in that order) for each bit that is set.  An empty mask prints
+ // nothing.
+ ostream& operator<<(ostream& out, const mon_rwxa_t& p)
+ {
+   const __u8 bits = p;
+   if (bits == MON_CAP_ANY)
+     return out << "*";
+
+   const struct { __u8 flag; const char *letter; } letters[] = {
+     { MON_CAP_R, "r" },
+     { MON_CAP_W, "w" },
+     { MON_CAP_X, "x" },
+   };
+   for (const auto& entry : letters) {
+     if (bits & entry.flag)
+       out << entry.letter;
+   }
+   return out;
+ }
+
+ // Print a constraint as "<kind> <value>", where kind is "value", "prefix"
+ // or "regex".  Any other match type (e.g. MATCH_TYPE_NONE) prints nothing.
+ ostream& operator<<(ostream& out, const StringConstraint& c)
+ {
+   const char *label = nullptr;
+   switch (c.match_type) {
+   case StringConstraint::MATCH_TYPE_EQUAL:
+     label = "value ";
+     break;
+   case StringConstraint::MATCH_TYPE_PREFIX:
+     label = "prefix ";
+     break;
+   case StringConstraint::MATCH_TYPE_REGEX:
+     label = "regex ";
+     break;
+   default:
+     break;
+   }
+   if (label)
+     out << label << c.value;
+   return out;
+ }
+
+ // Print a grant in cap-string form: "allow", then whichever of the
+ // service / command (with its argument constraints) / profile / rwx mask /
+ // network parts are set, in that order.  Names and values are quoted only
+ // when maybe_quote_string() decides they need it.  fs_name is not printed.
+ ostream& operator<<(ostream& out, const MonCapGrant& m)
+ {
+   out << "allow";
+   if (!m.service.empty())
+     out << " service " << maybe_quote_string(m.service);
+
+   if (!m.command.empty()) {
+     out << " command " << maybe_quote_string(m.command);
+     if (!m.command_args.empty()) {
+       out << " with";
+       for (const auto& [arg_name, constraint] : m.command_args) {
+         // text placed between the argument name and its value; stays
+         // null for match types that are not printed at all
+         const char *op = nullptr;
+         switch (constraint.match_type) {
+         case StringConstraint::MATCH_TYPE_EQUAL:
+           op = "=";
+           break;
+         case StringConstraint::MATCH_TYPE_PREFIX:
+           op = " prefix ";
+           break;
+         case StringConstraint::MATCH_TYPE_REGEX:
+           op = " regex ";
+           break;
+         default:
+           break;
+         }
+         if (op) {
+           out << " " << maybe_quote_string(arg_name) << op
+               << maybe_quote_string(constraint.value);
+         }
+       }
+     }
+   }
+
+   if (!m.profile.empty())
+     out << " profile " << maybe_quote_string(m.profile);
+   if (m.allow != 0)
+     out << " " << m.allow;
+   if (!m.network.empty())
+     out << " network " << m.network;
+   return out;
+ }
+
+
+// <magic>
+// fusion lets us easily populate structs via the qi parser.
+
+typedef map<string,StringConstraint> kvmap;
+
+BOOST_FUSION_ADAPT_STRUCT(MonCapGrant,
+ (std::string, service)
+ (std::string, profile)
+ (std::string, command)
+ (kvmap, command_args)
+ (mon_rwxa_t, allow)
+ (std::string, network)
+ (std::string, fs_name))
+
+BOOST_FUSION_ADAPT_STRUCT(StringConstraint,
+ (StringConstraint::MatchType, match_type)
+ (std::string, value))
+
+// </magic>
+
+ // Parse the textual `network` restriction with the global ::parse_network()
+ // helper, filling network_parsed and network_prefix and recording the
+ // helper's result in network_valid.  Called for every grant by
+ // MonCap::parse() after a successful parse.
+ void MonCapGrant::parse_network()
+ {
+ network_valid = ::parse_network(network.c_str(), &network_parsed,
+ &network_prefix);
+ }
+
+ // Expand this grant's `profile` name into the explicit list of grants it
+ // stands for, storing them in the mutable profile_grants cache.
+ //
+ // The expansion happens at most once per grant object: if profile_grants
+ // is already non-empty the function returns immediately, so the `name`
+ // passed on the first call is the one baked into the daemon-private
+ // config-key prefix.  An unrecognized profile leaves the list empty.
+ //
+ // @param name  entity the cap belongs to; only used to build the
+ //              "daemon-private/<name>/" config-key prefix for the
+ //              osd/mds/mon/mgr profiles.
+ void MonCapGrant::expand_profile(const EntityName& name) const
+ {
+   // only generate this list once
+   if (!profile_grants.empty())
+     return;
+
+   if (profile == "read-only") {
+     // grants READ-ONLY caps monitor-wide
+     // 'auth' requires MON_CAP_X even for RO, which we do not grant here.
+     profile_grants.push_back(mon_rwxa_t(MON_CAP_R));
+     return;
+   }
+
+   if (profile == "read-write") {
+     // grants READ-WRITE caps monitor-wide
+     // 'auth' requires MON_CAP_X for all operations, which we do not grant.
+     profile_grants.push_back(mon_rwxa_t(MON_CAP_R | MON_CAP_W));
+     return;
+   }
+
+   if (profile == "mon") {
+     profile_grants.push_back(MonCapGrant("mon", MON_CAP_ALL));
+     profile_grants.push_back(MonCapGrant("log", MON_CAP_ALL));
+   }
+   if (profile == "osd") {
+     profile_grants.push_back(MonCapGrant("osd", MON_CAP_ALL));
+     profile_grants.push_back(MonCapGrant("mon", MON_CAP_R));
+     profile_grants.push_back(MonCapGrant("pg", MON_CAP_R | MON_CAP_W));
+     profile_grants.push_back(MonCapGrant("log", MON_CAP_W));
+     StringConstraint constraint(StringConstraint::MATCH_TYPE_REGEX,
+                                 string("osd_mclock_max_capacity_iops_(hdd|ssd)"));
+     profile_grants.push_back(MonCapGrant("config set", "name", constraint));
+     constraint = StringConstraint(StringConstraint::MATCH_TYPE_REGEX,
+                                   string("^(osd_max_backfills|") +
+                                   string("osd_recovery_max_active(.*)|") +
+                                   string("osd_mclock_scheduler_(.*))"));
+     profile_grants.push_back(MonCapGrant("config rm", "name", constraint));
+   }
+   if (profile == "mds") {
+     profile_grants.push_back(MonCapGrant("mds", MON_CAP_ALL));
+     profile_grants.push_back(MonCapGrant("mon", MON_CAP_R));
+     profile_grants.push_back(MonCapGrant("osd", MON_CAP_R));
+     // This command grant is checked explicitly in MRemoveSnaps handling
+     profile_grants.push_back(MonCapGrant("osd pool rmsnap"));
+     profile_grants.push_back(MonCapGrant("osd blocklist"));
+     profile_grants.push_back(MonCapGrant("osd blacklist")); // for compat
+     profile_grants.push_back(MonCapGrant("log", MON_CAP_W));
+   }
+   if (profile == "mgr") {
+     profile_grants.push_back(MonCapGrant("mgr", MON_CAP_ALL));
+     profile_grants.push_back(MonCapGrant("log", MON_CAP_R | MON_CAP_W));
+     profile_grants.push_back(MonCapGrant("mon", MON_CAP_R | MON_CAP_W));
+     profile_grants.push_back(MonCapGrant("mds", MON_CAP_R | MON_CAP_W));
+     profile_grants.push_back(MonCapGrant("fs", MON_CAP_R | MON_CAP_W));
+     profile_grants.push_back(MonCapGrant("osd", MON_CAP_R | MON_CAP_W));
+     profile_grants.push_back(MonCapGrant("auth", MON_CAP_R | MON_CAP_W | MON_CAP_X));
+     profile_grants.push_back(MonCapGrant("config-key", MON_CAP_R | MON_CAP_W));
+     profile_grants.push_back(MonCapGrant("config", MON_CAP_R | MON_CAP_W));
+     // cephadm orchestrator provisions new daemon keys and updates caps
+     profile_grants.push_back(MonCapGrant("auth get-or-create"));
+     profile_grants.push_back(MonCapGrant("auth caps"));
+     profile_grants.push_back(MonCapGrant("auth rm"));
+     // tell commands (this is a bit of a kludge)
+     profile_grants.push_back(MonCapGrant("smart"));
+     // allow the Telemetry module to gather heap and mempool metrics
+     profile_grants.push_back(MonCapGrant("heap"));
+     profile_grants.push_back(MonCapGrant("dump_mempools"));
+   }
+   if (profile == "osd" || profile == "mds" || profile == "mon" ||
+       profile == "mgr") {
+     // every daemon profile may manage config-key entries under its own
+     // "daemon-private/<entity name>/" prefix
+     StringConstraint constraint(StringConstraint::MATCH_TYPE_PREFIX,
+                                 string("daemon-private/") + stringify(name) +
+                                 string("/"));
+     profile_grants.push_back(MonCapGrant("config-key get", "key", constraint));
+     profile_grants.push_back(MonCapGrant("config-key put", "key", constraint));
+     profile_grants.push_back(MonCapGrant("config-key set", "key", constraint));
+     profile_grants.push_back(MonCapGrant("config-key exists", "key", constraint));
+     profile_grants.push_back(MonCapGrant("config-key delete", "key", constraint));
+   }
+   if (profile == "bootstrap-osd") {
+     profile_grants.push_back(MonCapGrant("mon", MON_CAP_R)); // read monmap
+     profile_grants.push_back(MonCapGrant("osd", MON_CAP_R)); // read osdmap
+     profile_grants.push_back(MonCapGrant("mon getmap"));
+     profile_grants.push_back(MonCapGrant("osd new"));
+     profile_grants.push_back(MonCapGrant("osd purge-new"));
+   }
+   if (profile == "bootstrap-mds") {
+     profile_grants.push_back(MonCapGrant("mon", MON_CAP_R)); // read monmap
+     profile_grants.push_back(MonCapGrant("osd", MON_CAP_R)); // read osdmap
+     profile_grants.push_back(MonCapGrant("mon getmap"));
+     profile_grants.push_back(MonCapGrant("auth get-or-create")); // FIXME: this can expose other mds keys
+     profile_grants.back().command_args["entity"] = StringConstraint(
+       StringConstraint::MATCH_TYPE_PREFIX, "mds.");
+     profile_grants.back().command_args["caps_mon"] = StringConstraint(
+       StringConstraint::MATCH_TYPE_EQUAL, "allow profile mds");
+     profile_grants.back().command_args["caps_osd"] = StringConstraint(
+       StringConstraint::MATCH_TYPE_EQUAL, "allow rwx");
+     profile_grants.back().command_args["caps_mds"] = StringConstraint(
+       StringConstraint::MATCH_TYPE_EQUAL, "allow");
+   }
+   if (profile == "bootstrap-mgr") {
+     profile_grants.push_back(MonCapGrant("mon", MON_CAP_R)); // read monmap
+     profile_grants.push_back(MonCapGrant("osd", MON_CAP_R)); // read osdmap
+     profile_grants.push_back(MonCapGrant("mon getmap"));
+     profile_grants.push_back(MonCapGrant("auth get-or-create")); // FIXME: this can expose other mgr keys
+     profile_grants.back().command_args["entity"] = StringConstraint(
+       StringConstraint::MATCH_TYPE_PREFIX, "mgr.");
+     profile_grants.back().command_args["caps_mon"] = StringConstraint(
+       StringConstraint::MATCH_TYPE_EQUAL, "allow profile mgr");
+   }
+   if (profile == "bootstrap-rgw") {
+     profile_grants.push_back(MonCapGrant("mon", MON_CAP_R)); // read monmap
+     profile_grants.push_back(MonCapGrant("osd", MON_CAP_R)); // read osdmap
+     profile_grants.push_back(MonCapGrant("mon getmap"));
+     profile_grants.push_back(MonCapGrant("auth get-or-create")); // FIXME: this can expose other rgw keys
+     profile_grants.back().command_args["entity"] = StringConstraint(
+       StringConstraint::MATCH_TYPE_PREFIX, "client.rgw.");
+     profile_grants.back().command_args["caps_mon"] = StringConstraint(
+       StringConstraint::MATCH_TYPE_EQUAL, "allow rw");
+     profile_grants.back().command_args["caps_osd"] = StringConstraint(
+       StringConstraint::MATCH_TYPE_EQUAL, "allow rwx");
+   }
+   if (profile == "bootstrap-rbd" || profile == "bootstrap-rbd-mirror") {
+     profile_grants.push_back(MonCapGrant("mon", MON_CAP_R)); // read monmap
+     profile_grants.push_back(MonCapGrant("auth get-or-create")); // FIXME: this can expose other rbd keys
+     profile_grants.back().command_args["entity"] = StringConstraint(
+       StringConstraint::MATCH_TYPE_PREFIX, "client.");
+     profile_grants.back().command_args["caps_mon"] = StringConstraint(
+       StringConstraint::MATCH_TYPE_EQUAL,
+       (profile == "bootstrap-rbd-mirror" ? "profile rbd-mirror" :
+                                            "profile rbd"));
+     profile_grants.back().command_args["caps_osd"] = StringConstraint(
+       StringConstraint::MATCH_TYPE_REGEX,
+       "^([ ,]*profile(=|[ ]+)['\"]?rbd[^ ,'\"]*['\"]?([ ]+pool(=|[ ]+)['\"]?[^,'\"]+['\"]?)?)+$");
+   }
+   if (profile == "fs-client") {
+     profile_grants.push_back(MonCapGrant("mon", MON_CAP_R));
+     profile_grants.push_back(MonCapGrant("mds", MON_CAP_R));
+     profile_grants.push_back(MonCapGrant("osd", MON_CAP_R));
+     profile_grants.push_back(MonCapGrant("pg", MON_CAP_R));
+   }
+   if (profile == "simple-rados-client") {
+     profile_grants.push_back(MonCapGrant("mon", MON_CAP_R));
+     profile_grants.push_back(MonCapGrant("osd", MON_CAP_R));
+     profile_grants.push_back(MonCapGrant("pg", MON_CAP_R));
+   }
+   if (profile == "simple-rados-client-with-blocklist") {
+     profile_grants.push_back(MonCapGrant("mon", MON_CAP_R));
+     profile_grants.push_back(MonCapGrant("osd", MON_CAP_R));
+     profile_grants.push_back(MonCapGrant("pg", MON_CAP_R));
+     profile_grants.push_back(MonCapGrant("osd blocklist"));
+     profile_grants.back().command_args["blocklistop"] = StringConstraint(
+       StringConstraint::MATCH_TYPE_EQUAL, "add");
+     profile_grants.back().command_args["addr"] = StringConstraint(
+       StringConstraint::MATCH_TYPE_REGEX, "^[^/]+/[0-9]+$");
+
+   }
+   // note: prefix test, so this also fires for "rbd-mirror",
+   // "rbd-mirror-peer" and any other profile name starting with "rbd"
+   if (boost::starts_with(profile, "rbd")) {
+     profile_grants.push_back(MonCapGrant("mon", MON_CAP_R));
+     profile_grants.push_back(MonCapGrant("osd", MON_CAP_R));
+     profile_grants.push_back(MonCapGrant("pg", MON_CAP_R));
+
+     // exclusive lock dead-client blocklisting (IP+nonce required)
+     profile_grants.push_back(MonCapGrant("osd blocklist"));
+     profile_grants.back().command_args["blocklistop"] = StringConstraint(
+       StringConstraint::MATCH_TYPE_EQUAL, "add");
+     profile_grants.back().command_args["addr"] = StringConstraint(
+       StringConstraint::MATCH_TYPE_REGEX, "^[^/]+/[0-9]+$");
+
+     // for compat,
+     profile_grants.push_back(MonCapGrant("osd blacklist"));
+     profile_grants.back().command_args["blacklistop"] = StringConstraint(
+       StringConstraint::MATCH_TYPE_EQUAL, "add");
+     profile_grants.back().command_args["addr"] = StringConstraint(
+       StringConstraint::MATCH_TYPE_REGEX, "^[^/]+/[0-9]+$");
+
+   }
+   if (profile == "rbd-mirror") {
+     StringConstraint constraint(StringConstraint::MATCH_TYPE_PREFIX,
+                                 "rbd/mirror/");
+     profile_grants.push_back(MonCapGrant("config-key get", "key", constraint));
+   } else if (profile == "rbd-mirror-peer") {
+     StringConstraint constraint(StringConstraint::MATCH_TYPE_REGEX,
+                                 "rbd/mirror/[^/]+");
+     profile_grants.push_back(MonCapGrant("config-key get", "key", constraint));
+
+     constraint = StringConstraint(StringConstraint::MATCH_TYPE_PREFIX,
+                                   "rbd/mirror/peer/");
+     profile_grants.push_back(MonCapGrant("config-key set", "key", constraint));
+   }
+   else if (profile == "crash") {
+     // TODO: we could limit this to getting the monmap and mgrmap...
+     profile_grants.push_back(MonCapGrant("mon", MON_CAP_R));
+   }
+   if (profile == "cephfs-mirror") {
+     profile_grants.push_back(MonCapGrant("mon", MON_CAP_R));
+     profile_grants.push_back(MonCapGrant("mds", MON_CAP_R));
+     profile_grants.push_back(MonCapGrant("osd", MON_CAP_R));
+     profile_grants.push_back(MonCapGrant("pg", MON_CAP_R));
+     StringConstraint constraint(StringConstraint::MATCH_TYPE_PREFIX,
+                                 "cephfs/mirror/peer/");
+     profile_grants.push_back(MonCapGrant("config-key get", "key", constraint));
+
+   }
+   if (profile == "role-definer") {
+     // grants ALL caps to the auth subsystem, read-only on the
+     // monitor subsystem and nothing else.
+     profile_grants.push_back(MonCapGrant("mon", MON_CAP_R));
+     profile_grants.push_back(MonCapGrant("auth", MON_CAP_ALL));
+   }
+ }
+
+ // Compute the permission bits this single grant contributes for a request
+ // on service `s` / command `c` with arguments `c_args`.
+ //
+ // The grant is interpreted by the first of these that is set:
+ //  - profile: the union of what each expanded profile grant allows;
+ //  - service: `allow` if the service name matches, else nothing;
+ //  - command: MON_CAP_ALL if the command name matches and every argument
+ //    constraint is satisfied, else nothing;
+ //  - otherwise a blanket grant: `allow`, except for the "config-key"
+ //    service, which always gets nothing here.
+ mon_rwxa_t MonCapGrant::get_allowed(CephContext *cct,
+                                     EntityName name,
+                                     const std::string& s, const std::string& c,
+                                     const map<string,string>& c_args) const
+ {
+   if (!profile.empty()) {
+     expand_profile(name);
+     mon_rwxa_t combined;
+     for (const auto& expanded : profile_grants)
+       combined = combined | expanded.get_allowed(cct, name, s, c, c_args);
+     return combined;
+   }
+
+   if (!service.empty()) {
+     if (service == s)
+       return allow;
+     return 0;
+   }
+
+   if (!command.empty()) {
+     if (command != c)
+       return 0;
+     for (const auto& [arg_name, constraint] : command_args) {
+       // argument must be present if a constraint exists
+       const auto given = c_args.find(arg_name);
+       if (given == c_args.end())
+         return 0;
+       const string& actual = given->second;
+
+       bool satisfied = true;
+       switch (constraint.match_type) {
+       case StringConstraint::MATCH_TYPE_EQUAL:
+         satisfied = (constraint.value == actual);
+         break;
+       case StringConstraint::MATCH_TYPE_PREFIX:
+         satisfied = (actual.find(constraint.value) == 0);
+         break;
+       case StringConstraint::MATCH_TYPE_REGEX:
+         // an invalid pattern is treated as "does not match"
+         try {
+           std::regex pattern(constraint.value, std::regex::extended);
+           satisfied = std::regex_match(actual, pattern);
+         } catch(const std::regex_error&) {
+           satisfied = false;
+         }
+         break;
+       default:
+         break;
+       }
+       if (!satisfied)
+         return 0;
+     }
+     return MON_CAP_ALL;
+   }
+
+   // we don't allow config-key service to be accessed with blanket caps other
+   // than '*' (i.e., 'any'), and that should have been checked by the caller
+   // via 'is_allow_all()'.
+   if (s == "config-key")
+     return 0;
+   return allow;
+ }
+
+ // Print every grant of the capability, separated by ", ".
+ ostream& operator<<(ostream&out, const MonCap& m)
+ {
+   bool first = true;
+   for (const auto& grant : m.grants) {
+     if (!first)
+       out << ", ";
+     first = false;
+     out << grant;
+   }
+   return out;
+ }
+
+ // True when at least one grant is an unrestricted "allow *" grant
+ // (as decided by MonCapGrant::is_allow_all()).
+ bool MonCap::is_allow_all() const
+ {
+   for (const auto& grant : grants) {
+     if (grant.is_allow_all())
+       return true;
+   }
+   return false;
+ }
+
+ // Replace whatever this capability held with the single grant "allow *",
+ // keeping the textual form in sync.
+ void MonCap::set_allow_all()
+ {
+   text = "allow *";
+   grants.assign(1, MonCapGrant(MON_CAP_ANY));
+ }
+
+ // Decide whether this capability permits an operation.
+ //
+ // Grants are examined in order and their permission bits accumulated.
+ // A grant carrying a network restriction is skipped unless the restriction
+ // parsed successfully and contains `addr`.  The answer is true as soon as
+ // an "allow all" grant is seen, or as soon as the accumulated bits cover
+ // every access kind the operation may need; false if the grants run out
+ // first.  Logging happens only when `cct` is non-null.
+ bool MonCap::is_capable(
+   CephContext *cct,
+   EntityName name,
+   const string& service,
+   const string& command, const map<string,string>& command_args,
+   bool op_may_read, bool op_may_write, bool op_may_exec,
+   const entity_addr_t& addr) const
+ {
+   if (cct)
+     ldout(cct, 20) << "is_capable service=" << service << " command=" << command
+                    << (op_may_read ? " read":"")
+                    << (op_may_write ? " write":"")
+                    << (op_may_exec ? " exec":"")
+                    << " addr " << addr
+                    << " on cap " << *this
+                    << dendl;
+
+   mon_rwxa_t allow = 0;
+   for (const auto& grant : grants) {
+     if (cct)
+       ldout(cct, 20) << " allow so far " << allow << ", doing grant " << grant
+                      << dendl;
+
+     if (!grant.network.empty()) {
+       const bool in_network =
+         grant.network_valid &&
+         network_contains(grant.network_parsed, grant.network_prefix, addr);
+       if (!in_network)
+         continue;
+     }
+
+     if (grant.is_allow_all()) {
+       if (cct)
+         ldout(cct, 20) << " allow all" << dendl;
+       return true;
+     }
+
+     // check enumerated caps
+     allow = allow | grant.get_allowed(cct, name, service, command, command_args);
+     const bool read_ok = !op_may_read || (allow & MON_CAP_R);
+     const bool write_ok = !op_may_write || (allow & MON_CAP_W);
+     const bool exec_ok = !op_may_exec || (allow & MON_CAP_X);
+     if (read_ok && write_ok && exec_ok) {
+       if (cct)
+         ldout(cct, 20) << " match" << dendl;
+       return true;
+     }
+   }
+   return false;
+ }
+
+ // Serialize the capability.  Only the textual form (`text`) is written;
+ // the parsed grants are not encoded (decode() re-parses the text).
+ void MonCap::encode(bufferlist& bl) const
+ {
+ ENCODE_START(4, 4, bl); // legacy MonCaps was 3, 3
+ encode(text, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ // Deserialize a capability: read the encoded text, then rebuild the
+ // grants by parsing it.  No error stream is supplied and the parse result
+ // is not checked here.
+ void MonCap::decode(bufferlist::const_iterator& bl)
+ {
+   std::string encoded_text;
+   DECODE_START(4, bl);
+   decode(encoded_text, bl);
+   DECODE_FINISH(bl);
+   parse(encoded_text, nullptr);
+ }
+
+ // Emit the capability's textual form as the string field "text".
+ void MonCap::dump(Formatter *f) const
+ {
+ f->dump_string("text", text);
+ }
+
+ // Append sample MonCap objects to `ls`: one default-constructed instance
+ // followed by one instance per sample cap string below.  The objects are
+ // allocated with new and handed to the list as raw pointers.
+ void MonCap::generate_test_instances(list<MonCap*>& ls)
+ {
+   // first instance is left unparsed
+   ls.push_back(new MonCap);
+
+   const char *const cap_texts[] = {
+     "allow *",
+     "allow rwx",
+     "allow service foo x",
+     "allow command bar x",
+     "allow service foo r, allow command bar x",
+     "allow command bar with k1=v1 x",
+     "allow command bar with k1=v1 k2=v2 x",
+   };
+   for (const char *cap_text : cap_texts) {
+     ls.push_back(new MonCap);
+     ls.back()->parse(cap_text);
+   }
+ }
+
+// grammar
+namespace qi = boost::spirit::qi;
+namespace ascii = boost::spirit::ascii;
+namespace phoenix = boost::phoenix;
+
+
+ // Boost.Spirit Qi grammar for the textual mon capability syntax; its start
+ // rule (`moncap`) produces a MonCap built from the parsed list of grants.
+ //
+ // Each *_match rule synthesizes a MonCapGrant through the fusion adaptation
+ // declared earlier in this file, so every rule must yield its attributes in
+ // the adapted field order: service, profile, command, command_args, allow,
+ // network, fs_name.  The qi::attr(...) terms are placeholders that supply a
+ // fixed value for the fields a given form does not parse from the input.
+ template <typename Iterator>
+ struct MonCapParser : qi::grammar<Iterator, MonCap()>
+ {
+ MonCapParser() : MonCapParser::base_type(moncap)
+ {
+ using qi::char_;
+ using qi::int_;
+ using qi::ulong_long;
+ using qi::lexeme;
+ using qi::alnum;
+ using qi::_val;
+ using qi::_1;
+ using qi::_2;
+ using qi::_3;
+ using qi::eps;
+ using qi::lit;
+
+ // a string is either quoted (single or double quotes, non-empty, no
+ // escaping of the quote character) or a bare word from a restricted set
+ quoted_string %=
+ lexeme['"' >> +(char_ - '"') >> '"'] |
+ lexeme['\'' >> +(char_ - '\'') >> '\''];
+ unquoted_word %= +char_("a-zA-Z0-9_./-");
+ str %= quoted_string | unquoted_word;
+ network_str %= +char_("/.:a-fA-F0-9][");
+ fs_name_str %= +char_("a-zA-Z0-9_.-");
+
+ // one or more of space, newline, tab
+ spaces = +(lit(' ') | lit('\n') | lit('\t'));
+
+ // command := command[=]cmd [k1=v1 k2=v2 ...]
+ // each constraint rule yields (match type, value) for a StringConstraint
+ str_match = '=' >> qi::attr(StringConstraint::MATCH_TYPE_EQUAL) >> str;
+ str_prefix = spaces >> lit("prefix") >> spaces >>
+ qi::attr(StringConstraint::MATCH_TYPE_PREFIX) >> str;
+ str_regex = spaces >> lit("regex") >> spaces >>
+ qi::attr(StringConstraint::MATCH_TYPE_REGEX) >> str;
+ kv_pair = str >> (str_match | str_prefix | str_regex);
+ kv_map %= kv_pair >> *(spaces >> kv_pair);
+ // service and profile are empty placeholders; allow is fixed to 0
+ command_match = -spaces >> lit("allow") >> spaces >> lit("command") >> (lit('=') | spaces)
+ >> qi::attr(string()) >> qi::attr(string())
+ >> str
+ >> -(spaces >> lit("with") >> spaces >> kv_map)
+ >> qi::attr(0)
+ >> -(spaces >> lit("network") >> spaces >> network_str);
+
+ // service foo rwxa
+ service_match %= -spaces >> lit("allow") >> spaces >> lit("service") >> (lit('=') | spaces)
+ >> str >> qi::attr(string()) >> qi::attr(string())
+ >> qi::attr(map<string,StringConstraint>())
+ >> spaces >> rwxa
+ >> -(spaces >> lit("network") >> spaces >> network_str);
+
+ // profile foo
+ // the leading "allow" is optional for this form
+ profile_match %= -spaces >> -(lit("allow") >> spaces)
+ >> lit("profile") >> (lit('=') | spaces)
+ >> qi::attr(string())
+ >> str
+ >> qi::attr(string())
+ >> qi::attr(map<string,StringConstraint>())
+ >> qi::attr(0)
+ >> -(spaces >> lit("network") >> spaces >> network_str);
+
+ // rwxa
+ // the only form that accepts an optional trailing "fsname" restriction
+ rwxa_match %= -spaces >> lit("allow") >> spaces
+ >> qi::attr(string()) >> qi::attr(string()) >> qi::attr(string())
+ >> qi::attr(map<string,StringConstraint>())
+ >> rwxa
+ >> -(spaces >> lit("network") >> spaces >> network_str)
+ >> -(spaces >> lit("fsname") >> (lit('=') | spaces) >> fs_name_str);
+
+ // rwxa := * | [r][w][x]
+ // "all" is accepted as a synonym for "*"
+ rwxa =
+ (lit("*")[_val = MON_CAP_ANY]) |
+ (lit("all")[_val = MON_CAP_ANY]) |
+ ( eps[_val = 0] >>
+ ( lit('r')[_val |= MON_CAP_R] ||
+ lit('w')[_val |= MON_CAP_W] ||
+ lit('x')[_val |= MON_CAP_X]
+ )
+ );
+
+ // grant := allow ...
+ // alternatives are tried in this order
+ grant = -spaces >> (rwxa_match | profile_match | service_match | command_match) >> -spaces;
+
+ // moncap := grant [grant ...]
+ // grants are separated by ';' or ',' with optional surrounding spaces
+ grants %= (grant % (*lit(' ') >> (lit(';') | lit(',')) >> *lit(' ')));
+ moncap = grants [_val = phoenix::construct<MonCap>(_1)];
+
+ }
+ qi::rule<Iterator> spaces;
+ qi::rule<Iterator, unsigned()> rwxa;
+ qi::rule<Iterator, string()> quoted_string;
+ qi::rule<Iterator, string()> unquoted_word;
+ qi::rule<Iterator, string()> str, network_str;
+ qi::rule<Iterator, string()> fs_name_str;
+
+ qi::rule<Iterator, StringConstraint()> str_match, str_prefix, str_regex;
+ qi::rule<Iterator, pair<string, StringConstraint>()> kv_pair;
+ qi::rule<Iterator, map<string, StringConstraint>()> kv_map;
+
+ qi::rule<Iterator, MonCapGrant()> rwxa_match;
+ qi::rule<Iterator, MonCapGrant()> command_match;
+ qi::rule<Iterator, MonCapGrant()> service_match;
+ qi::rule<Iterator, MonCapGrant()> profile_match;
+ qi::rule<Iterator, MonCapGrant()> grant;
+ qi::rule<Iterator, std::vector<MonCapGrant>()> grants;
+ qi::rule<Iterator, MonCap()> moncap;
+ };
+
+ // Parse a capability string into this object.
+ //
+ // Succeeds only when the grammar matches AND the whole input was consumed;
+ // then `text` is set to `str`, each grant's network restriction is parsed,
+ // and true is returned.  On failure all grants are dropped, a description
+ // of where parsing stopped is written to `err` (if non-null), and false is
+ // returned; `text` is left as it was.
+ bool MonCap::parse(const string& str, ostream *err)
+ {
+   auto pos = str.begin();
+   auto last = str.end();
+
+   MonCapParser<string::const_iterator> grammar;
+   const bool matched = qi::parse(pos, last, grammar, *this);
+
+   if (!matched || pos != last) {
+     // Make sure no grants are kept after parsing failed!
+     grants.clear();
+
+     if (err) {
+       if (pos != last)
+         *err << "mon capability parse failed, stopped at '"
+              << std::string(pos, last)
+              << "' of '" << str << "'";
+       else
+         *err << "mon capability parse failed, stopped at end of '" << str << "'";
+     }
+     return false;
+   }
+
+   text = str;
+   for (auto& g : grants) {
+     g.parse_network();
+   }
+   return true;
+ }
+
+ // Merge a single-grant capability into this one, matching on fs_name.
+ //
+ // `newcap` must hold exactly one grant (asserted).  If an existing grant
+ // has the same fs_name, its permission mask is replaced when it differs.
+ // Otherwise a new (allow, fs_name) grant is appended.  Returns true when
+ // the grants changed, false when nothing needed updating.  `text` is not
+ // touched.
+ bool MonCap::merge(MonCap newcap)
+ {
+   ceph_assert(newcap.grants.size() == 1);
+   const auto& incoming = newcap.grants[0];
+
+   /* TODO: check case where cap is "allow rw *". */
+   for (auto& existing : grants) {
+     if (existing.fs_name != incoming.fs_name)
+       continue;
+
+     // no update required; maintain idempotency.
+     if (existing.allow == incoming.allow)
+       return false;
+
+     // cap for given fs name is present, let's update it.
+     existing.allow = incoming.allow;
+     return true;
+   }
+
+   // cap for given fs name is absent, let's add a new cap for it.
+   grants.push_back(MonCapGrant(incoming.allow, incoming.fs_name));
+   return true;
+ }
+
+ // Render the rwx mask and fs_name of this grant as cap text, e.g.
+ // "allow rw fsname=foo".  Service, profile, command and network are not
+ // included.
+ //
+ // The mask is rendered the same way operator<<(mon_rwxa_t) does it: "*"
+ // for MON_CAP_ANY, otherwise every set bit contributes its letter.
+ string MonCapGrant::to_string()
+ {
+   string str = "allow ";
+
+   if (allow == MON_CAP_ANY) {
+     // must be tested before the individual bits: MON_CAP_ANY has every
+     // bit set, so it would otherwise be rendered as plain "rwx"
+     str += "*";
+   } else {
+     // independent tests so that combined masks such as "rw" keep all of
+     // their letters
+     if (allow & MON_CAP_R)
+       str += "r";
+     if (allow & MON_CAP_W)
+       str += "w";
+     if (allow & MON_CAP_X)
+       str += "x";
+   }
+
+   if (not fs_name.empty()) {
+     str += " fsname=" + fs_name;
+   }
+
+   return str;
+ }
+
+ // Join the to_string() form of every grant with ", ".  Returns an empty
+ // string when there are no grants.
+ string MonCap::to_string()
+ {
+   string joined;
+   bool first = true;
+
+   for (auto& grant : grants) {
+     if (!first)
+       joined += ", ";
+     first = false;
+     joined += grant.to_string();
+   }
+
+   return joined;
+ }
diff --git a/src/mon/MonCap.h b/src/mon/MonCap.h
new file mode 100644
index 000000000..45063dbed
--- /dev/null
+++ b/src/mon/MonCap.h
@@ -0,0 +1,231 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_MONCAP_H
+#define CEPH_MONCAP_H
+
+#include <ostream>
+
+#include "include/common_fwd.h"
+#include "include/types.h"
+#include "common/entity_name.h"
+#include "mds/mdstypes.h"
+
+static const __u8 MON_CAP_R = (1 << 1); // read
+static const __u8 MON_CAP_W = (1 << 2); // write
+static const __u8 MON_CAP_X = (1 << 3); // execute
+static const __u8 MON_CAP_ALL = MON_CAP_R | MON_CAP_W | MON_CAP_X;
+static const __u8 MON_CAP_ANY = 0xff; // *
+
+ // Thin wrapper around a __u8 permission mask built from the MON_CAP_* bits.
+ // It converts implicitly from and to __u8, so it can be used directly in
+ // bitwise and comparison expressions; a default-constructed value is 0.
+ struct mon_rwxa_t {
+ __u8 val;
+
+ // cppcheck-suppress noExplicitConstructor
+ mon_rwxa_t(__u8 v = 0) : val(v) {}
+ mon_rwxa_t& operator=(__u8 v) {
+ val = v;
+ return *this;
+ }
+ operator __u8() const {
+ return val;
+ }
+ };
+
+std::ostream& operator<<(std::ostream& out, const mon_rwxa_t& p);
+
+ // A constraint on a single command-argument value: how the argument is
+ // compared (match_type) and what it is compared against (value).  A
+ // default-constructed constraint has MATCH_TYPE_NONE and an empty value.
+ struct StringConstraint {
+   enum MatchType {
+     MATCH_TYPE_NONE,
+     MATCH_TYPE_EQUAL,
+     MATCH_TYPE_PREFIX,
+     MATCH_TYPE_REGEX
+   };
+
+   MatchType match_type = MATCH_TYPE_NONE;
+   std::string value;
+
+   StringConstraint() {}
+   // `value` is taken by value as a sink parameter and moved into the
+   // member, so callers passing a temporary pay for no extra copy.
+   StringConstraint(MatchType match_type, std::string value)
+     : match_type(match_type), value(std::move(value)) {
+   }
+ };
+
+std::ostream& operator<<(std::ostream& out, const StringConstraint& c);
+
+ struct MonCapGrant {
+   /*
+    * A grant can come in one of five forms:
+    *
+    *  - a blanket allow ('allow rw', 'allow *')
+    *    - this will match against any service and the read/write/exec flags
+    *      in the mon code.  semantics of what X means are somewhat ad hoc.
+    *
+    *  - a service allow ('allow service mds rw')
+    *    - this will match against a specific service and the r/w/x flags.
+    *
+    *  - a profile ('allow profile osd')
+    *    - this will match against specific monitor-enforced semantics of what
+    *      this type of user should need to do.  examples include 'osd', 'mds',
+    *      'bootstrap-osd'.
+    *
+    *  - a command ('allow command foo', 'allow command bar with arg1=val1 arg2 prefix val2')
+    *      this includes the command name (the prefix string), and a set
+    *      of key/value pairs that constrain use of that command.  if no pairs
+    *      are specified, any arguments are allowed; if a pair is specified, that
+    *      argument must be present and equal the value, start with the given
+    *      prefix, or match the given regex.
+    *
+    *  - an fs name restriction on a blanket allow ('allow rw fsname=foo')
+    *    - this will restrict access to MDSMaps in the FSMap to the provided
+    *      fs name.
+    */
+   std::string service;
+   std::string profile;
+   std::string command;
+   std::map<std::string, StringConstraint> command_args;
+   std::string fs_name;
+
+   // restrict by network
+   std::string network;
+
+   // these are filled in by parse_network(), called by MonCap::parse()
+   entity_addr_t network_parsed;
+   unsigned network_prefix = 0;
+   bool network_valid = true;
+
+   void parse_network();
+
+   mon_rwxa_t allow;
+
+   // explicit grants that a profile grant expands to; populated as
+   // needed by expand_profile() (called from get_allowed() and
+   // MonCap::fs_name_capable()) and cached here.
+   mutable std::list<MonCapGrant> profile_grants;
+
+   void expand_profile(const EntityName& name) const;
+
+   MonCapGrant() : allow(0) {}
+   // blanket grant with the given permission mask
+   // cppcheck-suppress noExplicitConstructor
+   MonCapGrant(mon_rwxa_t a) : allow(a) {}
+   // service grant
+   MonCapGrant(std::string s, mon_rwxa_t a) : service(std::move(s)), allow(a) {}
+   // command grant without argument constraints
+   // cppcheck-suppress noExplicitConstructor
+   MonCapGrant(std::string c) : command(std::move(c)) {}
+   // command grant with a single constraint `co` on argument `a`; the
+   // by-value parameters are moved into place rather than copied again
+   MonCapGrant(std::string c, std::string a, StringConstraint co) : command(std::move(c)) {
+     command_args[std::move(a)] = std::move(co);
+   }
+   // blanket grant restricted to one file system name
+   MonCapGrant(mon_rwxa_t a, std::string fsname) : fs_name(std::move(fsname)), allow(a) {}
+
+   /**
+    * check if given request parameters match our constraints
+    *
+    * @param cct context
+    * @param name entity name
+    * @param service service (if any)
+    * @param command command (if any)
+    * @param command_args command args (if any)
+    * @return bits we allow
+    */
+   mon_rwxa_t get_allowed(CephContext *cct,
+                          EntityName name,
+                          const std::string& service,
+                          const std::string& command,
+                          const std::map<std::string, std::string>& command_args) const;
+
+   // true only for an unrestricted "allow *": the mask is MON_CAP_ANY and no
+   // service, profile, command or fs name narrows it.  a network
+   // restriction is not considered here.
+   bool is_allow_all() const {
+     return
+       allow == MON_CAP_ANY &&
+       service.length() == 0 &&
+       profile.length() == 0 &&
+       command.length() == 0 &&
+       fs_name.empty();
+   }
+
+   std::string to_string();
+ };
+
+std::ostream& operator<<(std::ostream& out, const MonCapGrant& g);
+
+ // A monitor capability: the original cap text plus the grants parsed
+ // from it.
+ struct MonCap {
+   std::string text;
+   std::vector<MonCapGrant> grants;
+
+   MonCap() {}
+   explicit MonCap(const std::vector<MonCapGrant> &g) : grants(g) {}
+
+   // the stored cap text (a copy)
+   std::string get_str() const {
+     return text;
+   }
+   std::string to_string();
+
+   bool is_allow_all() const;
+   void set_allow_all();
+   bool parse(const std::string& str, std::ostream *err=NULL);
+   bool merge(MonCap newcap);
+
+   /**
+    * check if we are capable of something
+    *
+    * This method actually checks a description of a particular operation against
+    * what the capability has specified.
+    *
+    * @param service service name
+    * @param command command id
+    * @param command_args
+    * @param op_may_read whether the operation may need to read
+    * @param op_may_write whether the operation may need to write
+    * @param op_may_exec whether the operation may exec
+    * @return true if the operation is allowed, false otherwise
+    */
+   bool is_capable(CephContext *cct,
+                   EntityName name,
+                   const std::string& service,
+                   const std::string& command,
+                   const std::map<std::string, std::string>& command_args,
+                   bool op_may_read, bool op_may_write, bool op_may_exec,
+                   const entity_addr_t& addr) const;
+
+   void encode(ceph::buffer::list& bl) const;
+   void decode(ceph::buffer::list::const_iterator& bl);
+   void dump(ceph::Formatter *f) const;
+   static void generate_test_instances(std::list<MonCap*>& ls);
+
+   // Collect the fs names the grants are restricted to.  As soon as any
+   // grant carries no fs name the result is an empty list.
+   std::vector<std::string> allowed_fs_names() const {
+     std::vector<std::string> names;
+     for (const auto& grant : grants) {
+       if (grant.fs_name.empty())
+         return {};
+       names.push_back(grant.fs_name);
+     }
+     return names;
+   }
+
+   // True when some grant gives at least one of the `mask` bits for
+   // `fs_name`: an "allow all" grant, a grant whose own mask overlaps and
+   // whose fs_name is empty or equal, or - after expanding the grant's
+   // profile - an "fs" or "mds" service grant meeting the same conditions.
+   bool fs_name_capable(const EntityName& ename, std::string_view fs_name,
+                        __u8 mask) {
+     // does the grant's (optional) fs restriction admit the requested fs?
+     const auto covers_fs = [fs_name](const MonCapGrant& grant) {
+       return grant.fs_name.empty() || grant.fs_name == fs_name;
+     };
+
+     for (auto& g : grants) {
+       if (g.is_allow_all())
+         return true;
+
+       if (covers_fs(g) && (mask & g.allow))
+         return true;
+
+       g.expand_profile(ename);
+       for (auto& pg : g.profile_grants) {
+         const bool fs_service = pg.service == "fs" || pg.service == "mds";
+         if (fs_service && covers_fs(pg) && (pg.allow & mask))
+           return true;
+       }
+     }
+
+     return false;
+   }
+
+ };
+WRITE_CLASS_ENCODER(MonCap)
+
+std::ostream& operator<<(std::ostream& out, const MonCap& cap);
+
+#endif
diff --git a/src/mon/MonClient.cc b/src/mon/MonClient.cc
new file mode 100644
index 000000000..0667c078b
--- /dev/null
+++ b/src/mon/MonClient.cc
@@ -0,0 +1,2070 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <algorithm>
+#include <iterator>
+#include <random>
+
+#include <boost/asio/post.hpp>
+#include <boost/range/adaptor/map.hpp>
+#include <boost/range/adaptor/filtered.hpp>
+#include <boost/range/algorithm/copy.hpp>
+#include <boost/range/algorithm_ext/copy_n.hpp>
+#include "common/weighted_shuffle.h"
+
+#include "include/random.h"
+#include "include/scope_guard.h"
+#include "include/stringify.h"
+
+#include "messages/MMonGetMap.h"
+#include "messages/MMonGetVersion.h"
+#include "messages/MMonGetMap.h"
+#include "messages/MMonGetVersionReply.h"
+#include "messages/MMonMap.h"
+#include "messages/MConfig.h"
+#include "messages/MAuth.h"
+#include "messages/MLogAck.h"
+#include "messages/MAuthReply.h"
+#include "messages/MMonCommand.h"
+#include "messages/MMonCommandAck.h"
+#include "messages/MCommand.h"
+#include "messages/MCommandReply.h"
+#include "messages/MPing.h"
+
+#include "messages/MMonSubscribe.h"
+#include "messages/MMonSubscribeAck.h"
+#include "common/errno.h"
+#include "common/hostname.h"
+#include "common/LogClient.h"
+
+#include "MonClient.h"
+#include "error_code.h"
+#include "MonMap.h"
+
+#include "auth/Auth.h"
+#include "auth/KeyRing.h"
+#include "auth/AuthClientHandler.h"
+#include "auth/AuthRegistry.h"
+#include "auth/RotatingKeyRing.h"
+
+#define dout_subsys ceph_subsys_monc
+#undef dout_prefix
+#define dout_prefix *_dout << "monclient" << (_hunting() ? "(hunting)":"") << ": "
+
+namespace bs = boost::system;
+using std::string;
+using namespace std::literals;
+
+/**
+ * Construct an idle MonClient.
+ *
+ * No messenger or log client is attached yet and nothing is initialized;
+ * the reconnect back-off multiplier starts at the configured
+ * "mon_client_hunt_interval_min_multiple".
+ *
+ * @param cct_ Ceph context handed to the Dispatcher/AuthServer bases and
+ *             the timer
+ * @param service asio io_context stored in the `service` member
+ */
+MonClient::MonClient(CephContext *cct_, boost::asio::io_context& service) :
+  Dispatcher(cct_),
+  AuthServer(cct_),
+  messenger(nullptr),
+  timer(cct_, monc_lock),
+  service(service),
+  initialized(false),
+  log_client(nullptr),
+  more_log_pending(false),
+  want_monmap(true),
+  had_a_connection(false),
+  reopen_interval_multiplier(
+    cct_->_conf.get_val<double>("mon_client_hunt_interval_min_multiple")),
+  last_mon_command_tid(0),
+  version_req_id(0)
+{}
+
+// Nothing to release explicitly; members clean up after themselves.
+MonClient::~MonClient() = default;
+
+/**
+ * Build the initial monmap via MonMap::build_initial() and log the result.
+ *
+ * Errors from build_initial() are reported on std::cerr (passed as its
+ * output stream).
+ *
+ * @return whatever monmap.build_initial() returned
+ */
+int MonClient::build_initial_monmap()
+{
+ ldout(cct, 10) << __func__ << dendl;
+ int r = monmap.build_initial(cct, false, std::cerr);
+ // one debug entry spread over three statements: opened by ldout, the map
+ // is printed into *_dout, and dendl finishes it
+ ldout(cct,10) << "monmap:\n";
+ monmap.print(*_dout);
+ *_dout << dendl;
+ return r;
+}
+
+/**
+ * Subscribe to the monmap and block until one has been received.
+ *
+ * Opens a session first if none is open or being hunted for. Waits on
+ * map_cond (without a timeout) until want_monmap is cleared.
+ *
+ * @return always 0
+ */
+int MonClient::get_monmap()
+{
+ ldout(cct, 10) << __func__ << dendl;
+ std::unique_lock l(monc_lock);
+
+ sub.want("monmap", 0, 0);
+ if (!_opened())
+ _reopen_session();
+ // handle_monmap() clears want_monmap and notifies map_cond
+ map_cond.wait(l, [this] { return !want_monmap; });
+ ldout(cct, 10) << __func__ << " done" << dendl;
+ return 0;
+}
+
+/**
+ * Bootstrap helper: fetch the monmap and the config from a monitor through
+ * a temporary client messenger, and apply the config to cct.
+ *
+ * Must be called before any messenger is attached (asserted). Makes up to
+ * 10 attempts; each attempt runs init() and authenticate(), then waits for
+ * both a monmap with a non-zero epoch and an MConfig message.
+ *
+ * @return 0 on success, or when the monitors lack FEATURE_MIMIC and thus
+ * have no config to fetch; otherwise the negative error from
+ * build_initial_monmap(), init() or authenticate(), or -ETIMEDOUT
+ * when the last attempt timed out waiting for the config
+ */
+int MonClient::get_monmap_and_config()
+{
+ ldout(cct, 10) << __func__ << dendl;
+ ceph_assert(!messenger);
+
+ int tries = 10;
+
+ // crypto is only needed for the duration of this call
+ cct->init_crypto();
+ auto shutdown_crypto = make_scope_guard([this] {
+ cct->shutdown_crypto();
+ });
+
+ int r = build_initial_monmap();
+ if (r < 0) {
+ lderr(cct) << __func__ << " cannot identify monitors to contact" << dendl;
+ return r;
+ }
+
+ messenger = Messenger::create_client_messenger(
+ cct, "temp_mon_client");
+ ceph_assert(messenger);
+ messenger->add_dispatcher_head(this, Dispatcher::PRIORITY_HIGH);
+ messenger->start();
+ // on exit: tear down the temporary messenger and, if we learned an fsid,
+ // record it in the config
+ auto shutdown_msgr = make_scope_guard([this] {
+ messenger->shutdown();
+ messenger->wait();
+ delete messenger;
+ messenger = nullptr;
+ if (!monmap.fsid.is_zero()) {
+ cct->_conf.set_val("fsid", stringify(monmap.fsid));
+ }
+ });
+
+ // while set, handle_config() stashes the MConfig in bootstrap_config
+ // instead of applying it asynchronously
+ want_bootstrap_config = true;
+ auto shutdown_config = make_scope_guard([this] {
+ std::unique_lock l(monc_lock);
+ want_bootstrap_config = false;
+ bootstrap_config.reset();
+ });
+
+ ceph::ref_t<MConfig> config;
+ while (tries-- > 0) {
+ r = init();
+ if (r < 0) {
+ // NOTE(review): this path returns without calling shutdown()
+ return r;
+ }
+ r = authenticate(
+ cct->_conf.get_val<std::chrono::seconds>("client_mount_timeout").count());
+ if (r < 0) {
+ break;
+ }
+ {
+ std::unique_lock l(monc_lock);
+ if (monmap.get_epoch() &&
+ !monmap.persistent_features.contains_all(
+ ceph::features::mon::FEATURE_MIMIC)) {
+ ldout(cct,10) << __func__ << " pre-mimic monitor, no config to fetch"
+ << dendl;
+ r = 0;
+ break;
+ }
+ // wait until we have both a config and a real monmap; a single timed-out
+ // wait (mon_client_hunt_interval) ends the loop with -ETIMEDOUT
+ while ((!bootstrap_config || monmap.get_epoch() == 0) && r == 0) {
+ ldout(cct,20) << __func__ << " waiting for monmap|config" << dendl;
+ auto status = map_cond.wait_for(l, ceph::make_timespan(
+ cct->_conf->mon_client_hunt_interval));
+ if (status == std::cv_status::timeout) {
+ r = -ETIMEDOUT;
+ }
+ }
+
+ if (bootstrap_config) {
+ ldout(cct,10) << __func__ << " success" << dendl;
+ config = std::move(bootstrap_config);
+ r = 0;
+ break;
+ }
+ }
+ // this attempt failed: reset client state and try again
+ lderr(cct) << __func__ << " failed to get config" << dendl;
+ shutdown();
+ continue;
+ }
+
+ if (config) {
+ // apply the bootstrap config to ensure its applied prior to completing
+ // the bootstrap
+ cct->_conf.set_mon_vals(cct, config->config, config_cb);
+ }
+
+ // NOTE(review): when all tries are exhausted, shutdown() has already run
+ // at the end of the last iteration and runs a second time here
+ shutdown();
+ return r;
+}
+
+
+/**
+ * Ping the monitor with id @p mon_id and set the resulting reply in
+ * the provided @p result_reply, if this last parameter is not NULL.
+ *
+ * So that we don't rely on the MonClient's default messenger, set up
+ * during connect(), we create our own messenger to communicate with the
+ * specified monitor. This is advantageous in the following ways:
+ *
+ * - Isolate the ping procedure from the rest of the MonClient's operations,
+ * allowing us to not acquire or manage the big monc_lock, thus not
+ * having to block waiting for some other operation to finish before we
+ * can proceed.
+ * * for instance, we can ping mon.FOO even if we are currently hunting
+ * or blocked waiting for auth to complete with mon.BAR.
+ *
+ * - Ping a monitor prior to establishing a connection (using connect())
+ * and properly establish the MonClient's messenger. This frees us
+ * from dealing with the complex foo that happens in connect().
+ *
+ * We also don't rely on MonClient as a dispatcher for this messenger,
+ * unlike what happens with the MonClient's default messenger. This allows
+ * us to sandbox the whole ping, having it much as a separate entity in
+ * the MonClient class, considerably simplifying the handling and dispatching
+ * of messages without needing to consider monc_lock.
+ *
+ * Current drawback is that we will establish a messenger for each ping
+ * we want to issue, instead of keeping a single messenger instance that
+ * would be used for all pings.
+ */
+int MonClient::ping_monitor(const string &mon_id, string *result_reply)
+{
+ ldout(cct, 10) << __func__ << dendl;
+
+ // prefer the "noname-" prefixed id if the monmap knows the monitor under
+ // that name
+ string new_mon_id;
+ if (monmap.contains("noname-"+mon_id)) {
+ new_mon_id = "noname-"+mon_id;
+ } else {
+ new_mon_id = mon_id;
+ }
+
+ if (new_mon_id.empty()) {
+ ldout(cct, 10) << __func__ << " specified mon id is empty!" << dendl;
+ return -EINVAL;
+ } else if (!monmap.contains(new_mon_id)) {
+ ldout(cct, 10) << __func__ << " no such monitor 'mon." << new_mon_id << "'"
+ << dendl;
+ return -ENOENT;
+ }
+
+ // N.B. monc isn't initialized
+
+ auth_registry.refresh_config();
+
+ // local keyring for this ping only; the member keyring is not used
+ KeyRing keyring;
+ keyring.from_ceph_context(cct);
+ RotatingKeyRing rkeyring(cct, cct->get_module_type(), &keyring);
+
+ // pinger and smsgr are owned by raw pointers and deleted at the end of
+ // this function
+ MonClientPinger *pinger = new MonClientPinger(cct,
+ &rkeyring,
+ result_reply);
+
+ Messenger *smsgr = Messenger::create_client_messenger(cct, "temp_ping_client");
+ smsgr->add_dispatcher_head(pinger, Dispatcher::PRIORITY_HIGH);
+ smsgr->set_auth_client(pinger);
+ smsgr->start();
+
+ ConnectionRef con = smsgr->connect_to_mon(monmap.get_addrs(new_mon_id));
+ ldout(cct, 10) << __func__ << " ping mon." << new_mon_id
+ << " " << con->get_peer_addr() << dendl;
+
+ pinger->mc.reset(new MonConnection(cct, con, 0, &auth_registry));
+ pinger->mc->start(monmap.get_epoch(), entity_name);
+ con->send_message(new MPing);
+
+ int ret = pinger->wait_for_reply(cct->_conf->mon_client_ping_timeout);
+ if (ret == 0) {
+ ldout(cct,10) << __func__ << " got ping reply" << dendl;
+ } else {
+ // NOTE(review): presumably wait_for_reply() reports failure as a positive
+ // errno, hence the negation -- confirm against MonClientPinger
+ ret = -ret;
+ }
+
+ // tear down in order: connection, session state, messenger, pinger
+ con->mark_down();
+ pinger->mc.reset();
+ smsgr->shutdown();
+ smsgr->wait();
+ delete smsgr;
+ delete pinger;
+ return ret;
+}
+
+/**
+ * Dispatcher entry point: route monitor-related messages to their handlers.
+ *
+ * @param m incoming message
+ * @return true if the message was handled (or deliberately dropped) here;
+ * false to leave it for another dispatcher
+ */
+bool MonClient::ms_dispatch(Message *m)
+{
+ ldout(cct, 25) << __func__ << " processing " << m << dendl;
+ // we only care about these message types
+ switch (m->get_type()) {
+ case CEPH_MSG_MON_MAP:
+ case CEPH_MSG_AUTH_REPLY:
+ case CEPH_MSG_MON_SUBSCRIBE_ACK:
+ case CEPH_MSG_MON_GET_VERSION_REPLY:
+ case MSG_MON_COMMAND_ACK:
+ case MSG_COMMAND_REPLY:
+ case MSG_LOGACK:
+ case MSG_CONFIG:
+ break;
+ case CEPH_MSG_PING:
+ // pings are swallowed without taking monc_lock
+ m->put();
+ return true;
+ default:
+ return false;
+ }
+
+ std::lock_guard lock(monc_lock);
+
+ // for non-anonymous monitor connections, only accept messages from a
+ // connection we are hunting on, or from the active one
+ if (!m->get_connection()->is_anon() &&
+ m->get_source().type() == CEPH_ENTITY_TYPE_MON) {
+ if (_hunting()) {
+ auto p = _find_pending_con(m->get_connection());
+ if (p == pending_cons.end()) {
+ // ignore any messages outside hunting sessions
+ ldout(cct, 10) << "discarding stray monitor message " << *m << dendl;
+ m->put();
+ return true;
+ }
+ } else if (!active_con || active_con->get_con() != m->get_connection()) {
+ // ignore any messages outside our session(s)
+ ldout(cct, 10) << "discarding stray monitor message " << *m << dendl;
+ m->put();
+ return true;
+ }
+ }
+
+ switch (m->get_type()) {
+ case CEPH_MSG_MON_MAP:
+ handle_monmap(static_cast<MMonMap*>(m));
+ // handle_monmap() does not drop the reference; either pass the message
+ // on to the next dispatcher or drop it here
+ if (passthrough_monmap) {
+ return false;
+ } else {
+ m->put();
+ }
+ break;
+ case CEPH_MSG_AUTH_REPLY:
+ handle_auth(static_cast<MAuthReply*>(m));
+ break;
+ case CEPH_MSG_MON_SUBSCRIBE_ACK:
+ handle_subscribe_ack(static_cast<MMonSubscribeAck*>(m));
+ break;
+ case CEPH_MSG_MON_GET_VERSION_REPLY:
+ handle_get_version_reply(static_cast<MMonGetVersionReply*>(m));
+ break;
+ case MSG_MON_COMMAND_ACK:
+ handle_mon_command_ack(static_cast<MMonCommandAck*>(m));
+ break;
+ case MSG_COMMAND_REPLY:
+ if (m->get_connection()->is_anon() &&
+ m->get_source().type() == CEPH_ENTITY_TYPE_MON) {
+ // this connection is from 'tell'... ignore everything except our command
+ // reply. (we'll get misc other message because we authenticated, but we
+ // don't need them.)
+ handle_command_reply(static_cast<MCommandReply*>(m));
+ return true;
+ }
+ // leave the message for another dispatch handler (e.g., Objecter)
+ return false;
+ case MSG_LOGACK:
+ if (log_client) {
+ log_client->handle_log_ack(static_cast<MLogAck*>(m));
+ m->put();
+ // the ack may have made room for entries that were held back
+ if (more_log_pending) {
+ send_log();
+ }
+ } else {
+ m->put();
+ }
+ break;
+ case MSG_CONFIG:
+ handle_config(static_cast<MConfig*>(m));
+ break;
+ }
+ return true;
+}
+
+/**
+ * Send the next pending log message (if any) to the monitor and remember
+ * whether more entries remain queued.  No-op without a log client.
+ *
+ * @param flush forwarded to LogClient::get_mon_log_message()
+ */
+void MonClient::send_log(bool flush)
+{
+  if (!log_client) {
+    return;
+  }
+
+  auto log_msg = log_client->get_mon_log_message(flush);
+  if (log_msg) {
+    _send_mon_message(std::move(log_msg));
+  }
+  more_log_pending = log_client->are_pending();
+}
+
+/**
+ * Locked wrapper around send_log(): takes monc_lock and sends pending log
+ * entries using send_log()'s default flush argument.
+ */
+void MonClient::flush_log()
+{
+ std::lock_guard l(monc_lock);
+ send_log();
+}
+
+/**
+ * Handle an MMonMap: decode it into our monmap, re-hunt if the monitor we
+ * are talking to can no longer be identified (or should now be reached via
+ * msgr2), and wake up anybody waiting for a monmap.
+ *
+ * Unlike all the other message-handling functions, we don't put away a
+ * reference because we want to support MMonMap passthrough to other
+ * Dispatchers.
+ *
+ * @param m the monmap message; ownership stays with the caller
+ */
+void MonClient::handle_monmap(MMonMap *m)
+{
+  ldout(cct, 10) << __func__ << " " << *m << dendl;
+  auto con_addrs = m->get_source_addrs();
+  // resolve the sender against the map we had *before* decoding the new one
+  string old_name = monmap.get_name(con_addrs);
+  const auto old_epoch = monmap.get_epoch();
+
+  auto p = m->monmapbl.cbegin();
+  decode(monmap, p);
+
+  // old_name came from the previous map, so report that map's epoch here
+  // (monmap.get_epoch() would just repeat the new epoch)
+  ldout(cct, 10) << " got monmap " << monmap.epoch
+                 << " from mon." << old_name
+                 << " (according to old e" << old_epoch << ")"
+                 << dendl;
+  ldout(cct, 10) << "dump:\n";
+  monmap.print(*_dout);
+  *_dout << dendl;
+
+  if (old_epoch != monmap.get_epoch()) {
+    // the set of monitors may have changed; forget which ranks we tried
+    tried.clear();
+  }
+  if (old_name.size() == 0) {
+    ldout(cct,10) << " can't identify which mon we were connected to" << dendl;
+    _reopen_session();
+  } else {
+    auto new_name = monmap.get_name(con_addrs);
+    if (new_name.empty()) {
+      ldout(cct, 10) << "mon." << old_name << " at " << con_addrs
+                     << " went away" << dendl;
+      // can't find the mon we were talking to (above)
+      _reopen_session();
+    } else if (messenger->should_use_msgr2() &&
+               monmap.get_addrs(new_name).has_msgr2() &&
+               !con_addrs.has_msgr2()) {
+      ldout(cct,1) << " mon." << new_name << " has (v2) addrs "
+                   << monmap.get_addrs(new_name) << " but i'm connected to "
+                   << con_addrs << ", reconnecting" << dendl;
+      _reopen_session();
+    }
+  }
+
+  cct->set_mon_addrs(monmap);
+
+  sub.got("monmap", monmap.get_epoch());
+  map_cond.notify_all();
+  want_monmap = false;
+
+  // authenticate_err == 1 means "in progress" (see _reopen_session())
+  if (authenticate_err == 1) {
+    _finish_auth(0);
+  }
+}
+
+/**
+ * Handle an MConfig message from the monitor.
+ *
+ * During bootstrap (want_bootstrap_config set) the message is stashed in
+ * bootstrap_config for get_monmap_and_config() to apply. Otherwise the
+ * config is applied from a task posted to finish_strand.
+ *
+ * @param m the config message; this function takes over the caller's
+ * reference
+ */
+void MonClient::handle_config(MConfig *m)
+{
+ ldout(cct,10) << __func__ << " " << *m << dendl;
+
+ if (want_bootstrap_config) {
+ // get_monmap_and_config is waiting for config which it will apply
+ // synchronously
+ // (add_ref=false: adopt the existing reference instead of taking another)
+ bootstrap_config = ceph::ref_t<MConfig>(m, false);
+ map_cond.notify_all();
+ return;
+ }
+
+ // Take the sledgehammer approach to ensuring we don't depend on
+ // anything in MonClient.
+ // Everything the task needs is captured by value; the message reference is
+ // dropped by the task once the values have been applied.
+ boost::asio::post(finish_strand,
+ [m, cct = boost::intrusive_ptr<CephContext>(cct),
+ config_notify_cb = config_notify_cb,
+ config_cb = config_cb]() {
+ cct->_conf.set_mon_vals(cct.get(), m->config, config_cb);
+ if (config_notify_cb) {
+ config_notify_cb();
+ }
+ m->put();
+ });
+}
+
+// ----------------------
+
+/**
+ * Prepare the client for use: load the keyring when cephx is supported,
+ * set up rotating secrets, hook into the messenger, start the tick timer
+ * and register the "rotate-key" admin socket command.
+ *
+ * Requires `messenger` to be set already (it is dereferenced).
+ *
+ * @return 0 on success; the non-zero result of KeyRing::from_ceph_context()
+ * if the keyring cannot be loaded; -ENOENT if no auth method is
+ * supported for our entity type
+ */
+int MonClient::init()
+{
+ ldout(cct, 10) << __func__ << dendl;
+
+ entity_name = cct->_conf->name;
+
+ auth_registry.refresh_config();
+
+ std::lock_guard l(monc_lock);
+ keyring.reset(new KeyRing);
+ if (auth_registry.is_supported_method(messenger->get_mytype(),
+ CEPH_AUTH_CEPHX)) {
+ // this should succeed, because auth_registry just checked!
+ int r = keyring->from_ceph_context(cct);
+ if (r != 0) {
+ // but be somewhat graceful in case there was a race condition
+ lderr(cct) << "keyring not found" << dendl;
+ return r;
+ }
+ }
+ if (!auth_registry.any_supported_methods(messenger->get_mytype())) {
+ return -ENOENT;
+ }
+
+ rotating_secrets.reset(
+ new RotatingKeyRing(cct, cct->get_module_type(), keyring.get()));
+
+ initialized = true;
+
+ messenger->set_auth_client(this);
+ messenger->add_dispatcher_head(this, Dispatcher::PRIORITY_HIGH);
+
+ timer.init();
+ schedule_tick();
+
+ // handled by MonClient::call()
+ cct->get_admin_socket()->register_command(
+ "rotate-key",
+ this,
+ "rotate live authentication key");
+
+ return 0;
+}
+
+/**
+ * Tear down client state: unregister admin socket commands, fail pending
+ * version requests with monc_errc::shutting_down, cancel outstanding mon
+ * commands, drop queued messages and connections, reset auth state and
+ * stop the timer.
+ */
+void MonClient::shutdown()
+{
+ ldout(cct, 10) << __func__ << dendl;
+
+ cct->get_admin_socket()->unregister_commands(this);
+
+ monc_lock.lock();
+ stopping = true;
+ // complete every pending version request with a "shutting down" error
+ while (!version_requests.empty()) {
+ ceph::async::post(std::move(version_requests.begin()->second),
+ monc_errc::shutting_down, 0, 0);
+ ldout(cct, 20) << __func__ << " canceling and discarding version request "
+ << version_requests.begin()->first << dendl;
+ version_requests.erase(version_requests.begin());
+ }
+ // the loop relies on _cancel_mon_command() removing the entry
+ while (!mon_commands.empty()) {
+ auto tid = mon_commands.begin()->first;
+ _cancel_mon_command(tid);
+ }
+ ldout(cct, 20) << __func__ << " discarding " << waiting_for_session.size()
+ << " pending message(s)" << dendl;
+ waiting_for_session.clear();
+
+ active_con.reset();
+ pending_cons.clear();
+
+ auth.reset();
+ global_id = 0;
+ authenticate_err = 0;
+ authenticated = false;
+
+ monc_lock.unlock();
+
+ // NOTE(review): the lock is dropped only around this flag reset, which
+ // does no other work
+ if (initialized) {
+ initialized = false;
+ }
+ monc_lock.lock();
+ timer.shutdown();
+ stopping = false;
+ monc_lock.unlock();
+}
+
+/**
+ * Block until a monitor session is established or authentication fails.
+ *
+ * Subscribes to the monmap and the config, opens a session if needed, and
+ * waits on auth_cond.
+ *
+ * @param timeout how long to wait, converted with ceph::make_timespan();
+ * a value <= 0 waits without a deadline
+ * @return 0 if a session already existed; otherwise authenticate_err,
+ * which is set to -ETIMEDOUT here when the deadline passes
+ */
+int MonClient::authenticate(double timeout)
+{
+ std::unique_lock lock{monc_lock};
+
+ if (active_con) {
+ ldout(cct, 5) << "already authenticated" << dendl;
+ return 0;
+ }
+ // ask for the map following the one we have, or any map if we have none
+ sub.want("monmap", monmap.get_epoch() ? monmap.get_epoch() + 1 : 0, 0);
+ sub.want("config", 0, 0);
+ if (!_opened())
+ _reopen_session();
+
+ auto until = ceph::mono_clock::now();
+ until += ceph::make_timespan(timeout);
+ if (timeout > 0.0)
+ ldout(cct, 10) << "authenticate will time out at " << until << dendl;
+ // authenticate_err >= 0 covers both "ok" (0) and "in progress" (1)
+ while (!active_con && authenticate_err >= 0) {
+ if (timeout > 0.0) {
+ auto r = auth_cond.wait_until(lock, until);
+ if (r == std::cv_status::timeout && !active_con) {
+ ldout(cct, 0) << "authenticate timed out after " << timeout << dendl;
+ authenticate_err = -ETIMEDOUT;
+ }
+ } else {
+ auth_cond.wait(lock);
+ }
+ }
+
+ if (active_con) {
+ ldout(cct, 5) << __func__ << " success, global_id "
+ << active_con->get_global_id() << dendl;
+ // active_con should not have been set if there was an error
+ ceph_assert(authenticate_err >= 0);
+ authenticated = true;
+ }
+
+ if (authenticate_err < 0 && auth_registry.no_keyring_disabled_cephx()) {
+ lderr(cct) << __func__ << " NOTE: no keyring found; disabled cephx authentication" << dendl;
+ }
+
+ return authenticate_err;
+}
+
+/**
+ * Admin socket hook.  Only "rotate-key" does anything: the input buffer is
+ * decoded as a base64 key and installed in the keyring under our entity
+ * name.
+ *
+ * @param command admin socket command name
+ * @param cmdmap  parsed command arguments (unused)
+ * @param inbl    input payload; the base64-encoded key for "rotate-key"
+ * @param f       output formatter (unused)
+ * @param errss   receives a human-readable error description
+ * @param out     output payload (unused)
+ * @return 0 on success or for any other command; -EINVAL if the key cannot
+ *         be decoded or there is no keyring
+ */
+int MonClient::call(
+  std::string_view command,
+  const cmdmap_t& cmdmap,
+  const ceph::buffer::list &inbl,
+  ceph::Formatter *f,
+  std::ostream& errss,
+  ceph::buffer::list& out)
+{
+  if (command != "rotate-key") {
+    return 0;
+  }
+
+  CryptoKey new_key;
+  try {
+    new_key.decode_base64(inbl.to_str());
+  } catch (buffer::error& e) {
+    errss << "error decoding key: " << e.what();
+    return -EINVAL;
+  }
+
+  if (!keyring) {
+    errss << "cephx not enabled; no key to rotate";
+    return -EINVAL;
+  }
+
+  ldout(cct, 1) << "rotate live key for " << entity_name << dendl;
+  keyring->add(entity_name, new_key);
+  return 0;
+}
+
+/**
+ * Handle an MAuthReply. Must be called with monc_lock held (asserted).
+ *
+ * Three cases: a reply on an anonymous ("tell") connection, a reply on the
+ * active session, and a reply on one of the connections being hunted.
+ * The message reference is dropped in every path.
+ */
+void MonClient::handle_auth(MAuthReply *m)
+{
+ ceph_assert(ceph_mutex_is_locked(monc_lock));
+
+ if (m->get_connection()->is_anon()) {
+ // anon connection, used for mon tell commands
+ for (auto& p : mon_commands) {
+ if (p.second->target_con == m->get_connection()) {
+ auto& mc = p.second->target_session;
+ int ret = mc->handle_auth(m, entity_name,
+ CEPH_ENTITY_TYPE_MON,
+ rotating_secrets.get());
+ (void)ret; // we don't care
+ break;
+ }
+ }
+ m->put();
+ return;
+ }
+
+ if (!_hunting()) {
+ // lend our auth handler to the active connection for the duration of
+ // authenticate(), then take it back
+ std::swap(active_con->get_auth(), auth);
+ int ret = active_con->authenticate(m);
+ m->put();
+ std::swap(auth, active_con->get_auth());
+ if (global_id != active_con->get_global_id()) {
+ lderr(cct) << __func__ << " peer assigned me a different global_id: "
+ << active_con->get_global_id() << dendl;
+ }
+ // -EAGAIN: the exchange is not finished yet, wait for the next reply
+ if (ret != -EAGAIN) {
+ _finish_auth(ret);
+ }
+ return;
+ }
+
+ // hunting
+ auto found = _find_pending_con(m->get_connection());
+ ceph_assert(found != pending_cons.end());
+ int auth_err = found->second.handle_auth(m, entity_name, want_keys,
+ rotating_secrets.get());
+ m->put();
+ if (auth_err == -EAGAIN) {
+ return;
+ }
+ if (auth_err) {
+ pending_cons.erase(found);
+ if (!pending_cons.empty()) {
+ // keep trying with pending connections
+ return;
+ }
+ // the last try just failed, give up.
+ } else {
+ // first connection to authenticate wins; drop all the others
+ auto& mc = found->second;
+ ceph_assert(mc.have_session());
+ active_con.reset(new MonConnection(std::move(mc)));
+ pending_cons.clear();
+ }
+
+ _finish_hunting(auth_err);
+ _finish_auth(auth_err);
+}
+
+/**
+ * Record the outcome of an authentication attempt and wake up waiters on
+ * auth_cond.
+ *
+ * @param auth_err 0 on success, otherwise the error; stored in
+ * authenticate_err
+ */
+void MonClient::_finish_auth(int auth_err)
+{
+ ldout(cct,10) << __func__ << " " << auth_err << dendl;
+ authenticate_err = auth_err;
+ // _resend_mon_commands() could _reopen_session() if the connected mon is not
+ // the one the MonCommand is targeting.
+ // NOTE(review): _resend_mon_commands() is not called in this function (it
+ // is called from _finish_hunting()); the comment above may be stale.
+ if (!auth_err && active_con) {
+ ceph_assert(auth);
+ _check_auth_tickets();
+ } else if (auth_err == -EAGAIN && !active_con) {
+ ldout(cct,10) << __func__
+ << " auth returned EAGAIN, reopening the session to try again"
+ << dendl;
+ _reopen_session();
+ }
+ auth_cond.notify_all();
+}
+
+// ---------
+
+/**
+ * Locked wrapper around _send_mon_message(): takes monc_lock, then sends
+ * (or queues) the message.
+ */
+void MonClient::send_mon_message(MessageRef m)
+{
+ std::lock_guard l{monc_lock};
+ _send_mon_message(std::move(m));
+}
+
+/**
+ * Send a message over the active monitor session, or park it in
+ * waiting_for_session when there is none.  Caller must hold monc_lock
+ * (asserted).
+ */
+void MonClient::_send_mon_message(MessageRef m)
+{
+  ceph_assert(ceph_mutex_is_locked(monc_lock));
+
+  if (!active_con) {
+    // no session yet: queue until one is established
+    waiting_for_session.push_back(std::move(m));
+    return;
+  }
+
+  auto con = active_con->get_con();
+  ldout(cct, 10) << "_send_mon_message to mon."
+                 << monmap.get_name(con->get_peer_addr())
+                 << " at " << con->get_peer_addr() << dendl;
+  con->send_message2(std::move(m));
+}
+
+/**
+ * Drop all current connections and start hunting for a monitor again.
+ * Caller must hold monc_lock (asserted).
+ *
+ * Queued messages are discarded and pending version requests are failed
+ * with monc_errc::session_reset.
+ *
+ * @param rank monitor rank to connect to; -1 falls back to the
+ * "mon_client_target_rank" option, and a still-negative value
+ * lets _add_conns() choose
+ */
+void MonClient::_reopen_session(int rank)
+{
+ ceph_assert(ceph_mutex_is_locked(monc_lock));
+ ldout(cct, 10) << __func__ << " rank " << rank << dendl;
+
+ active_con.reset();
+ pending_cons.clear();
+
+ authenticate_err = 1; // == in progress
+
+ _start_hunting();
+
+ if (rank == -1) {
+ rank = cct->_conf.get_val<int64_t>("mon_client_target_rank");
+ }
+
+ if (rank >= 0) {
+ _add_conn(rank);
+ } else {
+ _add_conns();
+ }
+
+ // throw out old queued messages
+ waiting_for_session.clear();
+
+ // throw out version check requests
+ while (!version_requests.empty()) {
+ ceph::async::post(std::move(version_requests.begin()->second),
+ monc_errc::session_reset, 0, 0);
+ version_requests.erase(version_requests.begin());
+ }
+
+ // kick off every connection that was just added
+ for (auto& c : pending_cons) {
+ c.second.start(monmap.get_epoch(), entity_name);
+ }
+
+ if (sub.reload()) {
+ _renew_subs();
+ }
+}
+
+/**
+ * Open a connection to the monitor with the given rank and add it to
+ * pending_cons, keyed by the monitor's addresses.
+ *
+ * If we already have an auth handler, the new connection gets a clone of
+ * it.
+ *
+ * @param rank rank of the monitor in the current monmap
+ */
+void MonClient::_add_conn(unsigned rank)
+{
+ auto peer = monmap.get_addrs(rank);
+ auto conn = messenger->connect_to_mon(peer);
+ MonConnection mc(cct, conn, global_id, &auth_registry);
+ if (auth) {
+ mc.get_auth().reset(auth->clone());
+ }
+ pending_cons.insert(std::make_pair(peer, std::move(mc)));
+ ldout(cct, 10) << "picked mon." << monmap.get_name(rank)
+ << " con " << conn
+ << " addr " << peer
+ << dendl;
+}
+
+/**
+ * Pick the next batch of monitors to hunt and open connections to them.
+ *
+ * Candidates are the not-yet-tried monitors that share the smallest
+ * priority value. They are shuffled (weighted by monitor weight unless all
+ * weights are zero) and the first mon_client_hunt_parallel of them are
+ * connected to and marked as tried.
+ */
+void MonClient::_add_conns()
+{
+ // collect the next batch of candidates who are listed right next to the ones
+ // already tried
+ auto get_next_batch = [this]() -> std::vector<unsigned> {
+ // (priority, rank) for every monitor whose rank is not in `tried`
+ std::multimap<uint16_t, unsigned> ranks_by_priority;
+ boost::copy(
+ monmap.mon_info | boost::adaptors::filtered(
+ [this](auto& info) {
+ auto rank = monmap.get_rank(info.first);
+ return tried.count(rank) == 0;
+ }) | boost::adaptors::transformed(
+ [this](auto& info) {
+ auto rank = monmap.get_rank(info.first);
+ return std::make_pair(info.second.priority, rank);
+ }), std::inserter(ranks_by_priority, end(ranks_by_priority)));
+ if (ranks_by_priority.empty()) {
+ return {};
+ }
+ // only choose the monitors with lowest priority
+ // (the multimap is ordered by key, so begin() holds the smallest value)
+ auto cands = boost::make_iterator_range(
+ ranks_by_priority.equal_range(ranks_by_priority.begin()->first));
+ std::vector<unsigned> ranks;
+ boost::range::copy(cands | boost::adaptors::map_values,
+ std::back_inserter(ranks));
+ return ranks;
+ };
+ auto ranks = get_next_batch();
+ if (ranks.empty()) {
+ tried.clear(); // start over
+ ranks = get_next_batch();
+ }
+ ceph_assert(!ranks.empty());
+ if (ranks.size() > 1) {
+ std::vector<uint16_t> weights;
+ for (auto i : ranks) {
+ auto rank_name = monmap.get_name(i);
+ weights.push_back(monmap.get_weight(rank_name));
+ }
+ random_device_t rd;
+ // with no weights configured, fall back to a uniform shuffle
+ if (std::accumulate(begin(weights), end(weights), 0u) == 0) {
+ std::shuffle(begin(ranks), end(ranks), std::mt19937{rd()});
+ } else {
+ weighted_shuffle(begin(ranks), end(ranks), begin(weights), end(weights),
+ std::mt19937{rd()});
+ }
+ }
+ ldout(cct, 10) << __func__ << " ranks=" << ranks << dendl;
+ // 0 (or a value larger than the batch) means "try the whole batch"
+ unsigned n = cct->_conf->mon_client_hunt_parallel;
+ if (n == 0 || n > ranks.size()) {
+ n = ranks.size();
+ }
+ for (unsigned i = 0; i < n; i++) {
+ _add_conn(ranks[i]);
+ tried.insert(ranks[i]);
+ }
+}
+
+/**
+ * Messenger callback for a reset connection.
+ *
+ * Non-monitor peers are ignored (returns false). For an anonymous ("tell")
+ * connection the matching command is re-sent. While hunting, the reset is
+ * only logged. A reset of the active connection reopens the session.
+ *
+ * @param con the connection that was reset
+ * @return false for non-monitor peers and for a reset of the active
+ * connection; true otherwise
+ */
+bool MonClient::ms_handle_reset(Connection *con)
+{
+ std::lock_guard lock(monc_lock);
+
+ if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON)
+ return false;
+
+ if (con->is_anon()) {
+ auto p = mon_commands.begin();
+ while (p != mon_commands.end()) {
+ auto cmd = p->second;
+ // advance before _send_command(), which may finish the command
+ ++p;
+ if (cmd->target_con == con) {
+ _send_command(cmd); // may retry or fail
+ break;
+ }
+ }
+ return true;
+ }
+
+ if (_hunting()) {
+ if (pending_cons.count(con->get_peer_addrs())) {
+ ldout(cct, 10) << __func__ << " hunted mon " << con->get_peer_addrs()
+ << dendl;
+ } else {
+ ldout(cct, 10) << __func__ << " stray mon " << con->get_peer_addrs()
+ << dendl;
+ }
+ return true;
+ } else {
+ if (active_con && con == active_con->get_con()) {
+ ldout(cct, 10) << __func__ << " current mon " << con->get_peer_addrs()
+ << dendl;
+ _reopen_session();
+ return false;
+ } else {
+ ldout(cct, 10) << "ms_handle_reset stray mon " << con->get_peer_addrs()
+ << dendl;
+ return true;
+ }
+ }
+}
+
+/**
+ * @return true if there is an active monitor connection or a hunt is in
+ * progress. Caller must hold monc_lock (asserted).
+ */
+bool MonClient::_opened() const
+{
+ ceph_assert(ceph_mutex_is_locked(monc_lock));
+ return active_con || _hunting();
+}
+
+/**
+ * @return true while there are pending (not yet authenticated) monitor
+ * connections, i.e. pending_cons is non-empty
+ */
+bool MonClient::_hunting() const
+{
+ return !pending_cons.empty();
+}
+
+/**
+ * Called when a new hunt begins; must not already be hunting (asserted).
+ *
+ * Once we have had a connection, each new hunt multiplies the reopen
+ * interval multiplier by mon_client_hunt_interval_backoff, capped at
+ * mon_client_hunt_interval_max_multiple.
+ */
+void MonClient::_start_hunting()
+{
+ ceph_assert(!_hunting());
+ // adjust timeouts if necessary
+ if (!had_a_connection)
+ return;
+ reopen_interval_multiplier *= cct->_conf->mon_client_hunt_interval_backoff;
+ if (reopen_interval_multiplier >
+ cct->_conf->mon_client_hunt_interval_max_multiple) {
+ reopen_interval_multiplier =
+ cct->_conf->mon_client_hunt_interval_max_multiple;
+ }
+}
+
+/**
+ * Wrap up a hunt. Caller must hold monc_lock and pending_cons must
+ * already be empty (both asserted).
+ *
+ * On success the queued messages, mon commands and log entries are
+ * (re)sent, and the auth handler and global_id are taken over from the new
+ * active connection.
+ *
+ * @param auth_err 0 if the hunt produced an authenticated session
+ */
+void MonClient::_finish_hunting(int auth_err)
+{
+ ldout(cct,10) << __func__ << " " << auth_err << dendl;
+ ceph_assert(ceph_mutex_is_locked(monc_lock));
+ // the pending conns have been cleaned.
+ ceph_assert(!_hunting());
+ if (active_con) {
+ auto con = active_con->get_con();
+ ldout(cct, 1) << "found mon."
+ << monmap.get_name(con->get_peer_addr())
+ << dendl;
+ } else {
+ ldout(cct, 1) << "no mon sessions established" << dendl;
+ }
+
+ // set regardless of the outcome; from now on _start_hunting() backs off
+ had_a_connection = true;
+ _un_backoff();
+
+ if (!auth_err) {
+ last_rotating_renew_sent = utime_t();
+ // flush everything that was queued while we had no session
+ while (!waiting_for_session.empty()) {
+ _send_mon_message(std::move(waiting_for_session.front()));
+ waiting_for_session.pop_front();
+ }
+ _resend_mon_commands();
+ send_log(true);
+ if (active_con) {
+ auth = std::move(active_con->get_auth());
+ if (global_id && global_id != active_con->get_global_id()) {
+ lderr(cct) << __func__ << " global_id changed from " << global_id
+ << " to " << active_con->get_global_id() << dendl;
+ }
+ global_id = active_con->get_global_id();
+ }
+ }
+}
+
+/**
+ * Periodic maintenance, run from the timer (see schedule_tick()).
+ *
+ * Checks auth tickets and tell commands; while hunting, restarts the hunt;
+ * with an active session, renews subscriptions, sends keepalives
+ * (reconnecting if acks stop arriving) and flushes the log. The next tick
+ * is always rescheduled on exit.
+ *
+ * NOTE(review): callees assert that monc_lock is held; presumably the timer
+ * (constructed with monc_lock) runs this with the lock held -- confirm.
+ */
+void MonClient::tick()
+{
+ ldout(cct, 10) << __func__ << dendl;
+
+ utime_t now = ceph_clock_now();
+
+ // runs on every exit path, including the early returns below
+ auto reschedule_tick = make_scope_guard([this] {
+ schedule_tick();
+ });
+
+ _check_auth_tickets();
+ _check_tell_commands();
+
+ if (_hunting()) {
+ ldout(cct, 1) << "continuing hunt" << dendl;
+ return _reopen_session();
+ } else if (active_con) {
+ // just renew as needed
+ auto cur_con = active_con->get_con();
+ if (!cur_con->has_feature(CEPH_FEATURE_MON_STATEFUL_SUB)) {
+ const bool maybe_renew = sub.need_renew();
+ ldout(cct, 10) << "renew subs? -- " << (maybe_renew ? "yes" : "no")
+ << dendl;
+ if (maybe_renew) {
+ _renew_subs();
+ }
+ }
+
+ if (now > last_keepalive + cct->_conf->mon_client_ping_interval) {
+ cur_con->send_keepalive();
+ last_keepalive = now;
+
+ // if the peer acks keepalives, treat a missing ack for longer than
+ // mon_client_ping_timeout as a dead session
+ if (cct->_conf->mon_client_ping_timeout > 0 &&
+ cur_con->has_feature(CEPH_FEATURE_MSGR_KEEPALIVE2)) {
+ utime_t lk = cur_con->get_last_keepalive_ack();
+ utime_t interval = now - lk;
+ if (interval > cct->_conf->mon_client_ping_timeout) {
+ ldout(cct, 1) << "no keepalive since " << lk << " (" << interval
+ << " seconds), reconnecting" << dendl;
+ return _reopen_session();
+ }
+ }
+
+ _un_backoff();
+ }
+
+ if (now > last_send_log + cct->_conf->mon_client_log_interval) {
+ send_log();
+ last_send_log = now;
+ }
+ }
+}
+
+/**
+ * Relax the reconnect back-off by one step: divide the reopen interval
+ * multiplier by mon_client_hunt_interval_backoff, but never go below
+ * mon_client_hunt_interval_min_multiple.
+ */
+void MonClient::_un_backoff()
+{
+ // un-backoff our reconnect interval
+ reopen_interval_multiplier = std::max(
+ cct->_conf.get_val<double>("mon_client_hunt_interval_min_multiple"),
+ reopen_interval_multiplier /
+ cct->_conf.get_val<double>("mon_client_hunt_interval_backoff"));
+ ldout(cct, 20) << __func__ << " reopen_interval_multipler now "
+ << reopen_interval_multiplier << dendl;
+}
+
+/**
+ * Arm the timer for the next tick().
+ *
+ * While not connected the delay is mon_client_hunt_interval scaled by the
+ * current back-off multiplier; once connected it is the smaller of the
+ * ping and log intervals.
+ */
+void MonClient::schedule_tick()
+{
+ auto do_tick = make_lambda_context([this](int) { tick(); });
+ if (!is_connected()) {
+ // start another round of hunting
+ const auto hunt_interval = (cct->_conf->mon_client_hunt_interval *
+ reopen_interval_multiplier);
+ timer.add_event_after(hunt_interval, do_tick);
+ } else {
+ // keep in touch
+ timer.add_event_after(std::min(cct->_conf->mon_client_ping_interval,
+ cct->_conf->mon_client_log_interval),
+ do_tick);
+ }
+}
+
+// ---------
+
+/**
+ * Send our new subscriptions to the monitor.  Caller must hold monc_lock
+ * (asserted).
+ *
+ * Does nothing when there is nothing new to subscribe to.  Without an open
+ * (or in-progress) session it only reopens the session; otherwise an
+ * MMonSubscribe carrying the subscriptions and our short hostname is sent
+ * and the subscriptions are marked as renewed.
+ */
+void MonClient::_renew_subs()
+{
+  ceph_assert(ceph_mutex_is_locked(monc_lock));
+
+  if (!sub.have_new()) {
+    ldout(cct, 10) << __func__ << " - empty" << dendl;
+    return;
+  }
+  ldout(cct, 10) << __func__ << dendl;
+
+  if (!_opened()) {
+    _reopen_session();
+    return;
+  }
+
+  auto subscribe = ceph::make_message<MMonSubscribe>();
+  subscribe->what = sub.get_subs();
+  subscribe->hostname = ceph_get_short_hostname();
+  _send_mon_message(std::move(subscribe));
+  sub.renewed();
+}
+
+/**
+ * Handle an MMonSubscribeAck: pass its interval to the subscription
+ * tracker and drop the message reference.
+ */
+void MonClient::handle_subscribe_ack(MMonSubscribeAck *m)
+{
+ sub.acked(m->interval);
+ m->put();
+}
+
+/**
+ * Request new auth tickets if the auth handler needs them, then check the
+ * rotating keys.  Caller must hold monc_lock (asserted).
+ *
+ * Does nothing without both an active connection and an auth handler.
+ *
+ * @return always 0
+ */
+int MonClient::_check_auth_tickets()
+{
+  ldout(cct, 10) << __func__ << dendl;
+  ceph_assert(ceph_mutex_is_locked(monc_lock));
+  if (active_con && auth) {
+    if (auth->need_tickets()) {
+      ldout(cct, 10) << __func__ << " getting new tickets!" << dendl;
+      auto m = ceph::make_message<MAuth>();
+      m->protocol = auth->get_protocol();
+      auth->prepare_build_request();
+      auth->build_request(m->auth_payload);
+      // hand our reference over instead of copying it, matching
+      // _check_auth_rotating()
+      _send_mon_message(std::move(m));
+    }
+
+    _check_auth_rotating();
+  }
+  return 0;
+}
+
+/**
+ * Renew the rotating service keys when they are about to expire.
+ * Caller must hold monc_lock (asserted).
+ *
+ * Skipped when this entity does not need rotating keys, when there is no
+ * authenticated session yet, when the current keys are still fresh, or when
+ * a renewal was sent less than a second ago.
+ *
+ * @return always 0
+ */
+int MonClient::_check_auth_rotating()
+{
+ ceph_assert(ceph_mutex_is_locked(monc_lock));
+ if (!rotating_secrets ||
+ !auth_principal_needs_rotating_keys(entity_name)) {
+ ldout(cct, 20) << "_check_auth_rotating not needed by " << entity_name << dendl;
+ return 0;
+ }
+
+ if (!active_con || !auth) {
+ ldout(cct, 10) << "_check_auth_rotating waiting for auth session" << dendl;
+ return 0;
+ }
+
+ utime_t now = ceph_clock_now();
+ // renew ahead of expiry: 30 seconds or a quarter of the ticket TTL,
+ // whichever is smaller
+ utime_t cutoff = now;
+ cutoff -= std::min(30.0, cct->_conf->auth_service_ticket_ttl / 4.0);
+ utime_t issued_at_lower_bound = now;
+ issued_at_lower_bound -= cct->_conf->auth_service_ticket_ttl;
+ if (!rotating_secrets->need_new_secrets(cutoff)) {
+ ldout(cct, 10) << "_check_auth_rotating have uptodate secrets (they expire after " << cutoff << ")" << dendl;
+ rotating_secrets->dump_rotating();
+ return 0;
+ }
+
+ ldout(cct, 10) << "_check_auth_rotating renewing rotating keys (they expired before " << cutoff << ")" << dendl;
+ if (!rotating_secrets->need_new_secrets() &&
+ rotating_secrets->need_new_secrets(issued_at_lower_bound)) {
+ // the key has expired before it has been issued?
+ lderr(cct) << __func__ << " possible clock skew, rotating keys expired way too early"
+ << " (before " << issued_at_lower_bound << ")" << dendl;
+ }
+ // rate-limit renewal requests to at most one per second
+ if ((now > last_rotating_renew_sent) &&
+ double(now - last_rotating_renew_sent) < 1) {
+ ldout(cct, 10) << __func__ << " called too often (last: "
+ << last_rotating_renew_sent << "), skipping refresh" << dendl;
+ return 0;
+ }
+ auto m = ceph::make_message<MAuth>();
+ m->protocol = auth->get_protocol();
+ if (auth->build_rotating_request(m->auth_payload)) {
+ last_rotating_renew_sent = now;
+ _send_mon_message(std::move(m));
+ }
+ return 0;
+}
+
+/**
+ * Block until fresh rotating keys are available.
+ *
+ * Requires an auth handler (asserted). Returns immediately when auth is
+ * CEPH_AUTH_NONE or there is no rotating keyring.
+ *
+ * @param timeout maximum time to wait, converted with ceph::make_timespan()
+ * @return 0 once the keys are fresh (or are not needed), -ETIMEDOUT if the
+ * wait timed out
+ */
+int MonClient::wait_auth_rotating(double timeout)
+{
+ std::unique_lock l(monc_lock);
+
+ // Must be initialized
+ ceph_assert(auth != nullptr);
+
+ if (auth->get_protocol() == CEPH_AUTH_NONE)
+ return 0;
+
+ if (!rotating_secrets)
+ return 0;
+
+ ldout(cct, 10) << __func__ << " waiting for " << timeout << dendl;
+ // same freshness cutoff as _check_auth_rotating(), computed once up front
+ utime_t cutoff = ceph_clock_now();
+ cutoff -= std::min(30.0, cct->_conf->auth_service_ticket_ttl / 4.0);
+ if (auth_cond.wait_for(l, ceph::make_timespan(timeout), [this, cutoff] {
+ return (!auth_principal_needs_rotating_keys(entity_name) ||
+ !rotating_secrets->need_new_secrets(cutoff));
+ })) {
+ ldout(cct, 10) << __func__ << " done" << dendl;
+ return 0;
+ } else {
+ ldout(cct, 0) << __func__ << " timed out after " << timeout << dendl;
+ return -ETIMEDOUT;
+ }
+}
+
+// ---------
+
+void MonClient::_send_command(MonCommand *r) // send (or re-send) command r; tell commands are directed at one specific mon
+{
+ if (r->is_tell()) {
+ ++r->send_attempts;
+ if (r->send_attempts > cct->_conf->mon_client_directed_command_retry) { // too many attempts: fail the command
+ _finish_command(r, monc_errc::mon_unavailable, "mon unavailable", {});
+ return;
+ }
+ // tell-style command
+ if (monmap.min_mon_release >= ceph_release_t::octopus) { // octopus+: use a dedicated anonymous connection per command
+ if (r->target_con) {
+ r->target_con->mark_down(); // drop the connection left over from a previous attempt
+ }
+ if (r->target_rank >= 0) { // target given by rank
+ if (r->target_rank >= (int)monmap.size()) {
+ ldout(cct, 10) << " target " << r->target_rank
+ << " >= max mon " << monmap.size() << dendl;
+ _finish_command(r, monc_errc::rank_dne, "mon rank dne"sv, {});
+ return;
+ }
+ r->target_con = messenger->connect_to_mon(
+ monmap.get_addrs(r->target_rank), true /* anon */);
+ } else { // target given by name
+ if (!monmap.contains(r->target_name)) {
+ ldout(cct, 10) << " target " << r->target_name
+ << " not present in monmap" << dendl;
+ _finish_command(r, monc_errc::mon_dne, "mon dne"sv, {});
+ return;
+ }
+ r->target_con = messenger->connect_to_mon(
+ monmap.get_addrs(r->target_name), true /* anon */);
+ }
+
+ r->target_session.reset(new MonConnection(cct, r->target_con, 0,
+ &auth_registry));
+ r->target_session->start(monmap.get_epoch(), entity_name);
+ r->last_send_attempt = ceph_clock_now(); // consulted by _check_tell_commands() to time out the attempt
+
+ MCommand *m = new MCommand(monmap.fsid);
+ m->set_tid(r->tid);
+ m->cmd = r->cmd;
+ m->set_data(r->inbl);
+ r->target_session->queue_command(m); // held by the session; MonConnection sends it once authentication completes
+ return;
+ }
+
+ // ugly legacy handling of pre-octopus mons
+ entity_addr_t peer; // stays default-constructed when there is no active connection
+ if (active_con) {
+ peer = active_con->get_con()->get_peer_addr();
+ }
+
+ if (r->target_rank >= 0 &&
+ r->target_rank != monmap.get_rank(peer)) { // connected to a different mon than the one requested
+ ldout(cct, 10) << __func__ << " " << r->tid << " " << r->cmd
+ << " wants rank " << r->target_rank
+ << ", reopening session"
+ << dendl;
+ if (r->target_rank >= (int)monmap.size()) {
+ ldout(cct, 10) << " target " << r->target_rank
+ << " >= max mon " << monmap.size() << dendl;
+ _finish_command(r, monc_errc::rank_dne, "mon rank dne"sv, {});
+ return;
+ }
+ _reopen_session(r->target_rank);
+ return; // nothing is sent here; NOTE(review): presumably resent once the new session is up - confirm
+ }
+ if (r->target_name.length() &&
+ r->target_name != monmap.get_name(peer)) { // same check, target given by name
+ ldout(cct, 10) << __func__ << " " << r->tid << " " << r->cmd
+ << " wants mon " << r->target_name
+ << ", reopening session"
+ << dendl;
+ if (!monmap.contains(r->target_name)) {
+ ldout(cct, 10) << " target " << r->target_name
+ << " not present in monmap" << dendl;
+ _finish_command(r, monc_errc::mon_dne, "mon dne"sv, {});
+ return;
+ }
+ _reopen_session(monmap.get_rank(r->target_name));
+ return;
+ }
+ // fall-thru to send 'normal' CLI command
+ }
+
+ // normal CLI command
+ ldout(cct, 10) << __func__ << " " << r->tid << " " << r->cmd << dendl;
+ auto m = ceph::make_message<MMonCommand>(monmap.fsid);
+ m->set_tid(r->tid);
+ m->cmd = r->cmd;
+ m->set_data(r->inbl);
+ _send_mon_message(std::move(m));
+ return;
+}
+
+// Re-send every tell command whose last send attempt is older than
+// mon_client_hunt_interval.  Commands that have never been sent (zero
+// last_send_attempt) are left alone.
+void MonClient::_check_tell_commands()
+{
+  auto now = ceph_clock_now();
+  for (auto it = mon_commands.begin(); it != mon_commands.end(); ) {
+    // step past the entry first: _send_command() may erase it
+    auto cmd = (it++)->second;
+    const bool overdue =
+      cmd->is_tell() &&
+      cmd->last_send_attempt != utime_t() &&
+      now - cmd->last_send_attempt > cct->_conf->mon_client_hunt_interval;
+    if (!overdue) {
+      continue;
+    }
+    ldout(cct,5) << __func__ << " timeout tell command " << cmd->tid << dendl;
+    _send_command(cmd);
+  }
+}
+
+// Re-send all pending commands, except tell commands when every mon is
+// octopus or newer: those use their own connection and need no special
+// resend when we finish hunting.
+void MonClient::_resend_mon_commands()
+{
+  for (auto it = mon_commands.begin(); it != mon_commands.end(); ) {
+    // step past the entry first: _send_command() may erase it
+    auto cmd = (it++)->second;
+    const bool has_own_connection =
+      cmd->is_tell() && monmap.min_mon_release >= ceph_release_t::octopus;
+    if (has_own_connection) {
+      continue;
+    }
+    _send_command(cmd);
+  }
+}
+
+// Complete the pending command that an MMonCommandAck answers.
+//
+// An ack carrying tid 0 is matched to the first entry of mon_commands (if
+// there is one); otherwise the command is looked up by tid and the ack is
+// dropped when no such command is pending.  A negative ack->r becomes an
+// error code in mon_category().  The reference on `ack` is released before
+// returning.
+void MonClient::handle_mon_command_ack(MMonCommandAck *ack)
+{
+  const uint64_t tid = ack->get_tid();
+  MonCommand *cmd = nullptr;
+
+  if (tid == 0 && !mon_commands.empty()) {
+    cmd = mon_commands.begin()->second;
+    ldout(cct, 10) << __func__ << " has tid 0, assuming it is " << cmd->tid << dendl;
+  } else {
+    const auto found = mon_commands.find(tid);
+    if (found == mon_commands.end()) {
+      ldout(cct, 10) << __func__ << " " << ack->get_tid() << " not found" << dendl;
+      ack->put();
+      return;
+    }
+    cmd = found->second;
+  }
+
+  ldout(cct, 10) << __func__ << " " << cmd->tid << " " << cmd->cmd << dendl;
+  bs::error_code ec;
+  if (ack->r < 0) {
+    ec = bs::error_code(-ack->r, mon_category());
+  }
+  _finish_command(cmd, ec, ack->rs,
+                  std::move(ack->get_data()));
+  ack->put();
+}
+
+// Complete the pending command that an MCommandReply answers.
+//
+// A reply carrying tid 0 is matched to the first entry of mon_commands (if
+// there is one); otherwise the command is looked up by tid and the reply is
+// dropped when no such command is pending.  A negative reply->r becomes an
+// error code in mon_category().  The reference on `reply` is released before
+// returning.
+void MonClient::handle_command_reply(MCommandReply *reply)
+{
+  const uint64_t tid = reply->get_tid();
+  MonCommand *cmd = nullptr;
+
+  if (tid == 0 && !mon_commands.empty()) {
+    cmd = mon_commands.begin()->second;
+    ldout(cct, 10) << __func__ << " has tid 0, assuming it is " << cmd->tid
+                   << dendl;
+  } else {
+    const auto found = mon_commands.find(tid);
+    if (found == mon_commands.end()) {
+      ldout(cct, 10) << __func__ << " " << reply->get_tid() << " not found"
+                     << dendl;
+      reply->put();
+      return;
+    }
+    cmd = found->second;
+  }
+
+  ldout(cct, 10) << __func__ << " " << cmd->tid << " " << cmd->cmd << dendl;
+  bs::error_code ec;
+  if (reply->r < 0) {
+    ec = bs::error_code(-reply->r, mon_category());
+  }
+  _finish_command(cmd, ec, reply->rs, std::move(reply->get_data()));
+  reply->put();
+}
+
+// Finish the pending command `tid` with monc_errc::timed_out.
+//
+// Must be called with monc_lock held.  Returns 0 when the command was found
+// and finished, -ENOENT when no command with that tid is pending.
+int MonClient::_cancel_mon_command(uint64_t tid)
+{
+  ceph_assert(ceph_mutex_is_locked(monc_lock));
+
+  const auto entry = mon_commands.find(tid);
+  if (entry != mon_commands.end()) {
+    ldout(cct, 10) << __func__ << " tid " << tid << dendl;
+    _finish_command(entry->second, monc_errc::timed_out, "timed out"sv, {});
+    return 0;
+  }
+
+  ldout(cct, 10) << __func__ << " tid " << tid << " dne" << dendl;
+  return -ENOENT;
+}
+
+// Deliver the result (ret, rs, bl) to the command's completion, then tear the
+// command down: its target connection (if any) is marked down, the entry is
+// removed from mon_commands and `r` is deleted.  `r` must not be used by the
+// caller afterwards.
+void MonClient::_finish_command(MonCommand *r, bs::error_code ret,
+                                std::string_view rs,
+                                ceph::buffer::list&& bl)
+{
+  ldout(cct, 10) << __func__ << " " << r->tid << " = " << ret << " " << rs
+                 << dendl;
+  ceph::async::post(std::move(r->onfinish), ret, std::string(rs),
+                    std::move(bl));
+  if (auto& target = r->target_con; target) {
+    target->mark_down();
+  }
+  mon_commands.erase(r->tid);
+  delete r;
+}
+
+// ---------
+
+// Complete the version request identified by m->handle with the versions
+// carried in the reply.  An unknown handle is only logged.  Must be called
+// with monc_lock held; the reference on `m` is released before returning.
+void MonClient::handle_get_version_reply(MMonGetVersionReply* m)
+{
+  ceph_assert(ceph_mutex_is_locked(monc_lock));
+
+  const auto pending = version_requests.find(m->handle);
+  if (pending != version_requests.end()) {
+    ldout(cct, 10) << __func__ << " finishing " << pending->first << " version "
+                   << m->version << dendl;
+    // take the completion out of the map before dropping the entry
+    auto completion = std::move(pending->second);
+    version_requests.erase(pending);
+    ceph::async::post(std::move(completion), bs::error_code(),
+                      m->version, m->oldest_version);
+  } else {
+    ldout(cct, 0) << __func__ << " version request with handle " << m->handle
+                  << " not found" << dendl;
+  }
+  m->put();
+}
+
+// AuthClient hook: produce the initial auth request for an outgoing
+// connection.
+//
+// For a non-mon peer an authorizer is built with our auth handler and its
+// payload is copied into *bl (-EACCES if there is no handler or no authorizer
+// can be built).  For a mon peer the request is delegated to the
+// MonConnection owning `con`: a tell command's session for anonymous
+// connections, otherwise one of pending_cons; -ENOENT if none matches.
+int MonClient::get_auth_request(
+  Connection *con,
+  AuthConnectionMeta *auth_meta,
+  uint32_t *auth_method,
+  std::vector<uint32_t> *preferred_modes,
+  ceph::buffer::list *bl)
+{
+  std::lock_guard l(monc_lock);
+  ldout(cct,10) << __func__ << " con " << con << " auth_method " << *auth_method
+                << dendl;
+
+  if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON) {
+    // not a mon: generate an authorizer
+    if (!auth) {
+      lderr(cct) << __func__ << " but no auth handler is set up" << dendl;
+      return -EACCES;
+    }
+    auth_meta->authorizer.reset(auth->build_authorizer(con->get_peer_type()));
+    if (!auth_meta->authorizer) {
+      lderr(cct) << __func__ << " failed to build_authorizer for type "
+                 << ceph_entity_type_name(con->get_peer_type()) << dendl;
+      return -EACCES;
+    }
+    auth_meta->auth_method = auth_meta->authorizer->protocol;
+    auth_registry.get_supported_modes(con->get_peer_type(),
+                                      auth_meta->auth_method,
+                                      preferred_modes);
+    *bl = auth_meta->authorizer->bl;
+    return 0;
+  }
+
+  // connection to a mon
+  ceph_assert(!auth_meta->authorizer);
+  if (con->is_anon()) {
+    for (auto& entry : mon_commands) {
+      auto& cmd = entry.second;
+      if (cmd->target_con == con) {
+        return cmd->target_session->get_auth_request(
+          auth_method, preferred_modes, bl,
+          entity_name, want_keys, rotating_secrets.get());
+      }
+    }
+  }
+  for (auto& entry : pending_cons) {
+    auto& candidate = entry.second;
+    if (candidate.is_con(con)) {
+      return candidate.get_auth_request(
+        auth_method, preferred_modes, bl,
+        entity_name, want_keys, rotating_secrets.get());
+    }
+  }
+  return -ENOENT;
+}
+
+// AuthClient hook: the peer wants another round of the auth exchange.
+//
+// For a non-mon peer the challenge in `bl` is added to the existing
+// authorizer and its updated payload is returned in *reply (-1 if there is no
+// auth handler or no authorizer).  For a mon peer the call is delegated to
+// the MonConnection owning `con`; -ENOENT if none matches.
+int MonClient::handle_auth_reply_more(
+  Connection *con,
+  AuthConnectionMeta *auth_meta,
+  const ceph::buffer::list& bl,
+  ceph::buffer::list *reply)
+{
+  std::lock_guard l(monc_lock);
+
+  if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON) {
+    // authorizer challenges
+    if (!auth || !auth_meta->authorizer) {
+      lderr(cct) << __func__ << " no authorizer?" << dendl;
+      return -1;
+    }
+    auth_meta->authorizer->add_challenge(cct, bl);
+    *reply = auth_meta->authorizer->bl;
+    return 0;
+  }
+
+  if (con->is_anon()) {
+    for (auto& entry : mon_commands) {
+      auto& cmd = entry.second;
+      if (cmd->target_con == con) {
+        return cmd->target_session->handle_auth_reply_more(
+          auth_meta, bl, reply);
+      }
+    }
+  }
+  for (auto& entry : pending_cons) {
+    auto& candidate = entry.second;
+    if (candidate.is_con(con)) {
+      return candidate.handle_auth_reply_more(auth_meta, bl, reply);
+    }
+  }
+  return -ENOENT;
+}
+
+int MonClient::handle_auth_done( // AuthClient hook: process the final auth reply received on con
+ Connection *con,
+ AuthConnectionMeta *auth_meta,
+ uint64_t global_id,
+ uint32_t con_mode,
+ const ceph::buffer::list& bl,
+ CryptoKey *session_key,
+ std::string *connection_secret)
+{
+ if (con->get_peer_type() == CEPH_ENTITY_TYPE_MON) {
+ std::lock_guard l(monc_lock); // only the mon branch takes monc_lock
+ if (con->is_anon()) { // anonymous connection: look for the tell command that owns it
+ for (auto& i : mon_commands) {
+ if (i.second->target_con == con) {
+ return i.second->target_session->handle_auth_done(
+ auth_meta, global_id, bl,
+ session_key, connection_secret);
+ }
+ }
+ }
+ for (auto& i : pending_cons) { // otherwise it should be one of the connections we are hunting with
+ if (i.second.is_con(con)) {
+ int r = i.second.handle_auth_done(
+ auth_meta, global_id, bl,
+ session_key, connection_secret);
+ if (r) {
+ pending_cons.erase(i.first); // i is invalid from here on; every path below returns without touching it
+ if (!pending_cons.empty()) {
+ return r; // other candidates are still in progress
+ }
+ } else {
+ active_con.reset(new MonConnection(std::move(i.second))); // this connection becomes the active one
+ pending_cons.clear(); // drop all remaining candidates
+ ceph_assert(active_con->have_session());
+ }
+
+ _finish_hunting(r);
+ if (r || monmap.get_epoch() > 0) { // on success with monmap epoch 0, _finish_auth is not called here
+ _finish_auth(r);
+ }
+ return r;
+ }
+ }
+ return -ENOENT;
+ } else {
+ // verify authorizer reply
+ auto p = bl.begin();
+ if (!auth_meta->authorizer->verify_reply(p, &auth_meta->connection_secret)) { // NOTE(review): authorizer is dereferenced without a null check - confirm it is always set on this path
+ ldout(cct, 0) << __func__ << " failed verifying authorizer reply"
+ << dendl;
+ return -EACCES;
+ }
+ auth_meta->session_key = auth_meta->authorizer->session_key;
+ return 0;
+ }
+}
+
+int MonClient::handle_auth_bad_method( // AuthClient hook: the peer rejected old_auth_method
+ Connection *con,
+ AuthConnectionMeta *auth_meta,
+ uint32_t old_auth_method,
+ int result,
+ const std::vector<uint32_t>& allowed_methods,
+ const std::vector<uint32_t>& allowed_modes)
+{
+ auth_meta->allowed_methods = allowed_methods; // recorded before monc_lock is taken
+
+ std::lock_guard l(monc_lock);
+ if (con->get_peer_type() == CEPH_ENTITY_TYPE_MON) {
+ if (con->is_anon()) { // anonymous connection: look for the tell command that owns it
+ for (auto& i : mon_commands) {
+ if (i.second->target_con == con) {
+ int r = i.second->target_session->handle_auth_bad_method(
+ old_auth_method,
+ result,
+ allowed_methods,
+ allowed_modes);
+ if (r < 0) {
+ auto ec = bs::error_code(-r, mon_category());
+ _finish_command(i.second, ec, "auth failed"sv, {}); // erases the map entry and deletes the command; i must not be used afterwards
+ }
+ return r;
+ }
+ }
+ }
+ for (auto& i : pending_cons) {
+ if (i.second.is_con(con)) {
+ int r = i.second.handle_auth_bad_method(old_auth_method,
+ result,
+ allowed_methods,
+ allowed_modes);
+ if (r == 0) {
+ return r; // try another method on this con
+ }
+ pending_cons.erase(i.first); // i is invalid from here on; every path below returns without touching it
+ if (!pending_cons.empty()) {
+ return r; // fail this con, maybe another con will succeed
+ }
+ // fail hunt
+ _finish_hunting(r);
+ _finish_auth(r);
+ return r;
+ }
+ }
+ return -ENOENT;
+ } else {
+ // huh...
+ ldout(cct,10) << __func__ << " hmm, they didn't like " << old_auth_method
+ << " result " << cpp_strerror(result)
+ << " and auth is " << (auth ? auth->get_protocol() : 0)
+ << dendl;
+ return -EACCES; // no retry with another method for non-mon peers
+ }
+}
+
+// AuthServer hook: verify the authorizer a connecting peer presented.
+//
+// Returns 1 when the peer is accepted (the dispatcher is notified via
+// ms_handle_fast_authentication), 0 when verification failed but a challenge
+// was newly added so the exchange continues, and a negative errno otherwise
+// (-EACCES for a bad/missing authorizer or auth mode, -EOPNOTSUPP when no
+// handler exists for `auth_method`).
+int MonClient::handle_auth_request(
+  Connection *con,
+  AuthConnectionMeta *auth_meta,
+  bool more,
+  uint32_t auth_method,
+  const ceph::buffer::list& payload,
+  ceph::buffer::list *reply)
+{
+  if (payload.length() == 0) {
+    // for some channels prior to nautilus (osd heartbeat), we
+    // tolerate the lack of an authorizer.
+    if (con->get_messenger()->require_authorizer) {
+      return -EACCES;
+    }
+    handle_authentication_dispatcher->ms_handle_fast_authentication(con);
+    return 1;
+  }
+
+  // the first payload byte is the auth mode; only authorizer modes are valid
+  auth_meta->auth_mode = payload[0];
+  const bool authorizer_mode =
+    auth_meta->auth_mode >= AUTH_MODE_AUTHORIZER &&
+    auth_meta->auth_mode <= AUTH_MODE_AUTHORIZER_MAX;
+  if (!authorizer_mode) {
+    return -EACCES;
+  }
+
+  AuthAuthorizeHandler *handler =
+    get_auth_authorize_handler(con->get_peer_type(), auth_method);
+  if (!handler) {
+    lderr(cct) << __func__ << " no AuthAuthorizeHandler found for auth method "
+               << auth_method << dendl;
+    return -EOPNOTSUPP;
+  }
+
+  auto challenge = &auth_meta->authorizer_challenge;
+  if (auth_meta->skip_authorizer_challenge) {
+    ldout(cct, 10) << __func__ << " skipping challenge on " << con << dendl;
+    challenge = nullptr;
+  }
+
+  // remember whether a challenge existed before verification
+  const bool had_challenge =
+    static_cast<bool>(auth_meta->authorizer_challenge);
+  const bool valid = handler->verify_authorizer(
+    cct,
+    *rotating_secrets,
+    payload,
+    auth_meta->get_connection_secret_length(),
+    reply,
+    &con->peer_name,
+    &con->peer_global_id,
+    &con->peer_caps_info,
+    &auth_meta->session_key,
+    &auth_meta->connection_secret,
+    challenge);
+  if (valid) {
+    handle_authentication_dispatcher->ms_handle_fast_authentication(con);
+    return 1;
+  }
+  if (!more && !had_challenge && auth_meta->authorizer_challenge) {
+    ldout(cct,10) << __func__ << " added challenge on " << con << dendl;
+    return 0;
+  }
+  ldout(cct,10) << __func__ << " bad authorizer on " << con << dendl;
+  // discard old challenge
+  auth_meta->authorizer_challenge.reset();
+  return -EACCES;
+}
+
+// Build an authorizer for `service_id` with the current auth handler, under
+// monc_lock.  Returns nullptr (after logging) when no auth handler exists.
+AuthAuthorizer* MonClient::build_authorizer(int service_id) const {
+  std::lock_guard l(monc_lock);
+  if (!auth) {
+    ldout(cct, 0) << __func__ << " for " << ceph_entity_type_name(service_id)
+                  << ", but no auth is available now" << dendl;
+    return nullptr;
+  }
+  return auth->build_authorizer(service_id);
+}
+
+#define dout_subsys ceph_subsys_monc
+#undef dout_prefix
+#define dout_prefix *_dout << "monclient" << (have_session() ? ": " : "(hunting): ")
+
+// Wrap `con` as a monitor connection; no traffic is sent until start().
+MonConnection::MonConnection(CephContext *cct,
+                             ConnectionRef con,
+                             uint64_t global_id,
+                             AuthRegistry *ar)
+  : cct(cct),
+    con(con),
+    global_id(global_id),
+    auth_registry(ar)
+{
+}
+
+// Mark the wrapped connection down (if there still is one) and release it.
+MonConnection::~MonConnection()
+{
+  if (!con) {
+    return;
+  }
+  con->mark_down();
+  con.reset();
+}
+
+// True once the connection has reached State::HAVE_SESSION.
+bool MonConnection::have_session() const
+{
+  const bool established = (state == State::HAVE_SESSION);
+  return established;
+}
+
+// Begin the session handshake on this connection.
+//
+// For a msgr2 peer address only an MMonGetMap is sent and the state moves to
+// AUTHENTICATING.  Otherwise the state becomes NEGOTIATING, a keepalive is
+// sent, and an MAuth with protocol CEPH_AUTH_UNKNOWN follows, carrying the
+// supported auth methods, the entity name and the current global_id.
+void MonConnection::start(epoch_t epoch,
+                          const EntityName& entity_name)
+{
+  using ceph::encode;
+  auth_start = ceph_clock_now();
+
+  const bool msgr2_peer = con->get_peer_addr().is_msgr2();
+  if (msgr2_peer) {
+    ldout(cct, 10) << __func__ << " opening mon connection" << dendl;
+    state = State::AUTHENTICATING;
+    con->send_message(new MMonGetMap());
+    return;
+  }
+
+  // restart authentication handshake
+  state = State::NEGOTIATING;
+
+  // send an initial keepalive to ensure our timestamp is valid by the
+  // time we are in an OPENED state (by sequencing this before
+  // authentication).
+  con->send_keepalive();
+
+  auto request = new MAuth;
+  request->protocol = CEPH_AUTH_UNKNOWN;
+  request->monmap_epoch = epoch;
+
+  // payload: struct_v, supported methods, entity name, global id
+  __u8 struct_v = 1;
+  encode(struct_v, request->auth_payload);
+  std::vector<uint32_t> methods;
+  auth_registry->get_supported_methods(con->get_peer_type(), &methods);
+  encode(methods, request->auth_payload);
+  encode(entity_name, request->auth_payload);
+  encode(global_id, request->auth_payload);
+  con->send_message(request);
+}
+
+// Fill in the initial msgr2 auth request for this mon connection.
+//
+// Picks the first supported auth method if none is selected yet, reports it
+// through *method together with the preferred connection modes, (re)sets up
+// the auth handler and encodes the request into *bl.  Returns 0 on success
+// and -EACCES when no method or no mode is available.
+int MonConnection::get_auth_request(
+  uint32_t *method,
+  std::vector<uint32_t> *preferred_modes,
+  ceph::buffer::list *bl,
+  const EntityName& entity_name,
+  uint32_t want_keys,
+  RotatingKeyRing* keyring)
+{
+  using ceph::encode;
+
+  // choose a method unless one is already selected
+  if (auth_method < 0) {
+    std::vector<uint32_t> supported;
+    auth_registry->get_supported_methods(con->get_peer_type(), &supported);
+    if (supported.empty()) {
+      return -EACCES;
+    }
+    auth_method = supported.front();
+  }
+  *method = auth_method;
+
+  auth_registry->get_supported_modes(con->get_peer_type(), auth_method,
+                                     preferred_modes);
+  ldout(cct,10) << __func__ << " method " << *method
+                << " preferred_modes " << *preferred_modes << dendl;
+  if (preferred_modes->empty()) {
+    return -EACCES;
+  }
+
+  const int r = _init_auth(*method, entity_name, want_keys, keyring, true);
+  ceph_assert(r == 0);
+
+  // the initial request starts with some boilerplate...
+  encode(static_cast<char>(AUTH_MODE_MON), *bl);
+  encode(entity_name, *bl);
+  encode(global_id, *bl);
+
+  // ...followed by (maybe) some method-specific initial payload
+  auth->build_initial_request(bl);
+
+  return 0;
+}
+
+// Feed an intermediate auth reply to the auth handler.
+//
+// When the handler asks for another round (-EAGAIN) the next request is built
+// into *reply and 0 is returned.  Other negative results are logged and
+// returned.  A non-negative result is not implemented on this path and
+// aborts.
+int MonConnection::handle_auth_reply_more(
+  AuthConnectionMeta *auth_meta,
+  const ceph::buffer::list& bl,
+  ceph::buffer::list *reply)
+{
+  ldout(cct, 10) << __func__ << " payload " << bl.length() << dendl;
+  // dump the raw payload at debug level 30
+  ldout(cct, 30) << __func__ << " got\n";
+  bl.hexdump(*_dout);
+  *_dout << dendl;
+
+  auto p = bl.cbegin();
+  ldout(cct, 10) << __func__ << " payload_len " << bl.length() << dendl;
+  int r = auth->handle_response(0, p, &auth_meta->session_key,
+                                &auth_meta->connection_secret);
+  if (r == -EAGAIN) {
+    auth->prepare_build_request();
+    auth->build_request(*reply);
+    ldout(cct, 10) << __func__ << " responding with " << reply->length()
+                   << " bytes" << dendl;
+    return 0;
+  }
+  if (r < 0) {
+    lderr(cct) << __func__ << " handle_response returned " << r << dendl;
+    return r;
+  }
+
+  ldout(cct, 10) << __func__ << " authenticated!" << dendl;
+  // FIXME
+  ceph_abort(cct, "write me");
+  return r;
+}
+
+// Process the final auth reply for this mon connection.
+//
+// Adopts `new_global_id`, hands the payload to the auth handler and, when the
+// handler reports a non-negative result, moves to HAVE_SESSION.  The
+// keepalive-ack timestamp is set to the time start() was called, and a queued
+// tell command (if any) is sent.  Returns the handler's result.  The
+// session_key / connection_secret parameters are not used here; the handler
+// writes into auth_meta instead.
+int MonConnection::handle_auth_done(
+  AuthConnectionMeta *auth_meta,
+  uint64_t new_global_id,
+  const ceph::buffer::list& bl,
+  CryptoKey *session_key,
+  std::string *connection_secret)
+{
+  ldout(cct,10) << __func__ << " global_id " << new_global_id
+                << " payload " << bl.length()
+                << dendl;
+
+  global_id = new_global_id;
+  auth->set_global_id(global_id);
+
+  auto payload_it = bl.begin();
+  const int auth_err = auth->handle_response(0, payload_it,
+                                             &auth_meta->session_key,
+                                             &auth_meta->connection_secret);
+  if (!(auth_err < 0)) {
+    state = State::HAVE_SESSION;
+  }
+  con->set_last_keepalive_ack(auth_start);
+
+  if (pending_tell_command) {
+    con->send_message2(std::move(pending_tell_command));
+  }
+  return auth_err;
+}
+
+// The peer rejected `old_auth_method`: pick the next method to try.
+//
+// Searches our supported methods *after* old_auth_method for the first one
+// that also appears in `allowed_methods` and stores it in auth_method.
+// Returns 0 when such a method exists and -EACCES otherwise.  `result` is
+// only logged and `allowed_modes` is not consulted.
+int MonConnection::handle_auth_bad_method(
+  uint32_t old_auth_method,
+  int result,
+  const std::vector<uint32_t>& allowed_methods,
+  const std::vector<uint32_t>& allowed_modes)
+{
+  ldout(cct,10) << __func__ << " old_auth_method " << old_auth_method
+                << " result " << cpp_strerror(result)
+                << " allowed_methods " << allowed_methods << dendl;
+  std::vector<uint32_t> auth_supported;
+  auth_registry->get_supported_methods(con->get_peer_type(), &auth_supported);
+  auto p = std::find(auth_supported.begin(), auth_supported.end(),
+                     old_auth_method);
+  // Use ceph_assert like the rest of this file: a plain assert() vanishes
+  // under NDEBUG, and std::next() on the end iterator below would then be
+  // undefined behaviour.
+  ceph_assert(p != auth_supported.end());
+  p = std::find_first_of(std::next(p), auth_supported.end(),
+                         allowed_methods.begin(), allowed_methods.end());
+  if (p == auth_supported.end()) {
+    lderr(cct) << __func__ << " server allowed_methods " << allowed_methods
+               << " but i only support " << auth_supported << dendl;
+    return -EACCES;
+  }
+  auth_method = *p;
+  ldout(cct,10) << __func__ << " will try " << auth_method << " next" << dendl;
+  return 0;
+}
+
+// Process an MAuthReply.  While NEGOTIATING the auth handler is set up first
+// (a non-zero result from that step is returned as is); then the reply is
+// run through authenticate().  On a 0 result the connection moves to
+// HAVE_SESSION.
+int MonConnection::handle_auth(MAuthReply* m,
+                               const EntityName& entity_name,
+                               uint32_t want_keys,
+                               RotatingKeyRing* keyring)
+{
+  if (state == State::NEGOTIATING) {
+    if (int r = _negotiate(m, entity_name, want_keys, keyring); r != 0) {
+      return r;
+    }
+    state = State::AUTHENTICATING;
+  }
+
+  const int r = authenticate(m);
+  if (r == 0) {
+    state = State::HAVE_SESSION;
+  }
+  return r;
+}
+
+// Set up the auth handler for the protocol named in the reply.  When no
+// handler exists for it (-ENOTSUP from _init_auth), the reply's own result is
+// returned instead; any other _init_auth result is returned unchanged.
+int MonConnection::_negotiate(MAuthReply *m,
+                              const EntityName& entity_name,
+                              uint32_t want_keys,
+                              RotatingKeyRing* keyring)
+{
+  ldout(cct, 10) << __func__ << dendl;
+  const int r = _init_auth(m->protocol, entity_name, want_keys, keyring, false);
+  if (r != -ENOTSUP) {
+    return r;
+  }
+  if (m->result == -ENOTSUP) {
+    ldout(cct, 10) << "none of our auth protocols are supported by the server"
+                   << dendl;
+  }
+  return m->result;
+}
+
+// Make sure `auth` is a handler for `method`.
+//
+// An existing handler of the same protocol is simply reset.  Otherwise a new
+// handler is created (-ENOTSUP if none exists for `method`) and initialized
+// with the wanted keys, entity name and current global_id.  Returns 0 on
+// success.
+int MonConnection::_init_auth(
+  uint32_t method,
+  const EntityName& entity_name,
+  uint32_t want_keys,
+  RotatingKeyRing* keyring,
+  bool msgr2)
+{
+  ldout(cct, 10) << __func__ << " method " << method << dendl;
+
+  const bool reusable = auth && auth->get_protocol() == static_cast<int>(method);
+  if (reusable) {
+    ldout(cct, 10) << __func__ << " already have auth, reseting" << dendl;
+    auth->reset();
+    return 0;
+  }
+
+  ldout(cct, 10) << __func__ << " creating new auth" << dendl;
+  auth.reset(AuthClientHandler::create(cct, method, keyring));
+  if (!auth) {
+    ldout(cct, 10) << " no handler for protocol " << method << dendl;
+    return -ENOTSUP;
+  }
+
+  // do not request MGR key unless the mon has the SERVER_KRAKEN
+  // feature.  otherwise it will give us an auth error.  note that
+  // we have to use the FEATUREMASK because pre-jewel the kraken
+  // feature bit was used for something else.
+  if (!msgr2 && (want_keys & CEPH_ENTITY_TYPE_MGR)) {
+    if (!(con->has_features(CEPH_FEATUREMASK_SERVER_KRAKEN))) {
+      ldout(cct, 1) << __func__
+                    << " not requesting MGR keys from pre-kraken monitor"
+                    << dendl;
+      want_keys &= ~CEPH_ENTITY_TYPE_MGR;
+    }
+  }
+
+  auth->set_want_keys(want_keys);
+  auth->init(entity_name);
+  auth->set_global_id(global_id);
+  return 0;
+}
+
+// Run one MAuthReply through the auth handler.
+//
+// A global_id differing from ours starts a new session (handler reset, id
+// adopted).  If the handler wants another round (-EAGAIN) the next MAuth is
+// sent; on 0 a queued tell command (if any) is sent.  Returns the handler's
+// result.
+int MonConnection::authenticate(MAuthReply *m)
+{
+  ceph_assert(auth);
+
+  if (!m->global_id) {
+    ldout(cct, 1) << "peer sent an invalid global_id" << dendl;
+  }
+  if (m->global_id != global_id) {
+    // it's a new session
+    auth->reset();
+    global_id = m->global_id;
+    auth->set_global_id(global_id);
+    ldout(cct, 10) << "my global_id is " << m->global_id << dendl;
+  }
+
+  auto payload_it = m->result_bl.cbegin();
+  const int ret = auth->handle_response(m->result, payload_it, nullptr, nullptr);
+  if (ret == -EAGAIN) {
+    auto next = new MAuth;
+    next->protocol = auth->get_protocol();
+    auth->prepare_build_request();
+    auth->build_request(next->auth_payload);
+    con->send_message(next);
+  } else if (ret == 0 && pending_tell_command) {
+    con->send_message2(std::move(pending_tell_command));
+  }
+
+  return ret;
+}
+
+// Install the config callback.  Only one callback may ever be registered:
+// asserts that none is set yet.
+void MonClient::register_config_callback(md_config_t::config_callback fn)
+{
+  ceph_assert(!config_cb);
+  config_cb = fn;
+}
+
+// Return (a copy of) the registered config callback.
+md_config_t::config_callback MonClient::get_config_callback()
+{
+  return config_cb;
+}
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wnon-virtual-dtor"
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wnon-virtual-dtor"
+class monc_error_category : public ceph::converting_category { // error category for monc_errc values; the single instance is returned by monc_category()
+public:
+ monc_error_category(){}
+ const char* name() const noexcept override; // "monc"
+ const char* message(int ev, char*, std::size_t) const noexcept override; // static message text; the buffer arguments are unused
+ std::string message(int ev) const override; // same text as the overload above, as a std::string
+ bs::error_condition default_error_condition(int ev) const noexcept
+ override; // maps each monc_errc to a generic condition
+ bool equivalent(int ev, const bs::error_condition& c) const
+ noexcept override; // rank_dne/mon_dne additionally compare equal to no_such_file_or_directory
+ using ceph::converting_category::equivalent;
+ int from_code(int ev) const noexcept override; // monc_errc value -> negative errno
+};
+#pragma GCC diagnostic pop
+#pragma clang diagnostic pop
+
+const char* monc_error_category::name() const noexcept {
+ return "monc";
+}
+
+// Human-readable text for a monc_errc value.  The returned pointer refers to
+// a string literal; the caller-supplied buffer is never written to.
+const char* monc_error_category::message(int ev, char*, std::size_t) const noexcept {
+  if (ev == 0) {
+    return "No error";
+  }
+
+  const char* text = "Unknown error";
+  switch (static_cast<monc_errc>(ev)) {
+  case monc_errc::shutting_down:
+    text = "Command failed due to MonClient shutting down";
+    break;
+  case monc_errc::session_reset:
+    text = "Monitor session was reset";
+    break;
+  case monc_errc::rank_dne:
+    text = "Requested monitor rank does not exist";
+    break;
+  case monc_errc::mon_dne:
+    text = "Requested monitor does not exist";
+    break;
+  case monc_errc::timed_out:
+    text = "Monitor operation timed out";
+    break;
+  case monc_errc::mon_unavailable:
+    text = "Monitor unavailable";
+    break;
+  }
+  return text;
+}
+
+// std::string flavour of message(): wraps the static text of the buffer
+// overload.
+std::string monc_error_category::message(int ev) const
+{
+  return std::string(message(ev, nullptr, 0));
+}
+
+// Map a monc_errc value to the generic condition it corresponds to.  Values
+// outside the enum yield a condition in this category with the same value.
+bs::error_condition monc_error_category::default_error_condition(int ev) const noexcept {
+  switch (static_cast<monc_errc>(ev)) {
+  case monc_errc::shutting_down:
+    return bs::errc::operation_canceled;
+  case monc_errc::session_reset:
+    return bs::errc::resource_unavailable_try_again;
+  // a missing rank and a missing name are both "not in map"
+  case monc_errc::rank_dne:
+  case monc_errc::mon_dne:
+    return ceph::errc::not_in_map;
+  case monc_errc::timed_out:
+    return bs::errc::timed_out;
+  case monc_errc::mon_unavailable:
+    return bs::errc::no_such_device;
+  }
+  return bs::error_condition(ev, *this);
+}
+
+// Decide whether monc_errc value `ev` matches condition `c`.  rank_dne and
+// mon_dne are compared against no_such_file_or_directory; everything else is
+// compared against its default_error_condition().
+bool monc_error_category::equivalent(int ev, const bs::error_condition& c) const noexcept {
+  const auto code = static_cast<monc_errc>(ev);
+  if (code == monc_errc::rank_dne || code == monc_errc::mon_dne) {
+    return c == bs::errc::no_such_file_or_directory;
+  }
+  return default_error_condition(ev) == c;
+}
+
+// Convert a monc_errc value to a negative errno.  0 stays 0 and values
+// outside the enum become -EDOM.
+int monc_error_category::from_code(int ev) const noexcept {
+  if (ev == 0) {
+    return 0;
+  }
+
+  int err = -EDOM;
+  switch (static_cast<monc_errc>(ev)) {
+  case monc_errc::shutting_down:
+    err = -ECANCELED;
+    break;
+  case monc_errc::session_reset:
+    err = -EAGAIN;
+    break;
+  case monc_errc::rank_dne:
+  case monc_errc::mon_dne:
+    err = -ENOENT;
+    break;
+  case monc_errc::timed_out:
+    err = -ETIMEDOUT;
+    break;
+  case monc_errc::mon_unavailable:
+    err = -ENXIO;
+    break;
+  }
+  return err;
+}
+
+// Accessor for the one monc_error_category object (a function-local static).
+const bs::error_category& monc_category() noexcept
+{
+  static const monc_error_category instance;
+  return instance;
+}
diff --git a/src/mon/MonClient.h b/src/mon/MonClient.h
new file mode 100644
index 000000000..803c74eb7
--- /dev/null
+++ b/src/mon/MonClient.h
@@ -0,0 +1,789 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#ifndef CEPH_MONCLIENT_H
+#define CEPH_MONCLIENT_H
+
+#include <functional>
+#include <list>
+#include <map>
+#include <memory>
+#include <set>
+#include <string>
+#include <vector>
+
+#include <boost/asio/io_context.hpp>
+#include <boost/asio/steady_timer.hpp>
+#include <boost/asio/strand.hpp>
+
+#include "msg/Messenger.h"
+
+#include "MonMap.h"
+#include "MonSub.h"
+
+#include "common/admin_socket.h"
+#include "common/async/completion.h"
+#include "common/Timer.h"
+#include "common/config.h"
+#include "messages/MMonGetVersion.h"
+
+#include "auth/AuthClient.h"
+#include "auth/AuthServer.h"
+
+class MMonMap;
+class MConfig;
+class MMonGetVersionReply;
+class MMonCommandAck;
+class LogClient;
+class AuthClientHandler;
+class AuthRegistry;
+class KeyRing;
+class RotatingKeyRing;
+
+class MonConnection { // one connection to a monitor together with its authentication state
+public:
+ MonConnection(CephContext *cct,
+ ConnectionRef conn,
+ uint64_t global_id,
+ AuthRegistry *auth_registry);
+ ~MonConnection(); // marks the connection down if one is still held
+ MonConnection(MonConnection&& rhs) = default; // movable...
+ MonConnection& operator=(MonConnection&&) = default;
+ MonConnection(const MonConnection& rhs) = delete; // ...but not copyable
+ MonConnection& operator=(const MonConnection&) = delete;
+ int handle_auth(MAuthReply *m,
+ const EntityName& entity_name,
+ uint32_t want_keys,
+ RotatingKeyRing* keyring);
+ int authenticate(MAuthReply *m);
+ void start(epoch_t epoch,
+ const EntityName& entity_name);
+ bool have_session() const; // true once state is HAVE_SESSION
+ uint64_t get_global_id() const {
+ return global_id;
+ }
+ ConnectionRef get_con() {
+ return con;
+ }
+ std::unique_ptr<AuthClientHandler>& get_auth() { // reference to the owning pointer itself
+ return auth;
+ }
+
+ int get_auth_request(
+ uint32_t *method,
+ std::vector<uint32_t> *preferred_modes,
+ ceph::buffer::list *out,
+ const EntityName& entity_name,
+ uint32_t want_keys,
+ RotatingKeyRing* keyring);
+ int handle_auth_reply_more(
+ AuthConnectionMeta *auth_meta,
+ const ceph::buffer::list& bl,
+ ceph::buffer::list *reply);
+ int handle_auth_done(
+ AuthConnectionMeta *auth_meta,
+ uint64_t global_id,
+ const ceph::buffer::list& bl,
+ CryptoKey *session_key,
+ std::string *connection_secret);
+ int handle_auth_bad_method(
+ uint32_t old_auth_method,
+ int result,
+ const std::vector<uint32_t>& allowed_methods,
+ const std::vector<uint32_t>& allowed_modes);
+
+ bool is_con(Connection *c) const { // does this object wrap connection c?
+ return con.get() == c;
+ }
+ void queue_command(Message *m) { // remember m; replaces any previously queued message
+ pending_tell_command = m; // NOTE(review): assigning a raw pointer to a MessageRef presumably takes an extra reference - confirm the creator's reference is released
+ }
+
+private:
+ int _negotiate(MAuthReply *m,
+ const EntityName& entity_name,
+ uint32_t want_keys,
+ RotatingKeyRing* keyring);
+ int _init_auth(uint32_t method,
+ const EntityName& entity_name,
+ uint32_t want_keys,
+ RotatingKeyRing* keyring,
+ bool msgr2);
+
+private:
+ CephContext *cct;
+ enum class State {
+ NONE,
+ NEGOTIATING, // v1 only
+ AUTHENTICATING, // v1 and v2
+ HAVE_SESSION,
+ };
+ State state = State::NONE;
+ ConnectionRef con;
+ int auth_method = -1; // negative means no method chosen yet
+ utime_t auth_start;
+
+ std::unique_ptr<AuthClientHandler> auth;
+ uint64_t global_id;
+
+ MessageRef pending_tell_command; // message set via queue_command()
+
+ AuthRegistry *auth_registry;
+};
+
+
+struct MonClientPinger : public Dispatcher, // dispatcher that waits for a single CEPH_MSG_PING reply
+ public AuthClient {
+ ceph::mutex lock = ceph::make_mutex("MonClientPinger::lock");
+ ceph::condition_variable ping_recvd_cond;
+ std::string *result; // optional output: decoded ping payload; may be null
+ bool done; // set when a ping reply arrives or the connection resets
+ RotatingKeyRing *keyring;
+ std::unique_ptr<MonConnection> mc; // not set by the constructor; dereferenced by the AuthClient hooks below
+
+ MonClientPinger(CephContext *cct_,
+ RotatingKeyRing *keyring,
+ std::string *res_) :
+ Dispatcher(cct_),
+ result(res_),
+ done(false),
+ keyring(keyring)
+ { }
+
+ int wait_for_reply(double timeout = 0.0) { // timeout <= 0 means "use client_mount_timeout"
+ std::unique_lock locker{lock};
+ if (timeout <= 0) {
+ timeout = std::chrono::duration<double>(cct->_conf.get_val<std::chrono::seconds>("client_mount_timeout")).count();
+ }
+ done = false; // note: discards a completion signalled before this call
+ if (ping_recvd_cond.wait_for(locker,
+ ceph::make_timespan(timeout),
+ [this] { return done; })) {
+ return 0;
+ } else {
+ return ETIMEDOUT; // positive errno, unlike the negative values returned elsewhere in this file
+ }
+ }
+
+ bool ms_dispatch(Message *m) override { // handles only CEPH_MSG_PING; other messages are left to other dispatchers
+ using ceph::decode;
+ std::lock_guard l(lock);
+ if (m->get_type() != CEPH_MSG_PING)
+ return false;
+
+ ceph::buffer::list &payload = m->get_payload();
+ if (result && payload.length() > 0) {
+ auto p = std::cbegin(payload);
+ decode(*result, p);
+ }
+ done = true;
+ ping_recvd_cond.notify_all();
+ m->put();
+ return true;
+ }
+ bool ms_handle_reset(Connection *con) override { // a reset also wakes the waiter
+ std::lock_guard l(lock);
+ done = true;
+ ping_recvd_cond.notify_all();
+ return true;
+ }
+ void ms_handle_remote_reset(Connection *con) override {}
+ bool ms_handle_refused(Connection *con) override {
+ return false;
+ }
+
+ // AuthClient
+ int get_auth_request( // all four hooks forward to mc
+ Connection *con,
+ AuthConnectionMeta *auth_meta,
+ uint32_t *auth_method,
+ std::vector<uint32_t> *preferred_modes,
+ ceph::buffer::list *bl) override {
+ return mc->get_auth_request(auth_method, preferred_modes, bl,
+ cct->_conf->name, 0, keyring); // want_keys is 0: no service keys are requested
+ }
+ int handle_auth_reply_more(
+ Connection *con,
+ AuthConnectionMeta *auth_meta,
+ const ceph::buffer::list& bl,
+ ceph::buffer::list *reply) override {
+ return mc->handle_auth_reply_more(auth_meta, bl, reply);
+ }
+ int handle_auth_done(
+ Connection *con,
+ AuthConnectionMeta *auth_meta,
+ uint64_t global_id,
+ uint32_t con_mode,
+ const ceph::buffer::list& bl,
+ CryptoKey *session_key,
+ std::string *connection_secret) override {
+ return mc->handle_auth_done(auth_meta, global_id, bl,
+ session_key, connection_secret);
+ }
+ int handle_auth_bad_method(
+ Connection *con,
+ AuthConnectionMeta *auth_meta,
+ uint32_t old_auth_method,
+ int result,
+ const std::vector<uint32_t>& allowed_methods,
+ const std::vector<uint32_t>& allowed_modes) override {
+ return mc->handle_auth_bad_method(old_auth_method, result,
+ allowed_methods, allowed_modes);
+ }
+};
+
+const boost::system::error_category& monc_category() noexcept;
+
+enum class monc_errc { // error values of monc_category(); numbering starts at 1
+ shutting_down = 1, // Command failed due to MonClient shutting down
+ session_reset, // Monitor session was reset
+ rank_dne, // Requested monitor rank does not exist
+ mon_dne, // Requested monitor does not exist
+ timed_out, // Monitor operation timed out
+ mon_unavailable // Monitor unavailable
+};
+
+namespace boost::system {
+template<>
+struct is_error_code_enum<::monc_errc> { // opt monc_errc in to implicit conversion to boost::system::error_code
+ static const bool value = true;
+};
+}
+
+// Used for the implicit monc_errc -> error_code conversion: pairs the enum's
+// integer value with monc_category().
+inline boost::system::error_code make_error_code(monc_errc e) noexcept
+{
+  return boost::system::error_code(static_cast<int>(e), monc_category());
+}
+
+// Explicit monc_errc -> error_condition conversion: pairs the enum's integer
+// value with monc_category().
+inline boost::system::error_condition make_error_condition(monc_errc e) noexcept
+{
+  return boost::system::error_condition(static_cast<int>(e), monc_category());
+}
+
+const boost::system::error_category& monc_category() noexcept;
+
+class MonClient : public Dispatcher,
+ public AuthClient,
+ public AuthServer, /* for mgr, osd, mds */
+ public AdminSocketHook {
+ static constexpr auto dout_subsys = ceph_subsys_monc;
+public:
+ // Error, Newest, Oldest
+ using VersionSig = void(boost::system::error_code, version_t, version_t);
+ using VersionCompletion = ceph::async::Completion<VersionSig>;
+
+ using CommandSig = void(boost::system::error_code, std::string,
+ ceph::buffer::list);
+ using CommandCompletion = ceph::async::Completion<CommandSig>;
+
+ MonMap monmap;
+ std::map<std::string,std::string> config_mgr;
+
+private:
+ Messenger *messenger;
+
+ std::unique_ptr<MonConnection> active_con;
+ std::map<entity_addrvec_t, MonConnection> pending_cons;
+ std::set<unsigned> tried;
+
+ EntityName entity_name;
+
+ mutable ceph::mutex monc_lock = ceph::make_mutex("MonClient::monc_lock");
+ SafeTimer timer;
+ boost::asio::io_context& service;
+ boost::asio::strand<boost::asio::io_context::executor_type>
+ finish_strand{service.get_executor()};
+
+ bool initialized;
+ bool stopping = false;
+
+ LogClient *log_client;
+ bool more_log_pending;
+
+ void send_log(bool flush = false);
+
+ bool ms_dispatch(Message *m) override;
+ bool ms_handle_reset(Connection *con) override;
+ // No-op override: the body is empty.
+ void ms_handle_remote_reset(Connection *con) override {}
+ // Always returns false; con is not inspected.
+ bool ms_handle_refused(Connection *con) override { return false; }
+
+ void handle_monmap(MMonMap *m);
+ void handle_config(MConfig *m);
+
+ void handle_auth(MAuthReply *m);
+
+ int call(
+ std::string_view command,
+ const cmdmap_t& cmdmap,
+ const ceph::buffer::list &inbl,
+ ceph::Formatter *f,
+ std::ostream& errss,
+ ceph::buffer::list& out) override;
+
+ // monitor session
+ utime_t last_keepalive;
+ utime_t last_send_log;
+
+ void tick();
+ void schedule_tick();
+
+ // monclient
+ bool want_monmap;
+ ceph::condition_variable map_cond;
+ bool passthrough_monmap = false;
+
+ bool want_bootstrap_config = false;
+ ceph::ref_t<MConfig> bootstrap_config;
+
+ // authenticate
+ std::unique_ptr<AuthClientHandler> auth;
+ uint32_t want_keys = 0;
+ uint64_t global_id = 0;
+ ceph::condition_variable auth_cond;
+ int authenticate_err = 0;
+ bool authenticated = false;
+
+ std::list<MessageRef> waiting_for_session;
+ utime_t last_rotating_renew_sent;
+ bool had_a_connection;
+ double reopen_interval_multiplier;
+
+ Dispatcher *handle_authentication_dispatcher = nullptr;
+ bool _opened() const;
+ bool _hunting() const;
+ void _start_hunting();
+ void _finish_hunting(int auth_err);
+ void _finish_auth(int auth_err);
+ void _reopen_session(int rank = -1);
+ void _add_conn(unsigned rank);
+ void _add_conns();
+ void _un_backoff();
+ void _send_mon_message(MessageRef m);
+
+ // Linear scan of pending_cons for the entry whose connection equals `con`.
+ // Returns pending_cons.end() when no entry matches.
+ std::map<entity_addrvec_t, MonConnection>::iterator _find_pending_con(
+     const ConnectionRef& con) {
+   auto pos = pending_cons.begin();
+   while (pos != pending_cons.end() && !(pos->second.get_con() == con)) {
+     ++pos;
+   }
+   return pos;
+ }
+
+public:
+ // AuthClient
+ int get_auth_request(
+ Connection *con,
+ AuthConnectionMeta *auth_meta,
+ uint32_t *method,
+ std::vector<uint32_t> *preferred_modes,
+ ceph::buffer::list *bl) override;
+ int handle_auth_reply_more(
+ Connection *con,
+ AuthConnectionMeta *auth_meta,
+ const ceph::buffer::list& bl,
+ ceph::buffer::list *reply) override;
+ int handle_auth_done(
+ Connection *con,
+ AuthConnectionMeta *auth_meta,
+ uint64_t global_id,
+ uint32_t con_mode,
+ const ceph::buffer::list& bl,
+ CryptoKey *session_key,
+ std::string *connection_secret) override;
+ int handle_auth_bad_method(
+ Connection *con,
+ AuthConnectionMeta *auth_meta,
+ uint32_t old_auth_method,
+ int result,
+ const std::vector<uint32_t>& allowed_methods,
+ const std::vector<uint32_t>& allowed_modes) override;
+ // AuthServer
+ int handle_auth_request(
+ Connection *con,
+ AuthConnectionMeta *auth_meta,
+ bool more,
+ uint32_t auth_method,
+ const ceph::buffer::list& bl,
+ ceph::buffer::list *reply) override;
+
+ // Store `name` in the entity_name member.
+ void set_entity_name(EntityName name) { entity_name = name; }
+ // Record the dispatcher pointer `d` in handle_authentication_dispatcher.
+ // Only the raw pointer is stored.
+ void set_handle_authentication_dispatcher(Dispatcher *d) {
+ handle_authentication_dispatcher = d;
+ }
+ int _check_auth_tickets();
+ int _check_auth_rotating();
+ int wait_auth_rotating(double timeout);
+
+ int authenticate(double timeout=0.0);
+ // Returns the `authenticated` flag. The flag is read without taking monc_lock.
+ bool is_authenticated() const {return authenticated;}
+
+ // True when active_con is non-null. No lock is taken.
+ bool is_connected() const { return active_con != nullptr; }
+
+ /**
+ * Try to flush as many log messages as we can in a single
+ * message. Use this before shutting down to transmit your
+ * last message.
+ */
+ void flush_log();
+
+private:
+ // mon subscriptions
+ MonSub sub;
+ void _renew_subs();
+ void handle_subscribe_ack(MMonSubscribeAck* m);
+
+public:
+ // Locked wrapper: runs _renew_subs() while holding monc_lock.
+ void renew_subs() {
+   std::scoped_lock guard(monc_lock);
+   _renew_subs();
+ }
+ // Calls sub.want(what, start, flags) under monc_lock and returns its result.
+ bool sub_want(std::string what, version_t start, unsigned flags) {
+   std::scoped_lock guard(monc_lock);
+   const bool result = sub.want(what, start, flags);
+   return result;
+ }
+ // Calls sub.got(what, have) under monc_lock.
+ void sub_got(std::string what, version_t have) {
+   std::scoped_lock guard(monc_lock);
+   sub.got(what, have);
+ }
+ // Calls sub.unwant(what) under monc_lock.
+ void sub_unwant(std::string what) {
+   std::scoped_lock guard(monc_lock);
+   sub.unwant(what);
+ }
+ // Calls sub.inc_want(what, start, flags) under monc_lock and returns its result.
+ bool sub_want_increment(std::string what, version_t start, unsigned flags) {
+   std::scoped_lock guard(monc_lock);
+   const bool result = sub.inc_want(what, start, flags);
+   return result;
+ }
+
+ std::unique_ptr<KeyRing> keyring;
+ std::unique_ptr<RotatingKeyRing> rotating_secrets;
+
+ public:
+ MonClient(CephContext *cct_, boost::asio::io_context& service);
+ MonClient(const MonClient &) = delete;
+ MonClient& operator=(const MonClient &) = delete;
+ ~MonClient() override;
+
+ int init();
+ void shutdown();
+
+ // Store the LogClient pointer. Only the raw pointer is kept.
+ void set_log_client(LogClient *clog) {
+ log_client = clog;
+ }
+ // Returns the LogClient pointer last stored via set_log_client().
+ LogClient *get_log_client() {
+ return log_client;
+ }
+
+ int build_initial_monmap();
+ int get_monmap();
+ int get_monmap_and_config();
+ /**
+ * If you want to see MonMap messages, set this and
+ * the MonClient will tell the Messenger it hasn't
+ * dealt with it.
+ * Note that if you do this, *you* are of course responsible for
+ * putting the message reference!
+ */
+ void set_passthrough_monmap() {
+   // The flag is flipped under monc_lock.
+   std::scoped_lock guard(monc_lock);
+   passthrough_monmap = true;
+ }
+ // Clears the passthrough_monmap flag under monc_lock.
+ void unset_passthrough_monmap() {
+   std::scoped_lock guard(monc_lock);
+   passthrough_monmap = false;
+ }
+ /**
+ * Ping monitor with ID @p mon_id and record the resulting
+ * reply in @p result_reply.
+ *
+ * @param[in] mon_id Target monitor's ID
+ * @param[out] result_reply reply from mon.ID, if param != NULL
+ * @returns 0 in case of success; < 0 in case of error,
+ * -ETIMEDOUT if monitor didn't reply before timeout
+ * expired (default: conf->client_mount_timeout).
+ */
+ int ping_monitor(const std::string &mon_id, std::string *result_reply);
+
+ // Raw-pointer overload: wraps m in a MessageRef and forwards to the
+ // MessageRef overload.
+ // NOTE(review): the `false` second argument presumably means "do not take
+ // an extra reference", i.e. the caller's reference is adopted. Confirm
+ // against MessageRef's constructor.
+ void send_mon_message(Message *m) {
+ send_mon_message(MessageRef{m, false});
+ }
+ void send_mon_message(MessageRef m);
+
+ // Locked wrapper: runs _reopen_session() with its default argument while
+ // holding monc_lock.
+ void reopen_session() {
+   std::scoped_lock guard(monc_lock);
+   _reopen_session();
+ }
+
+ // Returns a reference to monmap.fsid. No lock is taken here, and the
+ // reference points into the monmap member.
+ const uuid_d& get_fsid() const {
+ return monmap.fsid;
+ }
+
+ // Address vector of the monitor at index i in the monmap, or a
+ // default-constructed entity_addrvec_t when i is out of range.
+ // Runs under monc_lock.
+ entity_addrvec_t get_mon_addrs(unsigned i) const {
+   std::scoped_lock guard(monc_lock);
+   if (!(i < monmap.size())) {
+     return entity_addrvec_t();
+   }
+   return monmap.get_addrs(i);
+ }
+ // Number of entries in the monmap, read under monc_lock.
+ int get_num_mon() const {
+   std::scoped_lock guard(monc_lock);
+   const int count = monmap.size();
+   return count;
+ }
+
+ // Current global_id value, read under monc_lock.
+ uint64_t get_global_id() const {
+   std::scoped_lock guard(monc_lock);
+   const uint64_t id = global_id;
+   return id;
+ }
+
+ // Store the Messenger pointer. Only the raw pointer is kept.
+ void set_messenger(Messenger *m) { messenger = m; }
+ // Returns messenger->get_myaddrs(). The messenger pointer is dereferenced
+ // without a null check, so set_messenger() must have been called first.
+ entity_addrvec_t get_myaddrs() const { return messenger->get_myaddrs(); }
+ AuthAuthorizer* build_authorizer(int service_id) const;
+
+ // Overwrite want_keys with `want`. No lock is taken.
+ void set_want_keys(uint32_t want) {
+ want_keys = want;
+ }
+
+ // admin commands
+private:
+ uint64_t last_mon_command_tid;
+
+ // State for one monitor command issued through MonClient, identified by tid.
+ struct MonCommand {
+   // for tell only
+   std::string target_name;
+   int target_rank = -1;
+   ConnectionRef target_con;
+   std::unique_ptr<MonConnection> target_session;
+   unsigned send_attempts = 0; ///< attempt count for legacy mons
+   utime_t last_send_attempt;
+   uint64_t tid;
+   std::vector<std::string> cmd;
+   ceph::buffer::list inbl;
+   std::unique_ptr<CommandCompletion> onfinish;
+   std::optional<boost::asio::steady_timer> cancel_timer;
+
+   // Build a command with transaction id `t` and completion `onfinish`.
+   // When the "rados_mon_op_timeout" option is positive, a timer is armed
+   // that calls monc._cancel_mon_command(t) under monc_lock once it fires.
+   MonCommand(MonClient& monc, uint64_t t, std::unique_ptr<CommandCompletion> onfinish)
+     : tid(t), onfinish(std::move(onfinish)) {
+     auto timeout =
+       monc.cct->_conf.get_val<std::chrono::seconds>("rados_mon_op_timeout");
+     if (timeout.count() > 0) {
+       cancel_timer.emplace(monc.service, timeout);
+       // Capture the tid by value instead of `this`: a wait handler that was
+       // already queued when the timer is cancelled/destroyed still runs with
+       // a success code, and by then this MonCommand may no longer exist.
+       // NOTE(review): relies on _cancel_mon_command tolerating a tid that
+       // has already completed - confirm.
+       cancel_timer->async_wait(
+         [cancel_tid = t, &monc](boost::system::error_code ec) {
+           if (ec)
+             return;
+           std::scoped_lock l(monc.monc_lock);
+           monc._cancel_mon_command(cancel_tid);
+         });
+     }
+   }
+
+   // True when a target monitor was named or a non-negative rank was set.
+   bool is_tell() const {
+     return !target_name.empty() || target_rank >= 0;
+   }
+ };
+ friend MonCommand;
+ std::map<uint64_t,MonCommand*> mon_commands;
+
+ void _send_command(MonCommand *r);
+ void _check_tell_commands();
+ void _resend_mon_commands();
+ int _cancel_mon_command(uint64_t tid);
+ void _finish_command(MonCommand *r, boost::system::error_code ret, std::string_view rs,
+ bufferlist&& bl);
+ void _finish_auth();
+ void handle_mon_command_ack(MMonCommandAck *ack);
+ void handle_command_reply(MCommandReply *reply);
+
+public:
+ // Register `cmd` with input payload `inbl` as a new MonCommand and hand it
+ // to _send_command(). `token` is completed with CommandSig arguments
+ // (error_code, std::string, buffer list). If the client is not initialized
+ // or is stopping, the completion is posted with monc_errc::shutting_down
+ // and nothing is sent.
+ template<typename CompletionToken>
+ auto start_mon_command(const std::vector<std::string>& cmd,
+                        const ceph::buffer::list& inbl,
+                        CompletionToken&& token) {
+   ldout(cct,10) << __func__ << " cmd=" << cmd << dendl;
+   boost::asio::async_completion<CompletionToken, CommandSig> init(token);
+   {
+     std::scoped_lock l(monc_lock);
+     auto handler = CommandCompletion::create(service.get_executor(),
+                                              std::move(init.completion_handler));
+     const bool usable = initialized && !stopping;
+     if (usable) {
+       MonCommand *req =
+         new MonCommand(*this, ++last_mon_command_tid, std::move(handler));
+       req->cmd = cmd;
+       req->inbl = inbl;
+       mon_commands.emplace(req->tid, req);
+       _send_command(req);
+     } else {
+       ceph::async::post(std::move(handler), monc_errc::shutting_down, std::string{},
+                         bufferlist{});
+     }
+   }
+   return init.result.get();
+ }
+
+ // Same as the untargeted overload, but the new MonCommand has its
+ // target_rank set to `mon_rank` before being handed to _send_command().
+ // If the client is not initialized or is stopping, the completion is posted
+ // with monc_errc::shutting_down and nothing is sent.
+ template<typename CompletionToken>
+ auto start_mon_command(int mon_rank, const std::vector<std::string>& cmd,
+                        const ceph::buffer::list& inbl, CompletionToken&& token) {
+   ldout(cct,10) << __func__ << " cmd=" << cmd << dendl;
+   boost::asio::async_completion<CompletionToken, CommandSig> init(token);
+   {
+     std::scoped_lock l(monc_lock);
+     auto handler = CommandCompletion::create(service.get_executor(),
+                                              std::move(init.completion_handler));
+     const bool usable = initialized && !stopping;
+     if (usable) {
+       MonCommand *req =
+         new MonCommand(*this, ++last_mon_command_tid, std::move(handler));
+       req->target_rank = mon_rank;
+       req->cmd = cmd;
+       req->inbl = inbl;
+       mon_commands.emplace(req->tid, req);
+       _send_command(req);
+     } else {
+       ceph::async::post(std::move(handler), monc_errc::shutting_down, std::string{},
+                         bufferlist{});
+     }
+   }
+   return init.result.get();
+ }
+
+ // Targeted variant taking the monitor as a string. If `mon_name` parses as
+ // a non-negative base-10 integer that fits in an int, it is treated as a
+ // monitor rank; otherwise it is used as the monitor name. `token` is
+ // completed with CommandSig arguments. If the client is not initialized or
+ // is stopping, the completion is posted with monc_errc::shutting_down.
+ template<typename CompletionToken>
+ auto start_mon_command(const std::string& mon_name,
+                        const std::vector<std::string>& cmd,
+                        const ceph::buffer::list& inbl,
+                        CompletionToken&& token) {
+   ldout(cct,10) << __func__ << " cmd=" << cmd << dendl;
+   boost::asio::async_completion<CompletionToken, CommandSig> init(token);
+   {
+     std::scoped_lock l(monc_lock);
+     auto h = CommandCompletion::create(service.get_executor(),
+                                        std::move(init.completion_handler));
+     if (!initialized || stopping) {
+       ceph::async::post(std::move(h), monc_errc::shutting_down, std::string{},
+                         bufferlist{});
+     } else {
+       auto r = new MonCommand(*this, ++last_mon_command_tid, std::move(h));
+       // detect/tolerate mon *rank* passed as a string
+       std::string err;
+       const auto parsed = strict_strtoll(mon_name.c_str(), 10, &err);
+       const int rank = static_cast<int>(parsed);
+       // Accept the number as a rank only if it survives the narrowing to
+       // int; otherwise an oversized value could wrap to a different,
+       // valid-looking rank. Such values fall through to the name path.
+       if (err.empty() && parsed >= 0 && rank == parsed) {
+         ldout(cct,10) << __func__ << " interpreting name '" << mon_name
+                       << "' as rank " << rank << dendl;
+         r->target_rank = rank;
+       } else {
+         r->target_name = mon_name;
+       }
+       r->cmd = cmd;
+       r->inbl = inbl;
+       mon_commands.emplace(r->tid, r);
+       _send_command(r);
+     }
+   }
+   return init.result.get();
+ }
+
+ // Callable adapter from the (error_code, string, bufferlist) completion
+ // signature to the pointer-plus-Context style: results are written through
+ // the stored pointers and the Context is completed with the converted code.
+ class ContextVerter {
+   std::string* outs;
+   ceph::bufferlist* outbl;
+   Context* onfinish;
+
+ public:
+   ContextVerter(std::string* outs, ceph::bufferlist* outbl, Context* onfinish)
+     : outs(outs), outbl(outbl), onfinish(onfinish) {}
+   ContextVerter(const ContextVerter&) = default;
+   ContextVerter(ContextVerter&&) = default;
+   ContextVerter& operator =(const ContextVerter&) = default;
+   ContextVerter& operator =(ContextVerter&&) = default;
+   ~ContextVerter() = default;
+
+   // Any of the three stored pointers may be null; null ones are skipped.
+   void operator()(boost::system::error_code e,
+                   std::string s,
+                   ceph::bufferlist bl) {
+     if (outs != nullptr) {
+       *outs = std::move(s);
+     }
+     if (outbl != nullptr) {
+       *outbl = std::move(bl);
+     }
+     if (onfinish == nullptr) {
+       return;
+     }
+     onfinish->complete(ceph::from_error_code(e));
+   }
+ };
+
+ // Context-based convenience overload: wraps outs, outbl and onfinish in a
+ // ContextVerter and passes it as the completion token of the untargeted
+ // template overload.
+ void start_mon_command(const std::vector<std::string>& cmd, const bufferlist& inbl,
+ bufferlist *outbl, std::string *outs,
+ Context *onfinish) {
+ start_mon_command(cmd, inbl, ContextVerter(outs, outbl, onfinish));
+ }
+ // Context-based convenience overload for a command aimed at monitor rank
+ // mon_rank: wraps outs, outbl and onfinish in a ContextVerter and forwards
+ // to the rank-taking template overload.
+ void start_mon_command(int mon_rank,
+ const std::vector<std::string>& cmd, const bufferlist& inbl,
+ bufferlist *outbl, std::string *outs,
+ Context *onfinish) {
+ start_mon_command(mon_rank, cmd, inbl, ContextVerter(outs, outbl, onfinish));
+ }
+ // Context-based convenience overload for a command aimed at a named
+ // monitor: wraps outs, outbl and onfinish in a ContextVerter and forwards
+ // to the name-taking template overload.
+ void start_mon_command(const std::string &mon_name, ///< mon name, with mon. prefix
+ const std::vector<std::string>& cmd, const bufferlist& inbl,
+ bufferlist *outbl, std::string *outs,
+ Context *onfinish) {
+ start_mon_command(mon_name, cmd, inbl, ContextVerter(outs, outbl, onfinish));
+ }
+
+
+ // version requests
+public:
+ /**
+ * get latest known version(s) of cluster map
+ *
+ * @param map string name of map (e.g., 'osdmap')
+ * @param token context that will be triggered on completion
+ * @return (via Completion) {} on success,
+ * boost::system::errc::resource_unavailable_try_again if we need to
+ * resubmit our request
+ */
+ template<typename CompletionToken>
+ auto get_version(std::string&& map, CompletionToken&& token) {
+   boost::asio::async_completion<CompletionToken, VersionSig> init(token);
+   {
+     std::scoped_lock l(monc_lock);
+     // Build the request and tag it with a fresh handle.
+     auto req = ceph::make_message<MMonGetVersion>();
+     req->what = std::move(map);
+     req->handle = ++version_req_id;
+     // Park the completion under that handle before sending the request.
+     auto completion = VersionCompletion::create(
+       service.get_executor(),
+       std::move(init.completion_handler));
+     version_requests.emplace(req->handle, std::move(completion));
+     _send_mon_message(req);
+   }
+   return init.result.get();
+ }
+
+ /**
+ * Run a callback within our lock, with a reference
+ * to the MonMap
+ */
+ template<typename Callback, typename...Args>
+ auto with_monmap(Callback&& cb, Args&&...args) const ->
+ decltype(cb(monmap, std::forward<Args>(args)...)) {
+ // monc_lock is held for the whole callback; the callback receives monmap
+ // followed by the forwarded args, and its result is returned as-is.
+ std::lock_guard l(monc_lock);
+ return std::forward<Callback>(cb)(monmap, std::forward<Args>(args)...);
+ }
+
+ void register_config_callback(md_config_t::config_callback fn);
+ // Install `f` as config_notify_cb, replacing any previously stored
+ // callback. The by-value argument is moved into place rather than copied a
+ // second time. No lock is taken here.
+ void register_config_notify_callback(std::function<void(void)> f) {
+   config_notify_cb = std::move(f);
+ }
+ md_config_t::config_callback get_config_callback();
+
+private:
+
+ std::map<ceph_tid_t, std::unique_ptr<VersionCompletion>> version_requests;
+ ceph_tid_t version_req_id;
+ void handle_get_version_reply(MMonGetVersionReply* m);
+ md_config_t::config_callback config_cb;
+ std::function<void(void)> config_notify_cb;
+};
+
+#endif
diff --git a/src/mon/MonCommand.h b/src/mon/MonCommand.h
new file mode 100644
index 000000000..2421229c8
--- /dev/null
+++ b/src/mon/MonCommand.h
@@ -0,0 +1,194 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 John Spray <john.spray@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+#pragma once
+
+#include <string>
+#include "common/Formatter.h"
+#include "include/encoding.h"
+
+struct MonCommand {
+ std::string cmdstring;
+ std::string helpstring;
+ std::string module;
+ std::string req_perms;
+ uint64_t flags = 0;
+
+ // MonCommand flags
+ static const uint64_t FLAG_NONE = 0;
+ static const uint64_t FLAG_NOFORWARD = 1 << 0;
+ static const uint64_t FLAG_OBSOLETE = 1 << 1;
+ static const uint64_t FLAG_DEPRECATED = 1 << 2;
+ static const uint64_t FLAG_MGR = 1 << 3;
+ static const uint64_t FLAG_POLL = 1 << 4;
+ static const uint64_t FLAG_HIDDEN = 1 << 5;
+ // asok and tell commands are not forwarded, and they should not be listed
+ // in --help output.
+ static const uint64_t FLAG_TELL = (FLAG_NOFORWARD | FLAG_HIDDEN);
+
+ // True only when every bit of `flag` is set in `flags`; consequently
+ // FLAG_NONE (0) always matches.
+ bool has_flag(uint64_t flag) const { return (flags & flag) == flag; }
+ // Turn on every bit of `flag` in `flags`.
+ void set_flag(uint64_t flag) { flags |= flag; }
+ // Turn off every bit of `flag` in `flags`.
+ void unset_flag(uint64_t flag) { flags &= ~flag; }
+
+ // Versioned encoding (ENCODE_START(1, 1, ...)): the bare fields written by
+ // encode_bare(), followed by `flags`.
+ void encode(ceph::buffer::list &bl) const {
+ ENCODE_START(1, 1, bl);
+ encode_bare(bl);
+ encode(flags, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ // Inverse of encode(): reads the bare fields via decode_bare(), then `flags`.
+ void decode(ceph::buffer::list::const_iterator &bl) {
+ DECODE_START(1, bl);
+ decode_bare(bl);
+ decode(flags, bl);
+ DECODE_FINISH(bl);
+ }
+
+ // Emit all five fields to the formatter: four strings, then flags as an
+ // unsigned value. `f` is dereferenced without a null check.
+ void dump(ceph::Formatter *f) const {
+ f->dump_string("cmdstring", cmdstring);
+ f->dump_string("helpstring", helpstring);
+ f->dump_string("module", module);
+ f->dump_string("req_perms", req_perms);
+ f->dump_unsigned("flags", flags);
+ }
+
+ // Appends two heap-allocated instances to `ls`: a default-constructed one
+ // and one with every field populated. Nothing here frees them.
+ static void generate_test_instances(std::list<MonCommand*>& ls) {
+   ls.push_back(new MonCommand);
+
+   ls.push_back(new MonCommand);
+   MonCommand *filled = ls.back();
+   filled->cmdstring = "foo";
+   filled->helpstring = "bar";
+   filled->module = "baz";
+   filled->req_perms = "quux";
+   filled->flags = FLAG_NOFORWARD;
+ }
+
+ /**
+ * Unversioned encoding for use within encode_array.
+ */
+ // Field order on the wire: cmdstring, helpstring, module, req_perms, then
+ // a fixed "cli,rest" string for the removed availability field. `flags`
+ // is not part of the bare encoding.
+ void encode_bare(ceph::buffer::list &bl) const {
+ using ceph::encode;
+ encode(cmdstring, bl);
+ encode(helpstring, bl);
+ encode(module, bl);
+ encode(req_perms, bl);
+ std::string availability = "cli,rest"; // Removed field, for backward compat
+ encode(availability, bl);
+ }
+ // Inverse of encode_bare(): reads the four string fields in the same order
+ // and then consumes and discards the legacy availability string. `flags`
+ // is left untouched.
+ void decode_bare(ceph::buffer::list::const_iterator &bl) {
+ using ceph::decode;
+ decode(cmdstring, bl);
+ decode(helpstring, bl);
+ decode(module, bl);
+ decode(req_perms, bl);
+ std::string availability; // Removed field, for backward compat
+ decode(availability, bl);
+ }
+ // Two commands are compatible when cmdstring, module and req_perms all
+ // match; helpstring and flags are not compared. `o` must be non-null.
+ bool is_compat(const MonCommand* o) const {
+   if (!(cmdstring == o->cmdstring)) {
+     return false;
+   }
+   if (!(module == o->module)) {
+     return false;
+   }
+   return req_perms == o->req_perms;
+ }
+
+ // FLAG_TELL is (FLAG_NOFORWARD | FLAG_HIDDEN), so both bits must be set.
+ bool is_tell() const {
+   return has_flag(FLAG_TELL);
+ }
+
+ // True when the FLAG_NOFORWARD bit is set.
+ bool is_noforward() const {
+   return has_flag(FLAG_NOFORWARD);
+ }
+
+ // True when the FLAG_OBSOLETE bit is set.
+ bool is_obsolete() const {
+   return has_flag(FLAG_OBSOLETE);
+ }
+
+ // True when the FLAG_DEPRECATED bit is set.
+ bool is_deprecated() const {
+   return has_flag(FLAG_DEPRECATED);
+ }
+
+ // True when the FLAG_MGR bit is set.
+ bool is_mgr() const {
+   return has_flag(FLAG_MGR);
+ }
+
+ // True when the FLAG_HIDDEN bit is set.
+ bool is_hidden() const {
+   return has_flag(FLAG_HIDDEN);
+ }
+
+ // Encode `size` commands from `cmds` using a 16-bit element count: first
+ // the count, then every command's bare fields, then every command's flags.
+ // The count is clamped to [0, 65535] and exactly that many entries are
+ // written, so the encoded count always agrees with the payload. Before this
+ // change an out-of-range `size` wrote a truncated count while the loops
+ // still ran over the untruncated size.
+ static void encode_array(const MonCommand *cmds, int size, ceph::buffer::list &bl) {
+   ENCODE_START(2, 1, bl);
+   constexpr int max_count = 0xffff; // largest value the u16 count can hold
+   const int clamped = size < 0 ? 0 : (size > max_count ? max_count : size);
+   uint16_t s = static_cast<uint16_t>(clamped);
+   encode(s, bl);
+   for (int i = 0; i < clamped; ++i) {
+     cmds[i].encode_bare(bl);
+   }
+   for (int i = 0; i < clamped; ++i) {
+     encode(cmds[i].flags, bl);
+   }
+   ENCODE_FINISH(bl);
+ }
+ // Decode an array in the encode_array layout: a u16 count, the bare fields
+ // of each command, then (struct_v >= 2 only) each command's flags; older
+ // encodings get flags = 0. *size receives the count and *cmds a new[]
+ // array that nothing in this function frees, so the caller owns it.
+ // NOTE(review): *cmds is assigned before the element decodes run; if one of
+ // them throws, the array is already visible to the caller - confirm callers
+ // release it on that path.
+ static void decode_array(MonCommand **cmds, int *size,
+ ceph::buffer::list::const_iterator &bl) {
+ DECODE_START(2, bl);
+ uint16_t s = 0;
+ decode(s, bl);
+ *size = s;
+ *cmds = new MonCommand[*size];
+ for (int i = 0; i < *size; ++i) {
+ (*cmds)[i].decode_bare(bl);
+ }
+ if (struct_v >= 2) {
+ for (int i = 0; i < *size; i++)
+ decode((*cmds)[i].flags, bl);
+ } else {
+ for (int i = 0; i < *size; i++)
+ (*cmds)[i].flags = 0;
+ }
+ DECODE_FINISH(bl);
+ }
+
+ // The element count is stored as a u16, hence the dedicated
+ // encoder/decoder pair. Layout: count, bare fields of each command, then
+ // the flags of each command. Only the first `count` elements are written,
+ // where `count` is cmds.size() converted to uint16_t.
+ static void encode_vector(const std::vector<MonCommand>& cmds,
+                           ceph::buffer::list &bl) {
+   ENCODE_START(2, 1, bl);
+   uint16_t count = cmds.size();
+   encode(count, bl);
+   for (unsigned idx = 0; idx != count; ++idx) {
+     cmds[idx].encode_bare(bl);
+   }
+   for (unsigned idx = 0; idx != count; ++idx) {
+     encode(cmds[idx].flags, bl);
+   }
+   ENCODE_FINISH(bl);
+ }
+ // Inverse of encode_vector(): resizes `cmds` to the decoded u16 count,
+ // reads the bare fields of every element, then either reads each element's
+ // flags (struct_v >= 2) or resets them to 0 (older encodings).
+ static void decode_vector(std::vector<MonCommand> &cmds,
+                           ceph::buffer::list::const_iterator &bl) {
+   DECODE_START(2, bl);
+   uint16_t count = 0;
+   decode(count, bl);
+   cmds.resize(count);
+   for (auto& c : cmds) {
+     c.decode_bare(bl);
+   }
+   const bool has_flags = struct_v >= 2;
+   for (auto& c : cmds) {
+     if (has_flags) {
+       decode(c.flags, bl);
+     } else {
+       c.flags = 0;
+     }
+   }
+   DECODE_FINISH(bl);
+ }
+
+ // True when the character `p` occurs anywhere in req_perms.
+ bool requires_perm(char p) const {
+   const auto pos = req_perms.find(p);
+   return pos != std::string::npos;
+ }
+};
+WRITE_CLASS_ENCODER(MonCommand)
diff --git a/src/mon/MonCommands.h b/src/mon/MonCommands.h
new file mode 100644
index 000000000..1f40621ce
--- /dev/null
+++ b/src/mon/MonCommands.h
@@ -0,0 +1,1453 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank Storage, Inc.
+ * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
+ *
+ * Author: Loic Dachary <loic@dachary.org>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+/* no guard; may be included multiple times */
+
+/*
+ * Define commands that are reported by the monitor's
+ * "get_command_descriptions" command, and parsed by the Python
+ * frontend 'ceph' (and perhaps by other frontends, such as a RESTful
+ * server). The format is:
+ *
+ * COMMAND(signature, helpstring, modulename, req perms)
+ * where:
+ * signature: describes the command and its parameters (more below)
+ * helpstring: displays in CLI help, API help (nice if it refers to
+ * parameter names from signature, 40-a few hundred chars)
+ * modulename: the monitor module or daemon this applies to:
+ * mds, osd, pg (osd), mon, auth, log, config-key, mgr
+ * req perms: required permission in that modulename space to execute command
+ * this also controls what type of REST command is accepted
+ *
+ * The commands describe themselves completely enough for the separate
+ * frontend(s) to be able to accept user input and validate it against
+ * the command descriptions, and generate a JSON object that contains
+ * key:value mappings of parameter names to validated parameter values.
+ *
+ * 'signature' is a space-separated list of individual command descriptors;
+ * each descriptor is either a literal string, which can contain no spaces or
+ * '=' signs (for instance, in "pg stat", both "pg" and "stat" are literal
+ * strings representing one descriptor each), or a list of key=val[,key=val...]
+ * which also includes no spaces.
+ *
+ * The key=val form describes a non-literal parameter. Each will have at
+ * least a name= and type=, and each type can have its own type-specific
+ * parameters. The parser is the arbiter of these types and their
+ * interpretation. A few more non-type-specific key=val pairs exist:
+ *
+ * req=false marks an optional parameter (default for req is 'true')
+ * n=<n> is a repeat count for how many of this argument must be supplied.
+ * n=1 is the default.
+ * n=N is a special case that means "1 or more".
+ *
+ * A perhaps-incomplete list of types:
+ *
+ * CephInt: Optional: range=min[|max]
+ * CephFloat: Optional range
+ * CephString: optional badchars
+ * CephSocketpath: validation involves "is it S_ISSOCK"
+ * CephIPAddr: v4 or v6 addr with optional port, syntax validated
+ * CephEntityAddr: CephIPAddr + optional '/nonce'
+ * CephPoolname: Plainold string
+ * CephObjectname: Another plainold string
+ * CephPgid: n.xxx where n is an int > 0, xxx is a hex number > 0
+ * CephName: daemon name, '*' or '<type>.<id>' (id must be int for type osd)
+ * CephOsdName: osd name, '*' or '<id>' or 'osd.<id>' (id must be int)
+ * CephChoices: strings="foo|bar" means this param can be either
+ * CephFilepath: openable file
+ * CephFragment: cephfs 'fragID': val/bits, val in hex 0xnnn, bits in dec
+ * CephUUID: uuid in text matching Python uuid.UUID()
+ * CephPrefix: special type assigned to literals
+ *
+ * Example:
+ *
+ * COMMAND("auth add "
+ * "name=entity,type=CephString "
+ * "name=caps,type=CephString,n=N,req=false "
+ * "-- "
+ * "name=some_option,type=CephString,req=false",
+ * "add auth info for <name> from input file, or random key "
+ * "if no input given, and/or any caps specified in the command")
+ *
+ * defines a command "auth add" that takes a required argument "entity"
+ * of type "CephString", and optionally (req=false) from 1 to N arguments
+ * named "caps" of type CephString. The front end will
+ * validate user input against this description. Let's say the user
+ * enters auth add client.admin 'mon rwx' 'osd *'. The result will be a
+ * JSON object like {"prefix":"auth add", "entity":"client.admin",
+ * "caps":["mon rwx", "osd *"]}.
+ *
+ * The -- separates positional from non-positional (and, by implication,
+ * optional) arguments. Note that CephBool is assumed to be non-positional
+ * and will also implicitly mark that any following arguments are
+ * non-positional.
+ *
+ * Note that
+ * - string literals are accumulated into 'prefix'
+ * - n=1 descriptors are given normal string or int object values
+ * - n=N descriptors are given array values
+ *
+ * NOTE: be careful with spaces. Each descriptor must be separated by
+ * one space, no other characters, so if you split lines as above, be
+ * sure to close and reopen the quotes, and be careful to include the
+ * separating spaces in the quoted string.
+ *
+ * The monitor marshals this JSON into a std::map<string, cmd_vartype>
+ * where cmd_vartype is a boost::variant type-enforcing discriminated
+ * type, so the monitor is expected to know the type of each argument.
+ * See cmdparse.cc/h for more details.
+ *
+ * The flag parameter for the COMMAND_WITH_FLAG macro must be passed using
+ * FLAG(f), where 'f' may be one of the following:
+ *
+ * NONE - no flag assigned
+ * NOFORWARD - command may not be forwarded
+ * OBSOLETE - command is considered obsolete
+ * DEPRECATED - command is considered deprecated
+ * MGR - command goes to ceph-mgr (for luminous+)
+ * POLL - command is intended to be called periodically by the
+ * client (see iostat)
+ * HIDDEN - command is hidden (not reported by help etc.)
+ * TELL - tell/asok command. it's an alias of (NOFORWARD | HIDDEN)
+ *
+ * A command should always be first considered DEPRECATED before being
+ * considered OBSOLETE, giving due consideration to users and conforming
+ * to any guidelines regarding deprecating commands.
+ */
+
+COMMAND("pg map name=pgid,type=CephPgid", "show mapping of pg to osds", \
+ "pg", "r")
+COMMAND("pg repeer name=pgid,type=CephPgid", "force a PG to repeer",
+ "osd", "rw")
+COMMAND("osd last-stat-seq name=id,type=CephOsdName", \
+ "get the last pg stats sequence number reported for this osd", \
+ "osd", "r")
+
+/*
+ * auth commands AuthMonitor.cc
+ */
+
+COMMAND("auth export name=entity,type=CephString,req=false", \
+ "write keyring for requested entity, or master keyring if none given", \
+ "auth", "rx")
+COMMAND("auth get name=entity,type=CephString", \
+ "write keyring file with requested key", "auth", "rx")
+COMMAND("auth get-key name=entity,type=CephString", "display requested key", \
+ "auth", "rx")
+COMMAND("auth print-key name=entity,type=CephString", "display requested key", \
+ "auth", "rx")
+COMMAND("auth print_key name=entity,type=CephString", "display requested key", \
+ "auth", "rx")
+// Deprecated spelling; "auth ls" is declared with the same help text,
+// module and permissions.
+COMMAND_WITH_FLAG("auth list", "list authentication state", "auth", "rx",
+ FLAG(DEPRECATED))
+COMMAND("auth ls", "list authentication state", "auth", "rx")
+COMMAND("auth import", "auth import: read keyring file from -i <file>",
+ "auth", "rwx")
+COMMAND("auth add "
+ "name=entity,type=CephString "
+ "name=caps,type=CephString,n=N,req=false",
+ "add auth info for <entity> from input file, or random key if no "
+ "input is given, and/or any caps specified in the command",
+ "auth", "rwx")
+COMMAND("auth get-or-create-key "
+ "name=entity,type=CephString "
+ "name=caps,type=CephString,n=N,req=false",
+ "get, or add, key for <name> from system/caps pairs specified in the command. If key already exists, any given caps must match the existing caps for that key.",
+ "auth", "rwx")
+COMMAND("auth get-or-create "
+ "name=entity,type=CephString "
+ "name=caps,type=CephString,n=N,req=false",
+ "add auth info for <entity> from input file, or random key if no input given, and/or any caps specified in the command",
+ "auth", "rwx")
+COMMAND("auth get-or-create-pending "
+ "name=entity,type=CephString",
+ "generate and/or retrieve existing pending key (rotated into place on first use)",
+ "auth", "rwx")
+COMMAND("auth clear-pending "
+ "name=entity,type=CephString",
+ "clear pending key",
+ "auth", "rwx")
+COMMAND("auth commit-pending "
+ "name=entity,type=CephString",
+ "rotate pending key into active position",
+ "auth", "rwx")
+COMMAND("fs authorize "
+ "name=filesystem,type=CephString "
+ "name=entity,type=CephString "
+ "name=caps,type=CephString,n=N",
+ "add auth for <entity> to access file system <filesystem> based on following directory and permissions pairs",
+ "auth", "rwx")
+COMMAND("auth caps "
+ "name=entity,type=CephString "
+ "name=caps,type=CephString,n=N",
+ "update caps for <name> from caps specified in the command",
+ "auth", "rwx")
+// Deprecated; "auth rm" is declared with the same argument, module and
+// permissions.
+COMMAND_WITH_FLAG("auth del "
+ "name=entity,type=CephString",
+ "delete all caps for <name>",
+ "auth", "rwx",
+ FLAG(DEPRECATED))
+COMMAND("auth rm "
+ "name=entity,type=CephString",
+ "remove all caps for <name>",
+ "auth", "rwx")
+
+/*
+ * Monitor commands (Monitor.cc)
+ */
+COMMAND_WITH_FLAG("compact", "cause compaction of monitor's RocksDB storage",
+ "mon", "rw",
+ FLAG(TELL))
+COMMAND("fsid", "show cluster FSID/UUID", "mon", "r")
+COMMAND("log name=logtext,type=CephString,n=N",
+ "log supplied text to the monitor log", "mon", "rw")
+COMMAND("log last "
+ "name=num,type=CephInt,range=1,req=false "
+ "name=level,type=CephChoices,strings=debug|info|sec|warn|error,req=false "
+ "name=channel,type=CephChoices,strings=*|cluster|audit|cephadm,req=false",
+ "print last few lines of the cluster log",
+ "mon", "r")
+
+COMMAND("status", "show cluster status", "mon", "r")
+COMMAND("health name=detail,type=CephChoices,strings=detail,req=false",
+ "show cluster health", "mon", "r")
+COMMAND("health mute "\
+ "name=code,type=CephString "
+ "name=ttl,type=CephString,req=false "
+ "name=sticky,type=CephBool,req=false",
+ "mute health alert", "mon", "w")
+COMMAND("health unmute "\
+ "name=code,type=CephString,req=false",
+ "unmute existing health alert mute(s)", "mon", "w")
+COMMAND("time-sync-status", "show time sync status", "mon", "r")
+COMMAND("df name=detail,type=CephChoices,strings=detail,req=false",
+ "show cluster free space stats", "mon", "r")
+COMMAND("report name=tags,type=CephString,n=N,req=false",
+ "report full status of cluster, optional title tag strings",
+ "mon", "r")
+COMMAND("features", "report of connected features",
+ "mon", "r")
+COMMAND("quorum_status", "report status of monitor quorum",
+ "mon", "r")
+COMMAND("mon ok-to-stop "
+ "name=ids,type=CephString,n=N",
+ "check whether mon(s) can be safely stopped without reducing immediate "
+ "availability",
+ "mon", "r")
+COMMAND("mon ok-to-add-offline",
+ "check whether adding a mon and not starting it would break quorum",
+ "mon", "r")
+COMMAND("mon ok-to-rm "
+ "name=id,type=CephString",
+ "check whether removing the specified mon would break quorum",
+ "mon", "r")
+
+COMMAND("tell "
+ "name=target,type=CephName "
+ "name=args,type=CephString,n=N",
+ "send a command to a specific daemon", "mon", "rw")
+COMMAND_WITH_FLAG("version", "show mon daemon version", "mon", "r",
+ FLAG(TELL))
+
+COMMAND("node ls "
+ "name=type,type=CephChoices,strings=all|osd|mon|mds|mgr,req=false",
+ "list all nodes in cluster [type]", "mon", "r")
+/*
+ * Monitor-specific commands under module 'mon'
+ */
+// Declared through COMMAND_WITH_FLAG but with FLAG(NONE), i.e. no flag is
+// assigned.
+COMMAND_WITH_FLAG("mon scrub",
+ "scrub the monitor stores",
+ "mon", "rw",
+ FLAG(NONE))
+COMMAND("mon metadata name=id,type=CephString,req=false",
+ "fetch metadata for mon <id>",
+ "mon", "r")
+COMMAND("mon count-metadata name=property,type=CephString",
+ "count mons by metadata field property",
+ "mon", "r")
+COMMAND("mon versions",
+ "check running versions of monitors",
+ "mon", "r")
+COMMAND("versions",
+ "check running versions of ceph daemons",
+ "mon", "r")
+
+
+
+/*
+ * MDS commands (MDSMonitor.cc)
+ */
+
+#define FS_NAME_GOODCHARS "[A-Za-z0-9-_.]"
+COMMAND_WITH_FLAG("mds stat", "show MDS status", "mds", "r", FLAG(HIDDEN))
+// Read-only query keyed by a required MDS <id>; this is a separate command
+// from "mds metadata", so it carries its own help text.
+COMMAND("mds last-seen name=id,type=CephString,req=true",
+ "report when mds <id> was last seen",
+ "mds", "r")
+COMMAND("fs dump "
+ "name=epoch,type=CephInt,req=false,range=0",
+ "dump all CephFS status, optionally from epoch", "mds", "r")
+COMMAND("mds metadata name=who,type=CephString,req=false",
+ "fetch metadata for mds <role>",
+ "mds", "r")
+COMMAND("mds count-metadata name=property,type=CephString",
+ "count MDSs by metadata field property",
+ "mds", "r")
+COMMAND("mds versions",
+ "check running versions of MDSs",
+ "mds", "r")
+COMMAND("mds ok-to-stop name=ids,type=CephString,n=N",
+ "check whether stopping the specified MDS would reduce immediate availability",
+ "mds", "r")
+COMMAND_WITH_FLAG("mds freeze name=role_or_gid,type=CephString"
+ " name=val,type=CephString",
+ "freeze MDS yes/no", "mds", "rw", FLAG(HIDDEN))
+// arbitrary limit 0-20 below; worth standing on head to make it
+// relate to actual state definitions?
+// #include "include/ceph_fs.h"
+COMMAND_WITH_FLAG("mds set_state "
+ "name=gid,type=CephInt,range=0 "
+ "name=state,type=CephInt,range=0|20",
+ "set mds state of <gid> to <numeric-state>", "mds", "rw", FLAG(HIDDEN))
+COMMAND("mds fail name=role_or_gid,type=CephString "
+ "name=yes_i_really_mean_it,type=CephBool,req=false",
+ "Mark MDS failed: trigger a failover if a standby is available",
+ "mds", "rw")
+COMMAND("mds repaired name=role,type=CephString",
+ "mark a damaged MDS rank as no longer damaged", "mds", "rw")
+COMMAND("mds rm "
+ "name=gid,type=CephInt,range=0",
+ "remove nonactive mds", "mds", "rw")
+COMMAND_WITH_FLAG("mds rmfailed name=role,type=CephString "
+ "name=yes_i_really_mean_it,type=CephBool,req=false",
+ "remove failed rank", "mds", "rw", FLAG(HIDDEN))
+COMMAND_WITH_FLAG("mds compat show", "show mds compatibility settings",
+ "mds", "r", FLAG(DEPRECATED))
+COMMAND("fs compat show "
+ "name=fs_name,type=CephString ",
+ "show fs compatibility settings",
+ "mds", "r")
+COMMAND_WITH_FLAG("mds compat rm_compat "
+ "name=feature,type=CephInt,range=0",
+ "remove compatible feature", "mds", "rw", FLAG(DEPRECATED))
+COMMAND_WITH_FLAG("mds compat rm_incompat "
+ "name=feature,type=CephInt,range=0",
+ "remove incompatible feature", "mds", "rw", FLAG(DEPRECATED))
+COMMAND("fs new "
+ "name=fs_name,type=CephString,goodchars=" FS_NAME_GOODCHARS
+ " name=metadata,type=CephString "
+ "name=data,type=CephString "
+ "name=force,type=CephBool,req=false "
+ "name=allow_dangerous_metadata_overlay,type=CephBool,req=false "
+ "name=fscid,type=CephInt,range=0,req=false "
+ "name=recover,type=CephBool,req=false",
+ "make new filesystem using named pools <metadata> and <data>",
+ "fs", "rw")
+COMMAND("fs fail "
+ "name=fs_name,type=CephString "
+ "name=yes_i_really_mean_it,type=CephBool,req=false",
+ "bring the file system down and all of its ranks",
+ "fs", "rw")
+COMMAND("fs rm "
+ "name=fs_name,type=CephString "
+ "name=yes_i_really_mean_it,type=CephBool,req=false",
+ "disable the named filesystem",
+ "fs", "rw")
+COMMAND("fs reset "
+ "name=fs_name,type=CephString "
+ "name=yes_i_really_mean_it,type=CephBool,req=false",
+ "disaster recovery only: reset to a single-MDS map",
+ "fs", "rw")
+COMMAND("fs ls ",
+ "list filesystems",
+ "fs", "r")
+COMMAND("fs get name=fs_name,type=CephString",
+ "get info about one filesystem",
+ "fs", "r")
+COMMAND("fs set "
+ "name=fs_name,type=CephString "
+ "name=var,type=CephChoices,strings=max_mds"
+ "|allow_dirfrags"
+ "|allow_new_snaps"
+ "|allow_standby_replay"
+ "|bal_rank_mask"
+ "|balance_automate"
+ "|balancer"
+ "|cluster_down"
+ "|down"
+ "|inline_data"
+ "|joinable"
+ "|max_file_size"
+ "|max_xattr_size"
+ "|min_compat_client"
+ "|refuse_client_session"
+ "|refuse_standby_for_another_fs"
+ "|session_autoclose"
+ "|session_timeout"
+ "|standby_count_wanted"
+ " "
+ "name=val,type=CephString "
+ "name=yes_i_really_mean_it,type=CephBool,req=false "
+ "name=yes_i_really_really_mean_it,type=CephBool,req=false",
+ "set fs parameter <var> to <val>", "mds", "rw")
+COMMAND("fs flag set name=flag_name,type=CephChoices,strings=enable_multiple "
+ "name=val,type=CephString "
+ "name=yes_i_really_mean_it,type=CephBool,req=false",
+ "Set a global CephFS flag",
+ "fs", "rw")
+
+COMMAND("fs feature ls",
+ "list available cephfs features to be set/unset",
+ "mds", "r")
+
+COMMAND("fs lsflags name=fs_name,type=CephString",
+ "list the flags set on a ceph filesystem",
+ "fs", "r")
+
+COMMAND("fs compat "
+ "name=fs_name,type=CephString "
+ "name=subop,type=CephChoices,strings=rm_compat|rm_incompat|add_compat|add_incompat "
+ "name=feature,type=CephInt "
+ "name=feature_str,type=CephString,req=false ",
+ "manipulate compat settings", "fs", "rw")
+
+COMMAND("fs required_client_features "
+ "name=fs_name,type=CephString "
+ "name=subop,type=CephChoices,strings=add|rm "
+ "name=val,type=CephString ",
+ "add/remove required features of clients", "mds", "rw")
+
+COMMAND("fs add_data_pool name=fs_name,type=CephString "
+ "name=pool,type=CephString",
+ "add data pool <pool>", "mds", "rw")
+COMMAND("fs rm_data_pool name=fs_name,type=CephString "
+ "name=pool,type=CephString",
+ "remove data pool <pool>", "mds", "rw")
+COMMAND_WITH_FLAG("fs set_default name=fs_name,type=CephString",
+ "set the default to the named filesystem",
+ "fs", "rw",
+ FLAG(DEPRECATED))
+COMMAND("fs set-default name=fs_name,type=CephString",
+ "set the default to the named filesystem",
+ "fs", "rw")
+COMMAND("fs mirror enable "
+ "name=fs_name,type=CephString ",
+ "enable mirroring for a ceph filesystem", "mds", "rw")
+COMMAND("fs mirror disable "
+ "name=fs_name,type=CephString ",
+ "disable mirroring for a ceph filesystem", "mds", "rw")
+COMMAND("fs mirror peer_add "
+ "name=fs_name,type=CephString "
+ "name=uuid,type=CephString "
+ "name=remote_cluster_spec,type=CephString "
+ "name=remote_fs_name,type=CephString",
+ "add a mirror peer for a ceph filesystem", "mds", "rw")
+COMMAND("fs mirror peer_remove "
+ "name=fs_name,type=CephString "
+ "name=uuid,type=CephString ",
+ "remove a mirror peer for a ceph filesystem", "mds", "rw")
+COMMAND("fs rename "
+ "name=fs_name,type=CephString "
+ "name=new_fs_name,type=CephString,goodchars=" FS_NAME_GOODCHARS
+ " name=yes_i_really_mean_it,type=CephBool,req=false",
+ "rename a ceph file system", "mds", "rw")
+COMMAND("fs swap "
+ "name=fs1_name,type=CephString "
+ "name=fs1_id,type=CephInt,range=0 "
+ "name=fs2_name,type=CephString "
+ "name=fs2_id,type=CephInt,range=0 "
+ "name=swap_fscids,type=CephChoices,strings=yes|no,req=true "
+ "name=yes_i_really_mean_it,type=CephBool,req=false",
+ "swap ceph file system names", "mds", "rw")
+
+/*
+ * Monmap commands
+ */
+COMMAND("mon dump "
+ "name=epoch,type=CephInt,range=0,req=false",
+ "dump formatted monmap (optionally from epoch)",
+ "mon", "r")
+COMMAND("mon stat", "summarize monitor status", "mon", "r")
+COMMAND("mon getmap "
+ "name=epoch,type=CephInt,range=0,req=false",
+ "get monmap", "mon", "r")
+COMMAND("mon add "
+ "name=name,type=CephString "
+ "name=addr,type=CephIPAddr "
+ "name=location,type=CephString,n=N,goodchars=[A-Za-z0-9-_.=],req=false",
+ "add new monitor named <name> at <addr>, possibly with CRUSH location <location>", "mon", "rw")
+COMMAND("mon rm "
+ "name=name,type=CephString",
+ "remove monitor named <name>", "mon", "rw")
+COMMAND_WITH_FLAG("mon remove "
+ "name=name,type=CephString",
+ "remove monitor named <name>", "mon", "rw",
+ FLAG(DEPRECATED))
+COMMAND("mon feature ls "
+ "name=with_value,type=CephBool,req=false",
+ "list available mon map features to be set/unset",
+ "mon", "r")
+COMMAND("mon feature set "
+ "name=feature_name,type=CephString "
+ "name=yes_i_really_mean_it,type=CephBool,req=false",
+ "set provided feature on mon map",
+ "mon", "rw")
+COMMAND("mon set-rank "
+ "name=name,type=CephString "
+ "name=rank,type=CephInt",
+ "set the rank for the specified mon",
+ "mon", "rw")
+COMMAND("mon set-addrs "
+ "name=name,type=CephString "
+ "name=addrs,type=CephString",
+ "set the addrs (IPs and ports) a specific monitor binds to",
+ "mon", "rw")
+COMMAND("mon set-weight "
+ "name=name,type=CephString "
+ "name=weight,type=CephInt,range=0|65535",
+ "set the weight for the specified mon",
+ "mon", "rw")
+COMMAND("mon enable-msgr2",
+ "enable the msgr2 protocol on port 3300",
+ "mon", "rw")
+// "mon set election_strategy": one required <strategy> string.
+COMMAND("mon set election_strategy name=strategy,type=CephString",
+ "set the election strategy to use; choices classic, disallow, connectivity",
+ "mon", "rw")
+// "mon add disallowed_leader": one required monitor <name>.
+COMMAND("mon add disallowed_leader name=name,type=CephString",
+ "prevent the named mon from being a leader",
+ "mon", "rw")
+// "mon rm disallowed_leader": one required monitor <name>.
+COMMAND("mon rm disallowed_leader name=name,type=CephString",
+ "allow the named mon to be a leader again",
+ "mon", "rw")
+// "mon set_location": a monitor <name> followed by one or more <args>
+// words restricted to the characters in goodchars.
+COMMAND("mon set_location name=name,type=CephString "
+ "name=args,type=CephString,n=N,goodchars=[A-Za-z0-9-_.=]",
+ "specify location <args> for the monitor <name>, using CRUSH bucket names",
+ "mon", "rw")
+// "mon enable_stretch_mode": three required string arguments. Descriptors
+// are space-separated words of comma-separated key=value pairs, so no
+// descriptor ends with a dangling comma.
+COMMAND("mon enable_stretch_mode "
+ "name=tiebreaker_mon,type=CephString "
+ "name=new_crush_rule,type=CephString "
+ "name=dividing_bucket,type=CephString",
+ "enable stretch mode, changing the peering rules and "
+ "failure handling on all pools with <tiebreaker_mon> "
+ "as the tiebreaker and setting <dividing_bucket> locations "
+ "as the units for stretching across",
+ "mon", "rw")
+// "mon set_new_tiebreaker": monitor <name> plus an optional
+// yes_i_really_mean_it confirmation flag.
+COMMAND("mon set_new_tiebreaker name=name,type=CephString "
+ "name=yes_i_really_mean_it,type=CephBool,req=false",
+ "switch the stretch tiebreaker to be the named mon",
+ "mon", "rw")
+
+/*
+ * OSD commands
+ */
+COMMAND("osd stat", "print summary of OSD map", "osd", "r")
+COMMAND("osd dump "
+ "name=epoch,type=CephInt,range=0,req=false",
+ "print summary of OSD map", "osd", "r")
+COMMAND("osd info "
+ "name=id,type=CephOsdName,req=false",
+ "print osd's {id} information (instead of all osds from map)",
+ "osd", "r")
+COMMAND("osd tree "
+ "name=epoch,type=CephInt,range=0,req=false "
+ "name=states,type=CephChoices,strings=up|down|in|out|destroyed,n=N,req=false",
+ "print OSD tree", "osd", "r")
+COMMAND("osd tree-from "
+ "name=epoch,type=CephInt,range=0,req=false "
+ "name=bucket,type=CephString "
+ "name=states,type=CephChoices,strings=up|down|in|out|destroyed,n=N,req=false",
+ "print OSD tree in bucket", "osd", "r")
+COMMAND("osd ls "
+ "name=epoch,type=CephInt,range=0,req=false",
+ "show all OSD ids", "osd", "r")
+COMMAND("osd getmap "
+ "name=epoch,type=CephInt,range=0,req=false",
+ "get OSD map", "osd", "r")
+COMMAND("osd getcrushmap "
+ "name=epoch,type=CephInt,range=0,req=false",
+ "get CRUSH map", "osd", "r")
+COMMAND("osd getmaxosd", "show largest OSD id", "osd", "r")
+COMMAND("osd ls-tree "
+ "name=epoch,type=CephInt,range=0,req=false "
+ "name=name,type=CephString,req=true",
+ "show OSD ids under bucket <name> in the CRUSH map",
+ "osd", "r")
+COMMAND("osd find "
+ "name=id,type=CephOsdName",
+ "find osd <id> in the CRUSH map and show its location",
+ "osd", "r")
+COMMAND("osd metadata "
+ "name=id,type=CephOsdName,req=false",
+ "fetch metadata for osd {id} (default all)",
+ "osd", "r")
+COMMAND("osd count-metadata name=property,type=CephString",
+ "count OSDs by metadata field property",
+ "osd", "r")
+COMMAND("osd versions",
+ "check running versions of OSDs",
+ "osd", "r")
+COMMAND("osd numa-status",
+ "show NUMA status of OSDs",
+ "osd", "r")
+COMMAND("osd map "
+ "name=pool,type=CephPoolname "
+ "name=object,type=CephObjectname "
+ "name=nspace,type=CephString,req=false",
+ "find pg for <object> in <pool> with [namespace]", "osd", "r")
+COMMAND_WITH_FLAG("osd lspools",
+ "list pools", "osd", "r", FLAG(DEPRECATED))
+COMMAND_WITH_FLAG("osd crush rule list", "list crush rules", "osd", "r",
+ FLAG(DEPRECATED))
+COMMAND("osd crush rule ls", "list crush rules", "osd", "r")
+COMMAND("osd crush rule ls-by-class "
+ "name=class,type=CephString,goodchars=[A-Za-z0-9-_.]",
+ "list all crush rules that reference the same <class>",
+ "osd", "r")
+COMMAND("osd crush rule dump "
+ "name=name,type=CephString,goodchars=[A-Za-z0-9-_.],req=false",
+ "dump crush rule <name> (default all)",
+ "osd", "r")
+COMMAND("osd crush dump",
+ "dump crush map",
+ "osd", "r")
+COMMAND("osd setcrushmap name=prior_version,type=CephInt,req=false",
+ "set crush map from input file",
+ "osd", "rw")
+COMMAND("osd crush set name=prior_version,type=CephInt,req=false",
+ "set crush map from input file",
+ "osd", "rw")
+COMMAND("osd crush add-bucket "
+ "name=name,type=CephString,goodchars=[A-Za-z0-9-_.] "
+ "name=type,type=CephString "
+ "name=args,type=CephString,n=N,goodchars=[A-Za-z0-9-_.=],req=false",
+ "add no-parent (probably root) crush bucket <name> of type <type> "
+ "to location <args>",
+ "osd", "rw")
+COMMAND("osd crush rename-bucket "
+ "name=srcname,type=CephString,goodchars=[A-Za-z0-9-_.] "
+ "name=dstname,type=CephString,goodchars=[A-Za-z0-9-_.]",
+ "rename bucket <srcname> to <dstname>",
+ "osd", "rw")
+// "osd crush set" (per-OSD form): the help text names the arguments exactly
+// as declared in the signature (<id>, <weight>, <args>).
+COMMAND("osd crush set "
+ "name=id,type=CephOsdName "
+ "name=weight,type=CephFloat,range=0.0 "
+ "name=args,type=CephString,n=N,goodchars=[A-Za-z0-9-_.=]",
+ "update crushmap position and weight for <id> to <weight> with location <args>",
+ "osd", "rw")
+// "osd crush add": the help text names the arguments exactly as declared in
+// the signature (<id>, <weight>, <args>).
+COMMAND("osd crush add "
+ "name=id,type=CephOsdName "
+ "name=weight,type=CephFloat,range=0.0 "
+ "name=args,type=CephString,n=N,goodchars=[A-Za-z0-9-_.=]",
+ "add or update crushmap position and weight for <id> with <weight> and location <args>",
+ "osd", "rw")
+COMMAND("osd crush set-all-straw-buckets-to-straw2",
+ "convert all CRUSH current straw buckets to use the straw2 algorithm",
+ "osd", "rw")
+COMMAND("osd crush class create "
+ "name=class,type=CephString,goodchars=[A-Za-z0-9-_]",
+ "create crush device class <class>",
+ "osd", "rw")
+COMMAND("osd crush class rm "
+ "name=class,type=CephString,goodchars=[A-Za-z0-9-_]",
+ "remove crush device class <class>",
+ "osd", "rw")
+// "osd crush set-device-class": <class> followed by one or more ids.
+// The help text is split over two literals; the first must end with a
+// space so the joined sentence reads "..., or use ...".
+COMMAND("osd crush set-device-class "
+ "name=class,type=CephString "
+ "name=ids,type=CephString,n=N",
+ "set the <class> of the osd(s) <id> [<id>...], "
+ "or use <all|any> to set all.",
+ "osd", "rw")
+// "osd crush rm-device-class": one or more ids.
+// The help text is split over two literals; the first must end with a
+// space so the joined sentence reads "..., or use ...".
+COMMAND("osd crush rm-device-class "
+ "name=ids,type=CephString,n=N",
+ "remove class of the osd(s) <id> [<id>...], "
+ "or use <all|any> to remove all.",
+ "osd", "rw")
+COMMAND("osd crush class rename "
+ "name=srcname,type=CephString,goodchars=[A-Za-z0-9-_] "
+ "name=dstname,type=CephString,goodchars=[A-Za-z0-9-_]",
+ "rename crush device class <srcname> to <dstname>",
+ "osd", "rw")
+// "osd crush create-or-move": the help text names the arguments exactly as
+// declared in the signature (<id>, <weight>, <args>).
+COMMAND("osd crush create-or-move "
+ "name=id,type=CephOsdName "
+ "name=weight,type=CephFloat,range=0.0 "
+ "name=args,type=CephString,n=N,goodchars=[A-Za-z0-9-_.=]",
+ "create entry or move existing entry for <id> <weight> at/to location <args>",
+ "osd", "rw")
+COMMAND("osd crush move "
+ "name=name,type=CephString,goodchars=[A-Za-z0-9-_.] "
+ "name=args,type=CephString,n=N,goodchars=[A-Za-z0-9-_.=]",
+ "move existing entry for <name> to location <args>",
+ "osd", "rw")
+// "osd crush swap-bucket": two bucket names, <source> and <dest>, plus an
+// optional yes_i_really_mean_it confirmation flag.
+COMMAND("osd crush swap-bucket "
+ "name=source,type=CephString,goodchars=[A-Za-z0-9-_.] "
+ "name=dest,type=CephString,goodchars=[A-Za-z0-9-_.] "
+ "name=yes_i_really_mean_it,type=CephBool,req=false",
+ "swap existing bucket contents from (orphan) bucket <source> and <dest>",
+ "osd", "rw")
+COMMAND("osd crush link "
+ "name=name,type=CephString "
+ "name=args,type=CephString,n=N,goodchars=[A-Za-z0-9-_.=]",
+ "link existing entry for <name> under location <args>",
+ "osd", "rw")
+COMMAND("osd crush rm "
+ "name=name,type=CephString,goodchars=[A-Za-z0-9-_.] "
+ "name=ancestor,type=CephString,req=false,goodchars=[A-Za-z0-9-_.]",
+ "remove <name> from crush map (everywhere, or just at <ancestor>)",\
+ "osd", "rw")
+COMMAND_WITH_FLAG("osd crush remove "
+ "name=name,type=CephString,goodchars=[A-Za-z0-9-_.] "
+ "name=ancestor,type=CephString,req=false,goodchars=[A-Za-z0-9-_.]",
+ "remove <name> from crush map (everywhere, or just at <ancestor>)",
+ "osd", "rw",
+ FLAG(DEPRECATED))
+COMMAND("osd crush unlink "
+ "name=name,type=CephString,goodchars=[A-Za-z0-9-_.] "
+ "name=ancestor,type=CephString,req=false,goodchars=[A-Za-z0-9-_.]",
+ "unlink <name> from crush map (everywhere, or just at <ancestor>)",
+ "osd", "rw")
+COMMAND("osd crush reweight-all",
+ "recalculate the weights for the tree to ensure they sum correctly",
+ "osd", "rw")
+COMMAND("osd crush reweight "
+ "name=name,type=CephString,goodchars=[A-Za-z0-9-_.] "
+ "name=weight,type=CephFloat,range=0.0",
+ "change <name>'s weight to <weight> in crush map",
+ "osd", "rw")
+COMMAND("osd crush reweight-subtree "
+ "name=name,type=CephString,goodchars=[A-Za-z0-9-_.] "
+ "name=weight,type=CephFloat,range=0.0",
+ "change all leaf items beneath <name> to <weight> in crush map",
+ "osd", "rw")
+COMMAND("osd crush tunables "
+ "name=profile,type=CephChoices,strings=legacy|argonaut|bobtail|firefly|hammer|jewel|optimal|default",
+ "set crush tunables values to <profile>", "osd", "rw")
+COMMAND("osd crush set-tunable "
+ "name=tunable,type=CephChoices,strings=straw_calc_version "
+ "name=value,type=CephInt",
+ "set crush tunable <tunable> to <value>",
+ "osd", "rw")
+COMMAND("osd crush get-tunable "
+ "name=tunable,type=CephChoices,strings=straw_calc_version",
+ "get crush tunable <tunable>",
+ "osd", "r")
+COMMAND("osd crush show-tunables",
+ "show current crush tunables", "osd", "r")
+COMMAND("osd crush rule create-simple "
+ "name=name,type=CephString,goodchars=[A-Za-z0-9-_.] "
+ "name=root,type=CephString,goodchars=[A-Za-z0-9-_.] "
+ "name=type,type=CephString,goodchars=[A-Za-z0-9-_.] "
+ "name=mode,type=CephChoices,strings=firstn|indep,req=false",
+ "create crush rule <name> to start from <root>, replicate across buckets of type <type>, using a choose mode of <firstn|indep> (default firstn; indep best for erasure pools)",
+ "osd", "rw")
+COMMAND("osd crush rule create-replicated "
+ "name=name,type=CephString,goodchars=[A-Za-z0-9-_.] "
+ "name=root,type=CephString,goodchars=[A-Za-z0-9-_.] "
+ "name=type,type=CephString,goodchars=[A-Za-z0-9-_.] "
+ "name=class,type=CephString,goodchars=[A-Za-z0-9-_.],req=false",
+ "create crush rule <name> for replicated pool to start from <root>, replicate across buckets of type <type>, use devices of type <class> (ssd or hdd)",
+ "osd", "rw")
+COMMAND("osd crush rule create-erasure "
+ "name=name,type=CephString,goodchars=[A-Za-z0-9-_.] "
+ "name=profile,type=CephString,req=false,goodchars=[A-Za-z0-9-_.=]",
+ "create crush rule <name> for erasure coded pool created with <profile> (default default)",
+ "osd", "rw")
+COMMAND("osd crush rule rm "
+ "name=name,type=CephString,goodchars=[A-Za-z0-9-_.] ",
+ "remove crush rule <name>", "osd", "rw")
+COMMAND("osd crush rule rename "
+ "name=srcname,type=CephString,goodchars=[A-Za-z0-9-_.] "
+ "name=dstname,type=CephString,goodchars=[A-Za-z0-9-_.]",
+ "rename crush rule <srcname> to <dstname>",
+ "osd", "rw")
+COMMAND("osd crush tree "
+ "name=show_shadow,type=CephBool,req=false",
+ "dump crush buckets and items in a tree view",
+ "osd", "r")
+COMMAND("osd crush ls name=node,type=CephString,goodchars=[A-Za-z0-9-_.]",
+ "list items beneath a node in the CRUSH tree",
+ "osd", "r")
+COMMAND("osd crush class ls",
+ "list all crush device classes",
+ "osd", "r")
+COMMAND("osd crush class ls-osd "
+ "name=class,type=CephString,goodchars=[A-Za-z0-9-_]",
+ "list all osds belonging to the specific <class>",
+ "osd", "r")
+COMMAND("osd crush get-device-class "
+ "name=ids,type=CephString,n=N",
+ "get classes of specified osd(s) <id> [<id>...]",
+ "osd", "r")
+COMMAND("osd crush weight-set ls",
+ "list crush weight sets",
+ "osd", "r")
+COMMAND("osd crush weight-set dump",
+ "dump crush weight sets",
+ "osd", "r")
+COMMAND("osd crush weight-set create-compat",
+ "create a default backward-compatible weight-set",
+ "osd", "rw")
+COMMAND("osd crush weight-set create "
+ "name=pool,type=CephPoolname "\
+ "name=mode,type=CephChoices,strings=flat|positional",
+ "create a weight-set for a given pool",
+ "osd", "rw")
+COMMAND("osd crush weight-set rm name=pool,type=CephPoolname",
+ "remove the weight-set for a given pool",
+ "osd", "rw")
+COMMAND("osd crush weight-set rm-compat",
+ "remove the backward-compatible weight-set",
+ "osd", "rw")
+COMMAND("osd crush weight-set reweight "
+ "name=pool,type=CephPoolname "
+ "name=item,type=CephString "
+ "name=weight,type=CephFloat,range=0.0,n=N",
+ "set weight for an item (bucket or osd) in a pool's weight-set",
+ "osd", "rw")
+COMMAND("osd crush weight-set reweight-compat "
+ "name=item,type=CephString "
+ "name=weight,type=CephFloat,range=0.0,n=N",
+ "set weight for an item (bucket or osd) in the backward-compatible weight-set",
+ "osd", "rw")
+COMMAND("osd setmaxosd "
+ "name=newmax,type=CephInt,range=0",
+ "set new maximum osd value", "osd", "rw")
+COMMAND("osd set-full-ratio "
+ "name=ratio,type=CephFloat,range=0.0|1.0",
+ "set usage ratio at which OSDs are marked full",
+ "osd", "rw")
+COMMAND("osd set-backfillfull-ratio "
+ "name=ratio,type=CephFloat,range=0.0|1.0",
+ "set usage ratio at which OSDs are marked too full to backfill",
+ "osd", "rw")
+COMMAND("osd set-nearfull-ratio "
+ "name=ratio,type=CephFloat,range=0.0|1.0",
+ "set usage ratio at which OSDs are marked near-full",
+ "osd", "rw")
+COMMAND("osd get-require-min-compat-client",
+ "get the minimum client version we will maintain compatibility with",
+ "osd", "r")
+COMMAND("osd set-require-min-compat-client "
+ "name=version,type=CephString "
+ "name=yes_i_really_mean_it,type=CephBool,req=false",
+ "set the minimum client version we will maintain compatibility with",
+ "osd", "rw")
+COMMAND("osd pause", "pause osd", "osd", "rw")
+COMMAND("osd unpause", "unpause osd", "osd", "rw")
+COMMAND("osd erasure-code-profile set "
+ "name=name,type=CephString,goodchars=[A-Za-z0-9-_.] "
+ "name=profile,type=CephString,n=N,req=false "
+ "name=force,type=CephBool,req=false "
+ "name=yes_i_really_mean_it,type=CephBool,req=false",
+ "create erasure code profile <name> with [<key[=value]> ...] pairs. Add a --force at the end to override an existing profile (VERY DANGEROUS)",
+ "osd", "rw")
+COMMAND("osd erasure-code-profile get "
+ "name=name,type=CephString,goodchars=[A-Za-z0-9-_.]",
+ "get erasure code profile <name>",
+ "osd", "r")
+COMMAND("osd erasure-code-profile rm "
+ "name=name,type=CephString,goodchars=[A-Za-z0-9-_.]",
+ "remove erasure code profile <name>",
+ "osd", "rw")
+COMMAND("osd erasure-code-profile ls",
+ "list all erasure code profiles",
+ "osd", "r")
+COMMAND("osd set "
+ "name=key,type=CephChoices,strings=full|pause|noup|nodown|"
+ "noout|noin|nobackfill|norebalance|norecover|noscrub|nodeep-scrub|"
+ "notieragent|nosnaptrim|pglog_hardlimit|noautoscale "
+ "name=yes_i_really_mean_it,type=CephBool,req=false",
+ "set <key>", "osd", "rw")
+// "osd unset": <key> must be one of the listed flag names.
+COMMAND("osd unset "
+ "name=key,type=CephChoices,strings="
+ "full|pause|noup|nodown|noout|noin|"
+ "nobackfill|norebalance|norecover|"
+ "noscrub|nodeep-scrub|notieragent|"
+ "nosnaptrim|noautoscale",
+ "unset <key>",
+ "osd", "rw")
+// "osd require-osd-release": <release> is chosen from a fixed list; the
+// yes_i_really_mean_it confirmation flag is optional.
+COMMAND("osd require-osd-release "
+ "name=release,type=CephChoices,"
+ "strings=octopus|pacific|quincy|reef|squid "
+ "name=yes_i_really_mean_it,type=CephBool,req=false",
+ "set the minimum allowed OSD release to participate in the cluster",
+ "osd", "rw")
+COMMAND("osd down "
+ "name=ids,type=CephString,n=N "
+ "name=definitely_dead,type=CephBool,req=false",
+ "set osd(s) <id> [<id>...] down, "
+ "or use <any|all> to set all osds down",
+ "osd", "rw")
+// "osd stop": one or more ids. The descriptor lists name= before type=,
+// like the other entries in this table.
+COMMAND("osd stop "
+ "name=ids,type=CephString,n=N",
+ "stop the corresponding osd daemons and mark them as down",
+ "osd", "rw")
+COMMAND("osd out "
+ "name=ids,type=CephString,n=N",
+ "set osd(s) <id> [<id>...] out, "
+ "or use <any|all> to set all osds out",
+ "osd", "rw")
+COMMAND("osd in "
+ "name=ids,type=CephString,n=N",
+ "set osd(s) <id> [<id>...] in, "
+ "can use <any|all> to automatically set all previously out osds in",
+ "osd", "rw")
+COMMAND_WITH_FLAG("osd rm "
+ "name=ids,type=CephString,n=N",
+ "remove osd(s) <id> [<id>...], "
+ "or use <any|all> to remove all osds",
+ "osd", "rw",
+ FLAG(DEPRECATED))
+COMMAND_WITH_FLAG("osd add-noup "
+ "name=ids,type=CephString,n=N",
+ "mark osd(s) <id> [<id>...] as noup, "
+ "or use <all|any> to mark all osds as noup",
+ "osd", "rw",
+ FLAG(DEPRECATED))
+COMMAND_WITH_FLAG("osd add-nodown "
+ "name=ids,type=CephString,n=N",
+ "mark osd(s) <id> [<id>...] as nodown, "
+ "or use <all|any> to mark all osds as nodown",
+ "osd", "rw",
+ FLAG(DEPRECATED))
+COMMAND_WITH_FLAG("osd add-noin "
+ "name=ids,type=CephString,n=N",
+ "mark osd(s) <id> [<id>...] as noin, "
+ "or use <all|any> to mark all osds as noin",
+ "osd", "rw",
+ FLAG(DEPRECATED))
+COMMAND_WITH_FLAG("osd add-noout "
+ "name=ids,type=CephString,n=N",
+ "mark osd(s) <id> [<id>...] as noout, "
+ "or use <all|any> to mark all osds as noout",
+ "osd", "rw",
+ FLAG(DEPRECATED))
+COMMAND_WITH_FLAG("osd rm-noup "
+ "name=ids,type=CephString,n=N",
+ "allow osd(s) <id> [<id>...] to be marked up "
+ "(if they are currently marked as noup), "
+ "can use <all|any> to automatically filter out all noup osds",
+ "osd", "rw",
+ FLAG(DEPRECATED))
+COMMAND_WITH_FLAG("osd rm-nodown "
+ "name=ids,type=CephString,n=N",
+ "allow osd(s) <id> [<id>...] to be marked down "
+ "(if they are currently marked as nodown), "
+ "can use <all|any> to automatically filter out all nodown osds",
+ "osd", "rw",
+ FLAG(DEPRECATED))
+COMMAND_WITH_FLAG("osd rm-noin "
+ "name=ids,type=CephString,n=N",
+ "allow osd(s) <id> [<id>...] to be marked in "
+ "(if they are currently marked as noin), "
+ "can use <all|any> to automatically filter out all noin osds",
+ "osd", "rw",
+ FLAG(DEPRECATED))
+COMMAND_WITH_FLAG("osd rm-noout "
+ "name=ids,type=CephString,n=N",
+ "allow osd(s) <id> [<id>...] to be marked out "
+ "(if they are currently marked as noout), "
+ "can use <all|any> to automatically filter out all noout osds",
+ "osd", "rw",
+ FLAG(DEPRECATED))
+COMMAND("osd set-group "
+ "name=flags,type=CephString "
+ "name=who,type=CephString,n=N",
+ "set <flags> for batch osds or crush nodes, "
+ "<flags> must be a comma-separated subset of {noup,nodown,noin,noout}",
+ "osd", "rw")
+COMMAND("osd unset-group "
+ "name=flags,type=CephString "
+ "name=who,type=CephString,n=N",
+ "unset <flags> for batch osds or crush nodes, "
+ "<flags> must be a comma-separated subset of {noup,nodown,noin,noout}",
+ "osd", "rw")
+// "osd reweight": <weight> is constrained by range=0.0|1.0. The help text
+// states the bounds as inclusive, the same wording "osd primary-affinity"
+// uses for the identical range (NOTE(review): confirm the range check is
+// inclusive at both ends).
+COMMAND("osd reweight "
+ "name=id,type=CephOsdName "
+ "name=weight,type=CephFloat,range=0.0|1.0",
+ "reweight osd to 0.0 <= <weight> <= 1.0", "osd", "rw")
+COMMAND("osd reweightn "
+ "name=weights,type=CephString",
+ "reweight osds with {<id>: <weight>,...}",
+ "osd", "rw")
+// "osd force-create-pg": a required <pgid> plus an optional
+// yes_i_really_mean_it confirmation flag.
+COMMAND("osd force-create-pg name=pgid,type=CephPgid "
+ "name=yes_i_really_mean_it,type=CephBool,req=false",
+ "force creation of pg <pgid>",
+ "osd", "rw")
+COMMAND("osd pg-temp "
+ "name=pgid,type=CephPgid "
+ "name=id,type=CephOsdName,n=N,req=false",
+ "set pg_temp mapping <pgid>:[<id> [<id>...]] (developers only)",
+ "osd", "rw")
+COMMAND("osd pg-upmap "
+ "name=pgid,type=CephPgid "
+ "name=id,type=CephOsdName,n=N",
+ "set pg_upmap mapping <pgid>:[<id> [<id>...]] (developers only)",
+ "osd", "rw")
+COMMAND("osd rm-pg-upmap "
+ "name=pgid,type=CephPgid",
+ "clear pg_upmap mapping for <pgid> (developers only)",
+ "osd", "rw")
+COMMAND("osd pg-upmap-items "
+ "name=pgid,type=CephPgid "
+ "name=id,type=CephOsdName,n=N",
+ "set pg_upmap_items mapping <pgid>:{<id> to <id>, [...]} (developers only)",
+ "osd", "rw")
+COMMAND("osd rm-pg-upmap-items "
+ "name=pgid,type=CephPgid",
+ "clear pg_upmap_items mapping for <pgid> (developers only)",
+ "osd", "rw")
+COMMAND("osd pg-upmap-primary "
+ "name=pgid,type=CephPgid "
+ "name=id,type=CephOsdName ",
+ "set pg primary osd <pgid>:<id> (id (osd) must be part of pgid)",
+ "osd", "rw")
+COMMAND("osd rm-pg-upmap-primary "
+ "name=pgid,type=CephPgid ",
+ "clear pg primary setting for <pgid>",
+ "osd", "rw")
+COMMAND("osd primary-temp "
+ "name=pgid,type=CephPgid "
+ "name=id,type=CephOsdName",
+ "set primary_temp mapping pgid:<id> (developers only)",
+ "osd", "rw")
+COMMAND("osd rm-primary-temp "
+ "name=pgid,type=CephPgid ",
+ "clear primary_temp mapping pgid (developers only)",
+ "osd", "rw")
+// "osd primary-affinity": an OSD <id> and a <weight> constrained by
+// range=0.0|1.0. The descriptor lists name= before type=, like the other
+// entries in this table.
+COMMAND("osd primary-affinity "
+ "name=id,type=CephOsdName "
+ "name=weight,type=CephFloat,range=0.0|1.0",
+ "adjust osd primary-affinity from 0.0 <= <weight> <= 1.0",
+ "osd", "rw")
+COMMAND_WITH_FLAG("osd destroy-actual "
+ "name=id,type=CephOsdName "
+ "name=yes_i_really_mean_it,type=CephBool,req=false",
+ "mark osd as being destroyed. Keeps the ID intact (allowing reuse), "
+ "but removes cephx keys, config-key data and lockbox keys, "\
+ "rendering data permanently unreadable.",
+ "osd", "rw", FLAG(HIDDEN))
+COMMAND("osd purge-new "
+ "name=id,type=CephOsdName "
+ "name=yes_i_really_mean_it,type=CephBool,req=false",
+ "purge all traces of an OSD that was partially created but never "
+ "started",
+ "osd", "rw")
+COMMAND_WITH_FLAG("osd purge-actual "
+ "name=id,type=CephOsdName "
+ "name=yes_i_really_mean_it,type=CephBool,req=false",
+ "purge all osd data from the monitors. Combines `osd destroy`, "
+ "`osd rm`, and `osd crush rm`.",
+ "osd", "rw", FLAG(HIDDEN))
+COMMAND("osd lost "
+ "name=id,type=CephOsdName "
+ "name=yes_i_really_mean_it,type=CephBool,req=false",
+ "mark osd as permanently lost. THIS DESTROYS DATA IF NO MORE REPLICAS EXIST, BE CAREFUL",
+ "osd", "rw")
+COMMAND_WITH_FLAG("osd create "
+ "name=uuid,type=CephUUID,req=false "
+ "name=id,type=CephOsdName,req=false",
+ "create new osd (with optional UUID and ID)", "osd", "rw",
+ FLAG(DEPRECATED))
+COMMAND("osd new "
+ "name=uuid,type=CephUUID,req=true "
+ "name=id,type=CephOsdName,req=false",
+ "Create a new OSD. If supplied, the `id` to be replaced needs to "
+ "exist and have been previously destroyed. "
+ "Reads secrets from JSON file via `-i <file>` (see man page).",
+ "osd", "rw")
+COMMAND("osd blocklist "
+ "name=range,type=CephString,goodchars=[range],req=false "
+ "name=blocklistop,type=CephChoices,strings=add|rm "
+ "name=addr,type=CephEntityAddr "
+ "name=expire,type=CephFloat,range=0.0,req=false",
+ "add (optionally until <expire> seconds from now) or remove <addr> from blocklist",
+ "osd", "rw")
+COMMAND("osd blocklist ls", "show blocklisted clients", "osd", "r")
+COMMAND("osd blocklist clear", "clear all blocklisted clients", "osd", "rw")
+
+COMMAND_WITH_FLAG("osd blacklist "
+ "name=blacklistop,type=CephChoices,strings=add|rm "
+ "name=addr,type=CephEntityAddr "
+ "name=expire,type=CephFloat,range=0.0,req=false",
+ "add (optionally until <expire> seconds from now) or remove <addr> from blacklist",
+ "osd", "rw",
+ FLAG(DEPRECATED))
+COMMAND_WITH_FLAG("osd blacklist ls", "show blacklisted clients", "osd", "r",
+ FLAG(DEPRECATED))
+COMMAND_WITH_FLAG("osd blacklist clear", "clear all blacklisted clients", "osd", "rw",
+ FLAG(DEPRECATED))
+
+COMMAND("osd pool mksnap "
+ "name=pool,type=CephPoolname "
+ "name=snap,type=CephString",
+ "make snapshot <snap> in <pool>", "osd", "rw")
+COMMAND("osd pool rmsnap "
+ "name=pool,type=CephPoolname "
+ "name=snap,type=CephString",
+ "remove snapshot <snap> from <pool>", "osd", "rw")
+COMMAND("osd pool ls "
+ "name=detail,type=CephChoices,strings=detail,req=false",
+ "list pools", "osd", "r")
+// "osd pool create": only <pool> is required; every other argument is
+// optional (req=false).
+// Adjacent string literals are concatenated, so each descriptor except the
+// last must end with a space. Without it two descriptors fuse into a single
+// word and the second argument (<crimson>, following yes_i_really_mean_it)
+// is never declared on its own.
+COMMAND("osd pool create "
+ "name=pool,type=CephPoolname "
+ "name=pg_num,type=CephInt,range=0,req=false "
+ "name=pgp_num,type=CephInt,range=0,req=false "
+ "name=pool_type,type=CephChoices,strings=replicated|erasure,req=false "
+ "name=erasure_code_profile,type=CephString,req=false,goodchars=[A-Za-z0-9-_.] "
+ "name=rule,type=CephString,req=false "
+ "name=expected_num_objects,type=CephInt,range=0,req=false "
+ "name=size,type=CephInt,range=0,req=false "
+ "name=pg_num_min,type=CephInt,range=0,req=false "
+ "name=pg_num_max,type=CephInt,range=0,req=false "
+ "name=autoscale_mode,type=CephChoices,strings=on|off|warn,req=false "
+ "name=bulk,type=CephBool,req=false "
+ "name=target_size_bytes,type=CephInt,range=0,req=false "
+ "name=target_size_ratio,type=CephFloat,range=0.0,req=false "
+ "name=yes_i_really_mean_it,type=CephBool,req=false "
+ "name=crimson,type=CephBool,req=false",
+ "create pool", "osd", "rw")
+// `osd pool delete` is registered with FLAG(DEPRECATED); `osd pool rm`
+// declares the same arguments. Both take the pool name, an optional second
+// pool name, and two optional confirmation booleans.
+COMMAND_WITH_FLAG("osd pool delete "
+ "name=pool,type=CephPoolname "
+ "name=pool2,type=CephPoolname,req=false "
+ "name=yes_i_really_really_mean_it,type=CephBool,req=false "
+ "name=yes_i_really_really_mean_it_not_faking,type=CephBool,req=false ",
+ "delete pool",
+ "osd", "rw",
+ FLAG(DEPRECATED))
+COMMAND("osd pool rm "
+ "name=pool,type=CephPoolname "
+ "name=pool2,type=CephPoolname,req=false "
+ "name=yes_i_really_really_mean_it,type=CephBool,req=false "
+ "name=yes_i_really_really_mean_it_not_faking,type=CephBool,req=false ",
+ "remove pool",
+ "osd", "rw")
+COMMAND("osd pool rename "
+ "name=srcpool,type=CephPoolname "
+ "name=destpool,type=CephPoolname "
+ "name=yes_i_really_mean_it,type=CephBool,req=false",
+ "rename <srcpool> to <destpool>", "osd", "rw")
+// Read or write a single pool parameter. The `var` choice lists for get and
+// set are spelled out separately and are not identical: `all` and
+// `erasure_code_profile` appear only in the get list, `pgp_num_actual` only
+// in the set list. A new parameter has to be added to each list it applies to.
+COMMAND("osd pool get "
+ "name=pool,type=CephPoolname "
+ "name=var,type=CephChoices,strings=size|min_size|pg_num|pgp_num|crush_rule|hashpspool|nodelete|nopgchange|nosizechange|write_fadvise_dontneed|noscrub|nodeep-scrub|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|use_gmt_hitset|target_max_objects|target_max_bytes|cache_target_dirty_ratio|cache_target_dirty_high_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|erasure_code_profile|min_read_recency_for_promote|all|min_write_recency_for_promote|fast_read|hit_set_grade_decay_rate|hit_set_search_last_n|scrub_min_interval|scrub_max_interval|deep_scrub_interval|recovery_priority|recovery_op_priority|scrub_priority|compression_mode|compression_algorithm|compression_required_ratio|compression_max_blob_size|compression_min_blob_size|csum_type|csum_min_block|csum_max_block|allow_ec_overwrites|fingerprint_algorithm|pg_autoscale_mode|pg_autoscale_bias|pg_num_min|pg_num_max|target_size_bytes|target_size_ratio|dedup_tier|dedup_chunk_algorithm|dedup_cdc_chunk_size|eio|bulk|read_ratio",
+ "get pool parameter <var>", "osd", "r")
+// `val` is passed as a CephString; the optional yes_i_really_mean_it flag is
+// declared last.
+COMMAND("osd pool set "
+ "name=pool,type=CephPoolname "
+ "name=var,type=CephChoices,strings=size|min_size|pg_num|pgp_num|pgp_num_actual|crush_rule|hashpspool|nodelete|nopgchange|nosizechange|write_fadvise_dontneed|noscrub|nodeep-scrub|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|use_gmt_hitset|target_max_bytes|target_max_objects|cache_target_dirty_ratio|cache_target_dirty_high_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|min_read_recency_for_promote|min_write_recency_for_promote|fast_read|hit_set_grade_decay_rate|hit_set_search_last_n|scrub_min_interval|scrub_max_interval|deep_scrub_interval|recovery_priority|recovery_op_priority|scrub_priority|compression_mode|compression_algorithm|compression_required_ratio|compression_max_blob_size|compression_min_blob_size|csum_type|csum_min_block|csum_max_block|allow_ec_overwrites|fingerprint_algorithm|pg_autoscale_mode|pg_autoscale_bias|pg_num_min|pg_num_max|target_size_bytes|target_size_ratio|dedup_tier|dedup_chunk_algorithm|dedup_cdc_chunk_size|eio|bulk|read_ratio "
+ "name=val,type=CephString "
+ "name=yes_i_really_mean_it,type=CephBool,req=false",
+ "set pool parameter <var> to <val>", "osd", "rw")
+// 'val' is a CephString because it can include a unit. Perhaps
+// there should be a Python type for validation/conversion of strings
+// with units.
+COMMAND("osd pool set-quota "
+ "name=pool,type=CephPoolname "
+ "name=field,type=CephChoices,strings=max_objects|max_bytes "
+ "name=val,type=CephString",
+ "set object or byte limit on pool", "osd", "rw")
+COMMAND("osd pool get-quota "
+ "name=pool,type=CephPoolname ",
+ "obtain object or byte limits for pool",
+ "osd", "r")
+// Per-pool application tags and their key/value metadata. Only `enable`
+// (for <app>) and `set` (for <key> and <value>) restrict the accepted
+// characters through goodchars; `disable` and `rm` take unrestricted
+// CephStrings.
+COMMAND("osd pool application enable "
+ "name=pool,type=CephPoolname "
+ "name=app,type=CephString,goodchars=[A-Za-z0-9-_.] "
+ "name=yes_i_really_mean_it,type=CephBool,req=false",
+ "enable use of an application <app> [cephfs,rbd,rgw] on pool <poolname>",
+ "osd", "rw")
+COMMAND("osd pool application disable "
+ "name=pool,type=CephPoolname "
+ "name=app,type=CephString "
+ "name=yes_i_really_mean_it,type=CephBool,req=false",
+ "disables use of an application <app> on pool <poolname>",
+ "osd", "rw")
+COMMAND("osd pool application set "
+ "name=pool,type=CephPoolname "
+ "name=app,type=CephString "
+ "name=key,type=CephString,goodchars=[A-Za-z0-9-_.] "
+ "name=value,type=CephString,goodchars=[A-Za-z0-9-_.=]",
+ "sets application <app> metadata key <key> to <value> on pool <poolname>",
+ "osd", "rw")
+COMMAND("osd pool application rm "
+ "name=pool,type=CephPoolname "
+ "name=app,type=CephString "
+ "name=key,type=CephString",
+ "removes application <app> metadata key <key> on pool <poolname>",
+ "osd", "rw")
+// Query application metadata. All three arguments are declared optional
+// (req=false). The pool descriptor used to read `req=fasle`; the value is
+// now spelled `false` like every other optional argument in this table.
+COMMAND("osd pool application get "
+        "name=pool,type=CephPoolname,req=false "
+        "name=app,type=CephString,req=false "
+        "name=key,type=CephString,req=false",
+        "get value of key <key> of application <app> on pool <poolname>",
+        "osd", "r")
+COMMAND("osd utilization",
+ "get basic pg distribution stats",
+ "osd", "r")
+// The three commands below each take a single optional
+// yes_i_really_mean_it boolean and require "rw" on the osd module.
+COMMAND("osd force_healthy_stretch_mode " \
+ "name=yes_i_really_mean_it,type=CephBool,req=false",
+ "force a healthy stretch mode, requiring the full number of CRUSH buckets "
+ "to peer and letting all non-tiebreaker monitors be elected leader ",
+ "osd", "rw")
+COMMAND("osd force_recovery_stretch_mode " \
+ "name=yes_i_really_mean_it,type=CephBool,req=false",
+ "try and force a recovery stretch mode, increasing the "
+ "pool size to its non-failure value if currently degraded and "
+ "all monitor buckets are up",
+ "osd", "rw")
+COMMAND("osd set-allow-crimson " \
+ "name=yes_i_really_mean_it,type=CephBool,req=false",
+ "Allow crimson-osds to boot and join the cluster. Note, crimson-osd is "
+ "not yet considered stable and may crash or cause data loss -- should "
+ "be avoided outside of testing and development. This setting is "
+ "irrevocable",
+ "osd", "rw")
+
+
+// tiering
+//
+// Argument order is always <pool> (the base pool) first, then the tier /
+// overlay pool. `osd tier remove` and `osd tier remove-overlay` are the
+// FLAG(DEPRECATED) spellings of `osd tier rm` and `osd tier rm-overlay`.
+COMMAND("osd tier add "
+ "name=pool,type=CephPoolname "
+ "name=tierpool,type=CephPoolname "
+ "name=force_nonempty,type=CephBool,req=false",
+ "add the tier <tierpool> (the second one) to base pool <pool> (the first one)",
+ "osd", "rw")
+COMMAND("osd tier rm "
+ "name=pool,type=CephPoolname "
+ "name=tierpool,type=CephPoolname",
+ "remove the tier <tierpool> (the second one) from base pool <pool> (the first one)",
+ "osd", "rw")
+COMMAND_WITH_FLAG("osd tier remove "
+ "name=pool,type=CephPoolname "
+ "name=tierpool,type=CephPoolname",
+ "remove the tier <tierpool> (the second one) from base pool <pool> (the first one)",
+ "osd", "rw",
+ FLAG(DEPRECATED))
+COMMAND("osd tier cache-mode "
+ "name=pool,type=CephPoolname "
+ "name=mode,type=CephChoices,strings=writeback|proxy|readproxy|readonly|none "
+ "name=yes_i_really_mean_it,type=CephBool,req=false",
+ "specify the caching mode for cache tier <pool>", "osd", "rw")
+COMMAND("osd tier set-overlay "
+ "name=pool,type=CephPoolname "
+ "name=overlaypool,type=CephPoolname",
+ "set the overlay pool for base pool <pool> to be <overlaypool>", "osd", "rw")
+COMMAND("osd tier rm-overlay "
+ "name=pool,type=CephPoolname ",
+ "remove the overlay pool for base pool <pool>", "osd", "rw")
+COMMAND_WITH_FLAG("osd tier remove-overlay "
+ "name=pool,type=CephPoolname ",
+ "remove the overlay pool for base pool <pool>", "osd", "rw",
+ FLAG(DEPRECATED))
+
+// Takes the base pool, the cache pool, and a required non-negative <size>.
+COMMAND("osd tier add-cache "
+ "name=pool,type=CephPoolname "
+ "name=tierpool,type=CephPoolname "
+ "name=size,type=CephInt,range=0",
+ "add a cache <tierpool> (the second one) of size <size> to existing pool <pool> (the first one)",
+ "osd", "rw")
+
+/*
+ * mon/KVMonitor.cc
+ *
+ * config-key commands. `put`, `del` and `list` are registered with
+ * FLAG(DEPRECATED); `set`, `rm` and `ls` are the non-deprecated entries
+ * with the same argument lists.
+ */
+
+COMMAND("config-key get "
+ "name=key,type=CephString",
+ "get <key>", "config-key", "r")
+// <val> is optional for both `set` and the deprecated `put`.
+COMMAND("config-key set "
+ "name=key,type=CephString "
+ "name=val,type=CephString,req=false",
+ "set <key> to value <val>", "config-key", "rw")
+COMMAND_WITH_FLAG("config-key put "
+ "name=key,type=CephString "
+ "name=val,type=CephString,req=false",
+ "put <key>, value <val>", "config-key", "rw",
+ FLAG(DEPRECATED))
+COMMAND_WITH_FLAG("config-key del "
+ "name=key,type=CephString",
+ "delete <key>", "config-key", "rw",
+ FLAG(DEPRECATED))
+COMMAND("config-key rm "
+ "name=key,type=CephString",
+ "rm <key>", "config-key", "rw")
+COMMAND("config-key exists "
+ "name=key,type=CephString",
+ "check for <key>'s existence", "config-key", "r")
+COMMAND_WITH_FLAG("config-key list ", "list keys", "config-key", "r",
+ FLAG(DEPRECATED))
+COMMAND("config-key ls ", "list keys", "config-key", "r")
+// The optional <key> argument is described by the help text as a prefix.
+COMMAND("config-key dump "
+ "name=key,type=CephString,req=false", "dump keys and values (with optional prefix)", "config-key", "r")
+
+
+/*
+ * mon/MgrMonitor.cc
+ *
+ * Commands registered under the "mgr" module. Read-only queries use "r";
+ * `mgr set`, `mgr fail` and `mgr module enable|disable` use "rw".
+ */
+COMMAND("mgr stat",
+ "dump basic info about the mgr cluster state",
+ "mgr", "r")
+COMMAND("mgr dump "
+ "name=epoch,type=CephInt,range=0,req=false",
+ "dump the latest MgrMap",
+ "mgr", "r")
+// `down` is the only <var> this command accepts.
+COMMAND("mgr set "
+ "name=var,type=CephChoices,strings=down "
+ "name=val,type=CephString ",
+ "set mgr parameter <var> to <val>", "mgr", "rw")
+COMMAND("mgr fail name=who,type=CephString,req=false",
+ "treat the named manager daemon as failed", "mgr", "rw")
+COMMAND("mgr module ls",
+ "list active mgr modules", "mgr", "r")
+COMMAND("mgr services",
+ "list service endpoints provided by mgr modules",
+ "mgr", "r")
+COMMAND("mgr module enable "
+ "name=module,type=CephString "
+ "name=force,type=CephBool,req=false",
+ "enable mgr module", "mgr", "rw")
+COMMAND("mgr module disable "
+ "name=module,type=CephString",
+ "disable mgr module", "mgr", "rw")
+COMMAND("mgr metadata name=who,type=CephString,req=false",
+ "dump metadata for all daemons or a specific daemon",
+ "mgr", "r")
+COMMAND("mgr count-metadata name=property,type=CephString",
+ "count ceph-mgr daemons by metadata field property",
+ "mgr", "r")
+COMMAND("mgr versions",
+ "check running versions of ceph-mgr daemons",
+ "mgr", "r")
+
+// ConfigMonitor
+//
+// `config set` and `config rm` put the separating space at the start of
+// each continuation literal instead of at the end of the previous one; the
+// concatenated signature is separated by single spaces either way.
+COMMAND("config set"
+ " name=who,type=CephString"
+ " name=name,type=CephString"
+ " name=value,type=CephString"
+ " name=force,type=CephBool,req=false",
+ "Set a configuration option for one or more entities",
+ "config", "rw")
+COMMAND("config rm"
+ " name=who,type=CephString"
+ " name=name,type=CephString",
+ "Clear a configuration option for one or more entities",
+ "config", "rw")
+COMMAND("config get "
+ "name=who,type=CephString "
+ "name=key,type=CephString,req=false",
+ "Show configuration option(s) for an entity",
+ "config", "r")
+// NOTE(review): this is the only `config ...` command registered under the
+// "mon" module rather than "config" -- confirm that this is intentional.
+COMMAND("config dump",
+ "Show all configuration option(s)",
+ "mon", "r")
+COMMAND("config help "
+ "name=key,type=CephString",
+ "Describe a configuration option",
+ "config", "r")
+COMMAND("config ls",
+ "List available configuration options",
+ "config", "r")
+COMMAND("config assimilate-conf",
+ "Assimilate options from a conf, and return a new, minimal conf file",
+ "config", "rw")
+// <num> is optional and unbounded here, whereas `config reset` below
+// requires it and constrains it with range=0.
+COMMAND("config log name=num,type=CephInt,req=false",
+ "Show recent history of config changes",
+ "config", "r")
+COMMAND("config reset "
+ "name=num,type=CephInt,range=0",
+ "Revert configuration to a historical version specified by <num>",
+ "config", "rw")
+COMMAND("config generate-minimal-conf",
+ "Generate a minimal ceph.conf file",
+ "config", "r")
+
+
+
+
+// these are tell commands that were implemented as CLI commands in
+// the broken pre-octopus way that we want to allow to work when a
+// monitor has upgraded to octopus+ but the monmap min_mon_release is
+// still < octopus. we exclude things that weren't well supported
+// before and that aren't implemented by the octopus mon anymore.
+//
+// the command set below matches the kludge in Monitor::handle_command
+// that shunts these off to the asok machinery.
+//
+// every entry in this section is registered with FLAG(TELL) under the
+// "mon" module.
+
+COMMAND_WITH_FLAG("injectargs "
+ "name=injected_args,type=CephString,n=N",
+ "inject config arguments into monitor", "mon", "rw",
+ FLAG(TELL))
+COMMAND_WITH_FLAG("smart name=devid,type=CephString,req=false",
+ "Query health metrics for underlying device",
+ "mon", "rw",
+ FLAG(TELL))
+COMMAND_WITH_FLAG("mon_status",
+ "report status of monitors",
+ "mon", "r",
+ FLAG(TELL))
+COMMAND_WITH_FLAG("heap "
+ "name=heapcmd,type=CephChoices,strings=dump|start_profiler|stop_profiler|release|stats "
+ "name=value,type=CephString,req=false",
+ "show heap usage info (available only if compiled with tcmalloc)",
+ "mon", "rw",
+ FLAG(TELL))
+// the two `connection scores` commands are the only ones here that
+// require "rwx".
+COMMAND_WITH_FLAG("connection scores dump",
+ "show the scores used in connectivity-based elections",
+ "mon", "rwx",
+ FLAG(TELL))
+COMMAND_WITH_FLAG("connection scores reset",
+ "reset the scores used in connectivity-based elections",
+ "mon", "rwx",
+ FLAG(TELL))
+COMMAND_WITH_FLAG("sync_force "
+ "name=yes_i_really_mean_it,type=CephBool,req=false",
+ "force sync of and clear monitor store",
+ "mon", "rw",
+ FLAG(TELL))
+// add_bootstrap_peer_hint takes a single CephIPAddr; the `hintv` variant
+// takes the address vector as a plain CephString.
+COMMAND_WITH_FLAG("add_bootstrap_peer_hint "
+ "name=addr,type=CephIPAddr",
+ "add peer address as potential bootstrap "
+ "peer for cluster bringup",
+ "mon", "rw",
+ FLAG(TELL))
+COMMAND_WITH_FLAG("add_bootstrap_peer_hintv "
+ "name=addrv,type=CephString",
+ "add peer address vector as potential bootstrap "
+ "peer for cluster bringup",
+ "mon", "rw",
+ FLAG(TELL))
+COMMAND_WITH_FLAG("quorum enter ",
+ "force monitor back into quorum",
+ "mon", "rw",
+ FLAG(TELL))
+COMMAND_WITH_FLAG("quorum exit",
+ "force monitor out of the quorum",
+ "mon", "rw",
+ FLAG(TELL))
+COMMAND_WITH_FLAG("ops",
+ "show the ops currently in flight",
+ "mon", "r",
+ FLAG(TELL))
+COMMAND_WITH_FLAG("sessions",
+ "list existing sessions",
+ "mon", "r",
+ FLAG(TELL))
+COMMAND_WITH_FLAG("dump_historic_ops",
+ "show recent ops",
+ "mon", "r",
+ FLAG(TELL))
+COMMAND_WITH_FLAG("dump_historic_slow_ops",
+ "show recent slow ops",
+ "mon", "r",
+ FLAG(TELL))
diff --git a/src/mon/MonMap.cc b/src/mon/MonMap.cc
new file mode 100644
index 000000000..6eb37df17
--- /dev/null
+++ b/src/mon/MonMap.cc
@@ -0,0 +1,1028 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "MonMap.h"
+
+#include <algorithm>
+#include <sstream>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+#ifdef WITH_SEASTAR
+#include <seastar/core/fstream.hh>
+#include <seastar/core/reactor.hh>
+#include <seastar/net/dns.hh>
+#include "crimson/common/config_proxy.h"
+#endif
+
+#include "common/Formatter.h"
+
+#include "include/ceph_features.h"
+#include "include/addr_parsing.h"
+#include "common/ceph_argparse.h"
+#include "common/ceph_json.h"
+#include "common/dns_resolve.h"
+#include "common/errno.h"
+#include "common/dout.h"
+#include "common/Clock.h"
+#include "mon/health_check.h"
+
+using std::list;
+using std::map;
+using std::ostream;
+using std::ostringstream;
+using std::set;
+using std::string;
+using std::vector;
+
+using ceph::DNSResolver;
+using ceph::Formatter;
+
+#ifdef WITH_SEASTAR
+namespace {
+ // File-local accessor (only built when WITH_SEASTAR is defined) that
+ // returns the crimson logger for the ceph_subsys_monc subsystem.
+ seastar::logger& logger()
+ {
+ return crimson::get_logger(ceph_subsys_monc);
+ }
+}
+#endif
+
+// Serialize this monitor entry into bl.
+//
+// The struct version is 5 when `features` contains SERVER_NAUTILUS and 2
+// otherwise. For versions below 3 a single legacy address is written in
+// place of the full address vector.
+void mon_info_t::encode(ceph::buffer::list& bl, uint64_t features) const
+{
+  // crush_loc was added in version 5, but old clients still need to be able
+  // to decode the entry, so the compat version stays at 1 whether or not a
+  // location is set. Monitors are protected from misunderstandings about
+  // location because setting it is blocked on FEATURE_PINGING.
+  const uint8_t min_v = 1;
+  const uint8_t v = (HAVE_FEATURE(features, SERVER_NAUTILUS)) ? 5 : 2;
+
+  ENCODE_START(v, min_v, bl);
+  encode(name, bl);
+  if (v >= 3) {
+    encode(public_addrs, bl, features);
+  } else {
+    ceph_assert(min_v == 1);
+    auto legacy = public_addrs.legacy_addr();
+    // When there is no legacy addr, lie and emit one that merely looks
+    // legacy so that old clients get a valid-looking map. They won't be
+    // able to talk to the v2 mons, but that's better than nothing.
+    encode(legacy != entity_addr_t() ? legacy : public_addrs.as_legacy_addr(),
+           bl, features);
+  }
+  encode(priority, bl);
+  encode(weight, bl);
+  encode(crush_loc, bl);
+  ENCODE_FINISH(bl);
+}
+
+// Deserialize a monitor entry from p. Fields are read in the order they were
+// appended to the format; each later field is only read when the encoded
+// struct version is new enough (struct_v is presumably introduced by the
+// DECODE_START macro -- it is not declared in this function).
+void mon_info_t::decode(ceph::buffer::list::const_iterator& p)
+{
+ DECODE_START(5, p);
+ decode(name, p);
+ decode(public_addrs, p);
+ // priority: struct_v >= 2
+ if (struct_v >= 2) {
+ decode(priority, p);
+ }
+ // weight: struct_v >= 4
+ if (struct_v >= 4) {
+ decode(weight, p);
+ }
+ // crush_loc: struct_v >= 5
+ if (struct_v >= 5) {
+ decode(crush_loc, p);
+ }
+ DECODE_FINISH(p);
+}
+
+// Write a one-line, human-readable description of this monitor to out.
+void mon_info_t::print(ostream& out) const
+{
+  out << "mon." << name;
+  out << " addrs " << public_addrs;
+  out << " priority " << priority;
+  out << " weight " << weight;
+  out << " crush location " << crush_loc;
+}
+
+// Emit this monitor entry through the formatter f. The "addr" key carries
+// the streamed form of the whole public_addrs vector, and crush_loc is
+// written under "crush_location" via encode_json.
+void mon_info_t::dump(ceph::Formatter *f) const
+{
+ f->dump_string("name", name);
+ f->dump_stream("addr") << public_addrs;
+ f->dump_int("priority", priority);
+ f->dump_float("weight", weight);
+ encode_json("crush_location", crush_loc, f);
+}
+
+// Append two heap-allocated sample instances to ls: a default-constructed
+// one and a fully populated one. The caller takes ownership.
+void mon_info_t::generate_test_instances(list<mon_info_t*>& ls)
+{
+  ls.push_back(new mon_info_t);
+
+  mon_info_t *filled = new mon_info_t;
+  ls.push_back(filled);
+  filled->name = "noname";
+  filled->public_addrs.parse("v1:1.2.3.4:567/890");
+  filled->priority = 1;
+  filled->weight = 1.0;
+  filled->crush_loc.emplace("root", "default");
+  filled->crush_loc.emplace("host", "foo");
+}
+namespace {
+  // Strict ordering used to derive legacy ranks: compare by the monitor's
+  // legacy-or-front address, falling back to the name when the addresses
+  // are equal.
+  struct rank_cmp {
+    bool operator()(const mon_info_t &a, const mon_info_t &b) const {
+      const auto& lhs = a.public_addrs.legacy_or_front_addr();
+      const auto& rhs = b.public_addrs.legacy_or_front_addr();
+      return lhs == rhs ? a.name < b.name : lhs < rhs;
+    }
+  };
+}
+
+// Rebuild `ranks` (rank -> monitor name) from mon_info, ordered by rank_cmp.
+void MonMap::calc_legacy_ranks()
+{
+  ranks.resize(mon_info.size());
+
+  // Ranks are expected to be ordered by public address. The ordering lives
+  // in the comparator so that other criteria can be substituted later.
+  //
+  // A sorted set of whole mon_info_t entries is used rather than
+  // std::sort() on `ranks` itself, because `ranks` only stores names and
+  // the comparison needs the addresses. Doing this with std::sort() would
+  // mean copying every entry into a scratch vector, sorting it with a
+  // custom comparison and copying the names back out; that extra
+  // complexity is not considered worth the memory a set costs.
+  set<mon_info_t, rank_cmp> ordered;
+  for (const auto& [mon_name, info] : mon_info) {
+    ordered.insert(info);
+  }
+
+  // position in the ordered set is the rank
+  unsigned rank = 0;
+  for (const auto& info : ordered) {
+    ranks[rank] = info.name;
+    ++rank;
+  }
+}
+
+// Serialize the monmap into blist, choosing one of four wire layouts
+// depending on which features the peer advertises in con_features:
+//   - no MONNAMES:        version 1, monitors as a vector of entity_inst_t
+//   - no MONENC:          version 2, name -> legacy address map
+//   - no SERVER_NAUTILUS: ENCODE_START(5, 3), legacy address map + mon_info
+//   - otherwise:          ENCODE_START(9, 6), the current layout
+// The order of the encode() calls within each layout is the wire format and
+// must match MonMap::decode().
+void MonMap::encode(ceph::buffer::list& blist, uint64_t con_features) const
+{
+ if ((con_features & CEPH_FEATURE_MONNAMES) == 0) {
+ // version 1: monitors are identified by rank only
+ using ceph::encode;
+ __u16 v = 1;
+ encode(v, blist);
+ ceph::encode_raw(fsid, blist);
+ encode(epoch, blist);
+ vector<entity_inst_t> mon_inst(ranks.size());
+ for (unsigned n = 0; n < ranks.size(); n++) {
+ mon_inst[n].name = entity_name_t::MON(n);
+ mon_inst[n].addr = get_addrs(n).legacy_addr();
+ }
+ encode(mon_inst, blist, con_features);
+ encode(last_changed, blist);
+ encode(created, blist);
+ return;
+ }
+
+ // both pre-nautilus layouts below need a name -> legacy address map
+ map<string,entity_addr_t> legacy_mon_addr;
+ if (!HAVE_FEATURE(con_features, MONENC) ||
+ !HAVE_FEATURE(con_features, SERVER_NAUTILUS)) {
+ for (auto& [name, info] : mon_info) {
+ legacy_mon_addr[name] = info.public_addrs.legacy_addr();
+ }
+ }
+
+ if (!HAVE_FEATURE(con_features, MONENC)) {
+ /* we keep the mon_addr map when encoding to ensure compatibility
+ * with clients and other monitors that do not yet support the 'mons'
+ * map. This map keeps its original behavior, containing a mapping of
+ * monitor id (i.e., 'foo' in 'mon.foo') to the monitor's public
+ * address -- which is obtained from the public address of each entry
+ * in the 'mons' map.
+ */
+ using ceph::encode;
+ __u16 v = 2;
+ encode(v, blist);
+ ceph::encode_raw(fsid, blist);
+ encode(epoch, blist);
+ encode(legacy_mon_addr, blist, con_features);
+ encode(last_changed, blist);
+ encode(created, blist);
+ return;
+ }
+
+ if (!HAVE_FEATURE(con_features, SERVER_NAUTILUS)) {
+ // pre-nautilus peer: legacy address map plus the mon_info map; ranks
+ // are not written in this layout
+ ENCODE_START(5, 3, blist);
+ ceph::encode_raw(fsid, blist);
+ encode(epoch, blist);
+ encode(legacy_mon_addr, blist, con_features);
+ encode(last_changed, blist);
+ encode(created, blist);
+ encode(persistent_features, blist);
+ encode(optional_features, blist);
+ encode(mon_info, blist, con_features);
+ ENCODE_FINISH(blist);
+ return;
+ }
+
+ // current layout: no legacy address map; ranks and later additions are
+ // written explicitly
+ ENCODE_START(9, 6, blist);
+ ceph::encode_raw(fsid, blist);
+ encode(epoch, blist);
+ encode(last_changed, blist);
+ encode(created, blist);
+ encode(persistent_features, blist);
+ encode(optional_features, blist);
+ encode(mon_info, blist, con_features);
+ encode(ranks, blist);
+ encode(min_mon_release, blist);
+ encode(removed_ranks, blist);
+ // the election strategy is written as a single byte
+ uint8_t t = strategy;
+ encode(t, blist);
+ encode(disallowed_leaders, blist);
+ encode(stretch_mode_enabled, blist);
+ encode(tiebreaker_mon, blist);
+ encode(stretch_marked_down_mons, blist);
+ ENCODE_FINISH(blist);
+}
+
+// Deserialize a monmap from p, accepting every layout MonMap::encode() can
+// produce. Fields missing from older encodings are either reconstructed
+// (mon_info, ranks, min_mon_release) or reset to defaults (stretch-mode
+// fields). struct_v is presumably provided by the DECODE_START_* macro; it
+// is not declared in this function.
+void MonMap::decode(ceph::buffer::list::const_iterator& p)
+{
+ // only populated for encodings older than version 6
+ map<string,entity_addr_t> mon_addr;
+ DECODE_START_LEGACY_COMPAT_LEN_16(9, 3, 3, p);
+ ceph::decode_raw(fsid, p);
+ decode(epoch, p);
+ if (struct_v == 1) {
+ // version 1 carried no names: synthesize a one-character name from the
+ // index ('0' + i), which only yields a digit for i < 10
+ vector<entity_inst_t> mon_inst;
+ decode(mon_inst, p);
+ for (unsigned i = 0; i < mon_inst.size(); i++) {
+ char n[2];
+ n[0] = '0' + i;
+ n[1] = 0;
+ string name = n;
+ mon_addr[name] = mon_inst[i].addr;
+ }
+ } else if (struct_v < 6) {
+ decode(mon_addr, p);
+ }
+ decode(last_changed, p);
+ decode(created, p);
+ if (struct_v >= 4) {
+ decode(persistent_features, p);
+ decode(optional_features, p);
+ }
+ if (struct_v < 5) {
+ // generate mon_info from legacy mon_addr
+ for (auto& [name, addr] : mon_addr) {
+ mon_info_t &m = mon_info[name];
+ m.name = name;
+ m.public_addrs = entity_addrvec_t(addr);
+ }
+ } else {
+ decode(mon_info, p);
+ }
+ if (struct_v < 6) {
+ // ranks were not encoded before version 6; derive them from mon_info
+ calc_legacy_ranks();
+ } else {
+ decode(ranks, p);
+ }
+ if (struct_v >= 7) {
+ decode(min_mon_release, p);
+ } else {
+ min_mon_release = infer_ceph_release_from_mon_features(persistent_features);
+ }
+ if (struct_v >= 8) {
+ decode(removed_ranks, p);
+ // the election strategy travels as a single byte
+ uint8_t t;
+ decode(t, p);
+ strategy = static_cast<election_strategy>(t);
+ decode(disallowed_leaders, p);
+ }
+ if (struct_v >= 9) {
+ decode(stretch_mode_enabled, p);
+ decode(tiebreaker_mon, p);
+ decode(stretch_marked_down_mons, p);
+ } else {
+ // older encodings have no stretch-mode state: reset it explicitly
+ stretch_mode_enabled = false;
+ tiebreaker_mon = "";
+ stretch_marked_down_mons.clear();
+ }
+ calc_addr_mons();
+ DECODE_FINISH(p);
+}
+
+// Append three heap-allocated sample maps to o: an empty one, one holding a
+// single monitor with an empty address vector, and one mixing empty and
+// filled addresses. The caller takes ownership.
+void MonMap::generate_test_instances(list<MonMap*>& o)
+{
+  // 1) default-constructed
+  o.push_back(new MonMap);
+
+  // 2) a single monitor without any address
+  MonMap *single = new MonMap;
+  o.push_back(single);
+  single->epoch = 1;
+  single->last_changed = utime_t(123, 456);
+  single->created = utime_t(789, 101112);
+  single->add("one", entity_addrvec_t());
+
+  // 3) several monitors; the blank addresses differ only in their nonce
+  MonMap *mixed = new MonMap;
+  mixed->epoch = 1;
+  mixed->last_changed = utime_t(123, 456);
+
+  entity_addrvec_t blank_one = entity_addrvec_t(entity_addr_t());
+  blank_one.v[0].set_nonce(1);
+  mixed->add("empty_addr_one", blank_one);
+
+  entity_addrvec_t blank_two = entity_addrvec_t(entity_addr_t());
+  blank_two.v[0].set_nonce(2);
+  mixed->add("empty_addr_two", blank_two);
+
+  const char *pub_addr_text = "127.0.1.2";
+  const char *pub_addr_end = pub_addr_text + strlen(pub_addr_text);
+  entity_addrvec_t pub_addr;
+  pub_addr.parse(pub_addr_text, &pub_addr_end);
+  mixed->add(mon_info_t("filled_pub_addr", entity_addrvec_t(pub_addr), 1, 1));
+
+  mixed->add("empty_addr_zero", entity_addrvec_t());
+  o.push_back(mixed);
+}
+
+// read from/write to a file
+// Encode this map with every feature bit set (CEPH_FEATURES_ALL) and write
+// the result to the file fn. Returns whatever write_file() returns.
+int MonMap::write(const char *fn)
+{
+  ceph::buffer::list encoded;
+  encode(encoded, CEPH_FEATURES_ALL);
+  return encoded.write_file(fn);
+}
+
+int MonMap::read(const char *fn)
+{
+ // read
+ ceph::buffer::list bl;
+ std::string error;
+ int r = bl.read_file(fn, &error);
+ if (r < 0)
+ return r;
+ decode(bl);
+ return 0;
+}
+
+// Write a compact single-line summary of the map to out.
+void MonMap::print_summary(ostream& out) const
+{
+  out << "e" << epoch << ": "
+      << mon_info.size() << " mons at {";
+  // mon_info maps a name to a mon_info_t rather than straight to the public
+  // address as the map printed here once did, so the entries are written
+  // out by hand to keep the expected "name=addrs" format.
+  bool first = true;
+  for (const auto& [mon_name, info] : mon_info) {
+    if (!first) {
+      out << ",";
+    }
+    first = false;
+    out << mon_name << "=" << info.public_addrs;
+  }
+  out << "}" << " removed_ranks: {" << removed_ranks << "}";
+  out << " disallowed_leaders: {" << disallowed_leaders << "}";
+}
+
+// Write a multi-line, human-readable dump of the map to out: the header
+// fields first, then one line per monitor in rank order.
+void MonMap::print(ostream& out) const
+{
+  out << "epoch " << epoch << "\n"
+      << "fsid " << fsid << "\n"
+      << "last_changed " << last_changed << "\n"
+      << "created " << created << "\n"
+      << "min_mon_release " << to_integer<unsigned>(min_mon_release)
+      << " (" << min_mon_release << ")\n"
+      << "election_strategy: " << strategy << "\n";
+  if (stretch_mode_enabled) {
+    out << "stretch_mode_enabled " << stretch_mode_enabled << "\n";
+    out << "tiebreaker_mon " << tiebreaker_mon << "\n";
+  }
+  // the disallowed leaders are shown in stretch mode even when empty
+  if (stretch_mode_enabled || !disallowed_leaders.empty()) {
+    out << "disallowed_leaders " << disallowed_leaders << "\n";
+  }
+
+  unsigned rank = 0;
+  for (const auto& mon_name : ranks) {
+    const auto entry = mon_info.find(mon_name);
+    ceph_assert(entry != mon_info.end());
+    out << rank++ << ": " << entry->second.public_addrs << " mon." << mon_name;
+    if (!entry->second.crush_loc.empty()) {
+      out << "; crush_location " << entry->second.crush_loc;
+    }
+    out << "\n";
+  }
+}
+
+// Emit the whole map through the formatter f: header fields, a "features"
+// object, then a "mons" array with one object per monitor in rank order.
+void MonMap::dump(Formatter *f) const
+{
+ f->dump_unsigned("epoch", epoch);
+ f->dump_stream("fsid") << fsid;
+ last_changed.gmtime(f->dump_stream("modified"));
+ created.gmtime(f->dump_stream("created"));
+ f->dump_unsigned("min_mon_release", to_integer<unsigned>(min_mon_release));
+ f->dump_string("min_mon_release_name", to_string(min_mon_release));
+ f->dump_int ("election_strategy", strategy);
+ // NOTE(review): the key below (and "removed_ranks: " further down) is
+ // emitted with a trailing colon and space as part of the key name.
+ // Consumers may depend on that spelling, so it is left untouched.
+ f->dump_stream("disallowed_leaders: ") << disallowed_leaders;
+ f->dump_bool("stretch_mode", stretch_mode_enabled);
+ f->dump_string("tiebreaker_mon", tiebreaker_mon);
+ f->dump_stream("removed_ranks: ") << removed_ranks;
+ f->open_object_section("features");
+ persistent_features.dump(f, "persistent");
+ optional_features.dump(f, "optional");
+ f->close_section();
+ f->open_array_section("mons");
+ int i = 0;
+ for (auto p = ranks.begin(); p != ranks.end(); ++p, ++i) {
+ f->open_object_section("mon");
+ f->dump_int("rank", i);
+ f->dump_string("name", *p);
+ f->dump_object("public_addrs", get_addrs(*p));
+ // compat: make these look like pre-nautilus entity_addr_t
+ // ("addr" and "public_addr" carry the same legacy string)
+ f->dump_stream("addr") << get_addrs(*p).get_legacy_str();
+ f->dump_stream("public_addr") << get_addrs(*p).get_legacy_str();
+ f->dump_unsigned("priority", get_priority(*p));
+ f->dump_unsigned("weight", get_weight(*p));
+ const auto &mi = mon_info.find(*p);
+ // we don't need to assert this validity as all the get_* functions did
+ f->dump_stream("crush_location") << mi->second.crush_loc;
+ f->close_section();
+ }
+ f->close_section();
+}
+
+// Emit a three-field summary through the formatter f: the epoch, the name
+// of min_mon_release, and the number of monitors (taken from ranks.size()).
+void MonMap::dump_summary(Formatter *f) const
+{
+ f->dump_unsigned("epoch", epoch);
+ f->dump_string("min_mon_release_name", to_string(min_mon_release));
+ f->dump_unsigned("num_mons", ranks.size());
+}
+
+// An ambiguous mon addr may be legacy or may be msgr2 -- we aren't sure.
+// When that happens both are tried, unless the port number lets us
+// reasonably infer which one it is. The legacy candidate is registered as
+// "<name>-legacy", except for mkfs, where only entries called <name> are
+// ever created. Addresses already present in the map are not added again.
+void MonMap::_add_ambiguous_addr(const string& name,
+                                 entity_addr_t addr,
+                                 int priority,
+                                 int weight,
+                                 bool for_mkfs)
+{
+  // register a single-address monitor unless the address is already known
+  auto add_if_absent = [&](const string& mon_name, const entity_addr_t& a) {
+    if (!contains(a)) {
+      add(mon_name, entity_addrvec_t(a), priority, weight);
+    }
+  };
+  const string legacy_name = name + "-legacy";
+
+  if (addr.get_type() != entity_addr_t::TYPE_ANY) {
+    // a v1: or v2: prefix was specified
+    if (addr.get_port() == 0) {
+      // use the default port of the requested protocol
+      if (addr.get_type() == entity_addr_t::TYPE_LEGACY) {
+        addr.set_port(CEPH_MON_PORT_LEGACY);
+      } else if (addr.get_type() == entity_addr_t::TYPE_MSGR2) {
+        addr.set_port(CEPH_MON_PORT_IANA);
+      } else {
+        // neither legacy nor msgr2: nothing sensible to add
+        return;
+      }
+    }
+    add_if_absent(name, addr);
+    return;
+  }
+
+  // no v1: or v2: prefix specified; let the port decide where it can
+  const auto port = addr.get_port();
+  if (port == CEPH_MON_PORT_LEGACY) {
+    // legacy port implies legacy addr
+    addr.set_type(entity_addr_t::TYPE_LEGACY);
+    add_if_absent(for_mkfs ? name : legacy_name, addr);
+  } else if (port == CEPH_MON_PORT_IANA) {
+    // iana port implies msgr2 addr
+    addr.set_type(entity_addr_t::TYPE_MSGR2);
+    add_if_absent(name, addr);
+  } else if (port == 0) {
+    // no port; include both msgr2 and legacy ports
+    if (for_mkfs) {
+      // mkfs: one monitor carrying both addresses
+      entity_addrvec_t av;
+      addr.set_type(entity_addr_t::TYPE_MSGR2);
+      addr.set_port(CEPH_MON_PORT_IANA);
+      av.v.push_back(addr);
+      addr.set_type(entity_addr_t::TYPE_LEGACY);
+      addr.set_port(CEPH_MON_PORT_LEGACY);
+      av.v.push_back(addr);
+      if (!contains(av)) {
+        add(name, av, priority, weight);
+      }
+    } else {
+      // otherwise: two separate candidates
+      addr.set_type(entity_addr_t::TYPE_MSGR2);
+      addr.set_port(CEPH_MON_PORT_IANA);
+      add_if_absent(name, addr);
+      addr.set_type(entity_addr_t::TYPE_LEGACY);
+      addr.set_port(CEPH_MON_PORT_LEGACY);
+      add_if_absent(legacy_name, addr);
+    }
+  } else {
+    // non-standard port: assume msgr2 ...
+    addr.set_type(entity_addr_t::TYPE_MSGR2);
+    add_if_absent(name, addr);
+    if (!for_mkfs) {
+      // ... and try legacy on the same port too
+      addr.set_type(entity_addr_t::TYPE_LEGACY);
+      add_if_absent(legacy_name, addr);
+    }
+  }
+}
+
+// Add one monitor per entry of addrs. Each monitor is named prefix followed
+// by a single character that starts at 'a' and is incremented per entry.
+void MonMap::init_with_addrs(const std::vector<entity_addrvec_t>& addrs,
+                             bool for_mkfs,
+                             std::string_view prefix)
+{
+  char id = 'a';
+  for (const auto& addr : addrs) {
+    string name{prefix};
+    name.push_back(id);
+    ++id;
+    if (addr.v.size() != 1) {
+      // they specified an addrvec, so let's assume they also specified
+      // the addr *type* and *port*. (we could possibly improve this?)
+      add(name, addr, 0);
+      continue;
+    }
+    // a lone address may still need its type and port worked out
+    _add_ambiguous_addr(name, addr.front(), 0, 0, for_mkfs);
+  }
+}
+
+// Populate the map from a list of IP[:port] entries in ips.
+// Returns -EINVAL when the list cannot be parsed, -ENOENT when it parses to
+// nothing, and 0 once the monitors have been added.
+int MonMap::init_with_ips(const std::string& ips,
+                          bool for_mkfs,
+                          std::string_view prefix)
+{
+  vector<entity_addrvec_t> addrs;
+  const bool parsed = parse_ip_port_vec(ips.c_str(), addrs,
+                                        entity_addr_t::TYPE_ANY);
+  if (!parsed) {
+    return -EINVAL;
+  }
+  if (addrs.empty()) {
+    return -ENOENT;
+  }
+  init_with_addrs(addrs, for_mkfs, prefix);
+  return 0;
+}
+
+// Populate the map from a list of host names: resolve them, add a monitor
+// for each resulting address, then recompute the legacy ranks.
+// Returns -EINVAL when resolution or parsing fails, -ENOENT when no address
+// results, and 0 on success.
+int MonMap::init_with_hosts(const std::string& hostlist,
+                            bool for_mkfs,
+                            std::string_view prefix)
+{
+  // maybe they passed us a DNS-resolvable name
+  char *resolved = resolve_addrs(hostlist.c_str());
+  if (resolved == nullptr) {
+    return -EINVAL;
+  }
+
+  vector<entity_addrvec_t> addrs;
+  const bool parsed = parse_ip_port_vec(resolved, addrs,
+                                        entity_addr_t::TYPE_ANY);
+  // the resolved string is released with free() as soon as it is parsed
+  free(resolved);
+  if (!parsed) {
+    return -EINVAL;
+  }
+  if (addrs.empty()) {
+    return -ENOENT;
+  }
+
+  init_with_addrs(addrs, for_mkfs, prefix);
+  calc_legacy_ranks();
+  return 0;
+}
+
+// Make the map's membership equal to initial_members.
+//
+// Monitors not named in initial_members are removed; when `removed` is
+// non-null their address vectors are inserted into it first. Named members
+// that are missing are added: this monitor (my_name) with my_addrs, any
+// other one with a placeholder legacy AF_INET address made unique by its
+// nonce. The legacy ranks are recomputed at the end.
+void MonMap::set_initial_members(CephContext *cct,
+                                 list<std::string>& initial_members,
+                                 string my_name,
+                                 const entity_addrvec_t& my_addrs,
+                                 set<entity_addrvec_t> *removed)
+{
+  // remove non-initial members; the rank only advances past entries that
+  // are kept, because removing one shifts the following entries down
+  unsigned i = 0;
+  while (i < size()) {
+    const string n = get_name(i);
+    const bool is_initial =
+      std::find(initial_members.begin(), initial_members.end(), n)
+      != initial_members.end();
+    if (is_initial) {
+      lgeneric_dout(cct, 1) << " keeping " << n << " " << get_addrs(i) << dendl;
+      ++i;
+    } else {
+      lgeneric_dout(cct, 1) << " removing " << get_name(i) << " " << get_addrs(i)
+                            << dendl;
+      if (removed) {
+        removed->insert(get_addrs(i));
+      }
+      remove(n);
+      ceph_assert(!contains(n));
+    }
+  }
+
+  // add missing initial members
+  for (const auto& member : initial_members) {
+    if (contains(member)) {
+      continue;
+    }
+    if (member == my_name) {
+      lgeneric_dout(cct, 1) << " adding self " << member << " " << my_addrs
+                            << dendl;
+      add(member, my_addrs);
+    } else {
+      // pick the first nonce (counting up from 1) that yields an address
+      // not yet present in the map
+      entity_addr_t a;
+      a.set_type(entity_addr_t::TYPE_LEGACY);
+      a.set_family(AF_INET);
+      int nonce = 1;
+      a.set_nonce(nonce);
+      while (contains(a)) {
+        a.set_nonce(++nonce);
+      }
+      lgeneric_dout(cct, 1) << " adding " << member << " " << a << dendl;
+      add(member, entity_addrvec_t(a));
+    }
+    ceph_assert(contains(member));
+  }
+  calc_legacy_ranks();
+}
+
+ // Add one monitor per [mon.<name>] section of the configuration file,
+ // using its 'mon addr', 'mon priority' and 'mon weight' values.
+ // A monitor whose values are missing or unparsable is reported on `errout`
+ // and skipped. Returns -ENOENT when the sections cannot be listed, else 0.
+ int MonMap::init_with_config_file(const ConfigProxy& conf,
+                                   std::ostream& errout)
+ {
+   std::vector<std::string> all_sections;
+   if (int ret = conf.get_all_sections(all_sections); ret) {
+     errout << "Unable to find any monitors in the configuration "
+            << "file, because there was an error listing the sections. error "
+            << ret << std::endl;
+     return -ENOENT;
+   }
+
+   // collect <name> from every "mon.<name>" section
+   std::vector<std::string> mon_names;
+   for (const auto& section : all_sections) {
+     if (section.size() > 4 && section.compare(0, 4, "mon.") == 0) {
+       mon_names.push_back(section.substr(4));
+     }
+   }
+
+   // Find an address for each monitor in the config file.
+   for (const auto& mon_name : mon_names) {
+     // lookup order: the mon's own section, then [mon], then [global]
+     std::string own_section("mon");
+     own_section += ".";
+     own_section += mon_name;
+     std::vector<std::string> lookup{own_section, "mon", "global"};
+
+     std::string val;
+     if (int res = conf.get_val_from_conf_file(lookup, "mon addr", val, true);
+         res) {
+       errout << "failed to get an address for mon." << mon_name
+              << ": error " << res << std::endl;
+       continue;
+     }
+     // the 'mon addr' field is a legacy field, so assume anything
+     // there on a weird port is a v1 address, and do not handle
+     // addrvecs.
+     entity_addr_t mon_addr;
+     if (!mon_addr.parse(val, entity_addr_t::TYPE_LEGACY)) {
+       errout << "unable to parse address for mon." << mon_name
+              << ": addr='" << val << "'" << std::endl;
+       continue;
+     }
+     if (mon_addr.get_port() == 0) {
+       mon_addr.set_port(CEPH_MON_PORT_LEGACY);
+     }
+
+     uint16_t priority = 0;
+     if (!conf.get_val_from_conf_file(lookup, "mon priority", val, false)) {
+       try {
+         priority = std::stoul(val);
+       } catch (std::logic_error&) {
+         errout << "unable to parse priority for mon." << mon_name
+                << ": priority='" << val << "'" << std::endl;
+         continue;
+       }
+     }
+
+     uint16_t weight = 0;
+     if (!conf.get_val_from_conf_file(lookup, "mon weight", val, false)) {
+       try {
+         weight = std::stoul(val);
+       } catch (std::logic_error&) {
+         errout << "unable to parse weight for mon." << mon_name
+                << ": weight='" << val << "'"
+                << std::endl;
+         continue;
+       }
+     }
+
+     // make sure this mon isn't already in the map
+     if (contains(mon_addr)) {
+       remove(get_name(mon_addr));
+     }
+     if (contains(mon_name)) {
+       remove(mon_name);
+     }
+     _add_ambiguous_addr(mon_name, mon_addr, priority, weight, false);
+   }
+   return 0;
+ }
+
+ // In stretch mode, raise MON_LOCATION_NOT_SET (HEALTH_WARN) listing every
+ // monitor whose crush_loc is empty. Does nothing outside stretch mode.
+ void MonMap::check_health(health_check_map_t *checks) const
+ {
+   if (!stretch_mode_enabled) {
+     return;
+   }
+
+   list<string> detail;
+   for (auto& [name, info] : mon_info) {
+     if (!info.crush_loc.empty()) {
+       continue;
+     }
+     ostringstream line;
+     line << "mon " << name << " has no location set while in stretch mode";
+     detail.push_back(line.str());
+   }
+   if (detail.empty()) {
+     return;
+   }
+
+   ostringstream summary;
+   summary << detail.size() << " monitor(s) have no location set while in stretch mode"
+           << "; this may cause issues with failover, OSD connections, netsplit handling, etc";
+   auto& check = checks->add("MON_LOCATION_NOT_SET", HEALTH_WARN,
+                             summary.str(), detail.size());
+   check.detail.swap(detail);
+ }
+
+#ifdef WITH_SEASTAR
+
+ // Asynchronously load an encoded monmap from the file at path `monmap`
+ // and decode it into this object. The returned future resolves once
+ // decode() has run.
+ seastar::future<> MonMap::read_monmap(const std::string& monmap)
+ {
+ using namespace seastar;
+ // open read-only, then ask for the file size so the whole file can be
+ // requested in a single read_exactly() call
+ return open_file_dma(monmap, open_flags::ro).then([this] (file f) {
+ return f.size().then([this, f = std::move(f)](size_t s) {
+ // do_with() holds the input stream for the duration of the read
+ return do_with(make_file_input_stream(f), [this, s](input_stream<char>& in) {
+ return in.read_exactly(s).then([this](temporary_buffer<char> buf) {
+ // wrap the seastar buffer in a bufferlist and decode it
+ ceph::buffer::list bl;
+ bl.push_back(ceph::buffer::ptr_node::create(
+ ceph::buffer::create(std::move(buf))));
+ decode(bl);
+ });
+ });
+ });
+ });
+ }
+
+ // Populate the map from DNS SRV records (seastar variant).
+ // `name` is either "<service>" or "<service>_<domain>"; it is split at the
+ // first '_'. Resolution failures of type std::system_error are logged and
+ // otherwise ignored, so the returned future still resolves successfully.
+ // NOTE(review): for_mkfs is only logged here; _add_ambiguous_addr is always
+ // called with `false` -- confirm that is intended.
+ seastar::future<> MonMap::init_with_dns_srv(bool for_mkfs, const std::string& name)
+ {
+ logger().debug("{}: for_mkfs={}, name={}", __func__, for_mkfs, name);
+ string domain;
+ string service = name;
+ // check if domain is also provided and extract it from srv_name
+ size_t idx = name.find("_");
+ if (idx != name.npos) {
+ domain = name.substr(idx + 1);
+ service = name.substr(0, idx);
+ }
+ return seastar::net::dns::get_srv_records(
+ seastar::net::dns_resolver::srv_proto::tcp,
+ service, domain).then([this](seastar::net::dns_resolver::srv_records records) {
+ // resolve every SRV target and add one monitor per resolved address
+ return seastar::parallel_for_each(records, [this](auto record) {
+ return seastar::net::dns::resolve_name(record.target).then(
+ [record,this](seastar::net::inet_address a) {
+ // the resolved address does not contain ceph specific info like
+ // nonce or msgr proto (legacy, msgr2), so set entity_addr_t manually
+ entity_addr_t addr;
+ addr.set_type(entity_addr_t::TYPE_ANY);
+ addr.set_family(int(a.in_family()));
+ addr.set_port(record.port);
+ switch (a.in_family()) {
+ case seastar::net::inet_address::family::INET:
+ addr.in4_addr().sin_addr = a;
+ break;
+ case seastar::net::inet_address::family::INET6:
+ addr.in6_addr().sin6_addr = a;
+ break;
+ }
+ // the SRV target is used as the monitor name
+ _add_ambiguous_addr(record.target,
+ addr,
+ record.priority,
+ record.weight,
+ false);
+ }).handle_exception_type([t=record.target](const std::system_error& e) {
+ // a single unresolvable target does not fail the whole lookup
+ logger().debug("{}: unable to resolve name for {}: {}",
+ "init_with_dns_srv", t, e);
+ });
+ });
+ }).handle_exception_type([name](const std::system_error& e) {
+ logger().debug("{}: unable to get monitor info from DNS SRV with {}: {}",
+ "init_with_dns_srv", name, e);
+ // ignore DNS failures
+ return seastar::make_ready_future<>();
+ });
+ }
+
+ // Try to initialise the map from `mon_host`, first as a list of ips and
+ // then as a list of host names.
+ // Returns false when `mon_host` is empty, true when either attempt
+ // succeeded; throws std::runtime_error when both attempts fail.
+ bool MonMap::maybe_init_with_mon_host(const std::string& mon_host,
+                                       const bool for_mkfs)
+ {
+   if (mon_host.empty()) {
+     return false;
+   }
+   if (init_with_ips(mon_host, for_mkfs, "noname-") == 0) {
+     return true;
+   }
+   // TODO: resolve_addrs() is a blocking call
+   const auto err = init_with_hosts(mon_host, for_mkfs, "noname-");
+   if (err != 0) {
+     throw std::runtime_error(cpp_strerror(err));
+   }
+   return true;
+ }
+
+ // Build the map from, in order: the mon_host option, the [mon.*] sections
+ // of the config file, and finally DNS SRV records. Throws
+ // std::runtime_error when the config file cannot be processed, and when no
+ // monitor is known after the DNS lookup.
+ seastar::future<> MonMap::build_monmap(const crimson::common::ConfigProxy& conf,
+                                        bool for_mkfs)
+ {
+   logger().debug("{}: for_mkfs={}", __func__, for_mkfs);
+   // -m foo?
+   const bool from_mon_host =
+     maybe_init_with_mon_host(conf.get_val<std::string>("mon_host"), for_mkfs);
+   if (from_mon_host) {
+     return seastar::make_ready_future<>();
+   }
+
+   // What monitors are in the config file?
+   ostringstream errout;
+   const auto ret = init_with_config_file(conf, errout);
+   if (ret < 0) {
+     throw std::runtime_error(errout.str());
+   }
+   if (size() > 0) {
+     return seastar::make_ready_future<>();
+   }
+
+   // no info found from conf options lets try use DNS SRV records
+   const string srv_name = conf.get_val<std::string>("mon_dns_srv_name");
+   return init_with_dns_srv(for_mkfs, srv_name).then([this] {
+     if (size() == 0) {
+       throw std::runtime_error("no monitors specified to connect to.");
+     }
+   });
+ }
+
+ // Build the initial monmap (seastar variant). Sources, in order:
+ // mon_host_override, an explicit monmap file, then build_monmap().
+ // Only the build_monmap() path stamps created/last_changed and
+ // recalculates the legacy ranks.
+ seastar::future<> MonMap::build_initial(const crimson::common::ConfigProxy& conf, bool for_mkfs)
+ {
+   // mon_host_override?
+   const bool overridden = maybe_init_with_mon_host(
+     conf.get_val<std::string>("mon_host_override"), for_mkfs);
+   if (overridden) {
+     return seastar::make_ready_future<>();
+   }
+
+   // file?
+   const auto monmap = conf.get_val<std::string>("monmap");
+   if (!monmap.empty()) {
+     return read_monmap(monmap);
+   }
+
+   // fsid from conf?
+   const auto new_fsid = conf.get_val<uuid_d>("fsid");
+   if (!new_fsid.is_zero()) {
+     fsid = new_fsid;
+   }
+   return build_monmap(conf, for_mkfs).then([this] {
+     created = ceph_clock_now();
+     last_changed = created;
+     calc_legacy_ranks();
+   });
+ }
+
+#else // WITH_SEASTAR
+
+ // Load this map from the encoded monmap file at path `monmap`.
+ // Returns 0 on success; on failure writes a message to `errout` and
+ // returns the negative error (-EINVAL for a decode error).
+ int MonMap::init_with_monmap(const std::string& monmap, std::ostream& errout)
+ {
+   int rc = -EINVAL;
+   try {
+     rc = read(monmap.c_str());
+   } catch (ceph::buffer::error&) {
+     // the file was read but its contents could not be decoded
+     rc = -EINVAL;
+   }
+   if (rc < 0) {
+     errout << "unable to read/decode monmap from " << monmap
+            << ": " << cpp_strerror(-rc) << std::endl;
+     return rc;
+   }
+   return 0;
+ }
+
+ // Populate the map from DNS SRV records.
+ //
+ // @param cct      context, used for logging and passed to the resolver
+ // @param srv_name "<service>" or "<service>_<domain>"; split at the first '_'
+ // @param for_mkfs currently unused by this function
+ //                 NOTE(review): _add_ambiguous_addr is always called with
+ //                 `false` -- confirm that is intended
+ // @param errout   stream that receives the failure message
+ // @return 0 on success, -1 when the SRV lookup fails
+ int MonMap::init_with_dns_srv(CephContext* cct,
+                               std::string srv_name,
+                               bool for_mkfs,
+                               std::ostream& errout)
+ {
+   lgeneric_dout(cct, 1) << __func__ << " srv_name: " << srv_name << dendl;
+
+   string domain;
+   // check if domain is also provided and extract it from srv_name
+   size_t idx = srv_name.find("_");
+   if (idx != string::npos) {
+     domain = srv_name.substr(idx + 1);
+     srv_name = srv_name.substr(0, idx);
+   }
+
+   map<string, DNSResolver::Record> records;
+   if (DNSResolver::get_instance()->resolve_srv_hosts(cct, srv_name,
+         DNSResolver::SRV_Protocol::TCP, domain, &records) != 0) {
+     // report the service name that was actually queried rather than a
+     // hard-coded "ceph-mon", which is wrong when the name is customised
+     errout << "unable to get monitor info from DNS SRV with service name: "
+            << srv_name << std::endl;
+     return -1;
+   }
+
+   // the map key is used as the monitor name
+   for (auto& [name, record] : records) {
+     record.addr.set_type(entity_addr_t::TYPE_ANY);
+     _add_ambiguous_addr(name,
+                         record.addr,
+                         record.priority,
+                         record.weight,
+                         false);
+   }
+   return 0;
+ }
+
+ // Build the initial monmap (non-seastar variant).
+ //
+ // Sources are tried in this order, stopping at the first that applies:
+ //   1. mon_host_override (ips first, then host names)
+ //   2. monitor addresses already held by the CephContext
+ //   3. an explicit monmap file
+ //   4. mon_host, then [mon.*] config sections, then DNS SRV records
+ // Only path 4 sets the election strategy and created/last_changed and
+ // recalculates the legacy ranks here; paths 1-3 return earlier.
+ //
+ // @return 0 on success, a negative error otherwise (details on errout)
+ int MonMap::build_initial(CephContext *cct, bool for_mkfs, ostream& errout)
+ {
+ lgeneric_dout(cct, 1) << __func__ << " for_mkfs: " << for_mkfs << dendl;
+ const auto& conf = cct->_conf;
+
+ // mon_host_override?
+ auto mon_host_override = conf.get_val<std::string>("mon_host_override");
+ if (!mon_host_override.empty()) {
+ lgeneric_dout(cct, 1) << "Using mon_host_override " << mon_host_override << dendl;
+ auto ret = init_with_ips(mon_host_override, for_mkfs, "noname-");
+ // only a parse failure falls back to treating the value as host names
+ if (ret == -EINVAL) {
+ ret = init_with_hosts(mon_host_override, for_mkfs, "noname-");
+ }
+ if (ret < 0) {
+ errout << "unable to parse addrs in '" << mon_host_override << "'"
+ << std::endl;
+ }
+ // the override is authoritative: return its result, success or not
+ return ret;
+ }
+
+ // cct?
+ auto addrs = cct->get_mon_addrs();
+ if (addrs != nullptr && (addrs->size() > 0)) {
+ init_with_addrs(*addrs, for_mkfs, "noname-");
+ return 0;
+ }
+
+ // file?
+ if (const auto monmap = conf.get_val<std::string>("monmap");
+ !monmap.empty()) {
+ return init_with_monmap(monmap, errout);
+ }
+
+ // fsid from conf?
+ if (const auto new_fsid = conf.get_val<uuid_d>("fsid");
+ !new_fsid.is_zero()) {
+ fsid = new_fsid;
+ }
+ // -m foo?
+ if (const auto mon_host = conf.get_val<std::string>("mon_host");
+ !mon_host.empty()) {
+ auto ret = init_with_ips(mon_host, for_mkfs, "noname-");
+ if (ret == -EINVAL) {
+ ret = init_with_hosts(mon_host, for_mkfs, "noname-");
+ }
+ if (ret < 0) {
+ errout << "unable to parse addrs in '" << mon_host << "'"
+ << std::endl;
+ return ret;
+ }
+ }
+ // each remaining source is consulted only while the map is still empty
+ if (size() == 0) {
+ // What monitors are in the config file?
+ if (auto ret = init_with_config_file(conf, errout); ret < 0) {
+ return ret;
+ }
+ }
+ if (size() == 0) {
+ // no info found from conf options lets try use DNS SRV records
+ string srv_name = conf.get_val<std::string>("mon_dns_srv_name");
+ // any DNS SRV failure is reported to the caller as -ENOENT
+ if (auto ret = init_with_dns_srv(cct, srv_name, for_mkfs, errout); ret < 0) {
+ return -ENOENT;
+ }
+ }
+ if (size() == 0) {
+ errout << "no monitors specified to connect to." << std::endl;
+ return -ENOENT;
+ }
+ strategy = static_cast<election_strategy>(conf.get_val<uint64_t>("mon_election_default_strategy"));
+ created = ceph_clock_now();
+ last_changed = created;
+ calc_legacy_ranks();
+ return 0;
+ }
+#endif // WITH_SEASTAR
diff --git a/src/mon/MonMap.h b/src/mon/MonMap.h
new file mode 100644
index 000000000..5bd72b1d9
--- /dev/null
+++ b/src/mon/MonMap.h
@@ -0,0 +1,550 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MONMAP_H
+#define CEPH_MONMAP_H
+
+#ifdef WITH_SEASTAR
+#include <seastar/core/future.hh>
+#endif
+
+#include "common/config_fwd.h"
+#include "common/ceph_releases.h"
+
+#include "include/err.h"
+#include "include/types.h"
+
+#include "mon/mon_types.h"
+#include "msg/Message.h"
+
+class health_check_map_t;
+
+#ifdef WITH_SEASTAR
+namespace crimson::common {
+ class ConfigProxy;
+}
+#endif
+
+namespace ceph {
+ class Formatter;
+}
+
+ /**
+  * Per-monitor record stored in a MonMap: name, public address(es),
+  * priority, weight and CRUSH location.
+  */
+ struct mon_info_t {
+ /**
+ * monitor name
+ *
+ * i.e., 'foo' in 'mon.foo'
+ */
+ std::string name;
+ /**
+ * monitor's public address(es)
+ *
+ * public facing address(es), used to communicate with all clients
+ * and with other monitors.
+ */
+ entity_addrvec_t public_addrs;
+ /**
+ * the priority of the mon, the lower value the more preferred
+ */
+ uint16_t priority{0};
+ /**
+ * the weight of the mon; defaults to 0
+ * (NOTE(review): how the weight is consumed is not visible here)
+ */
+ uint16_t weight{0};
+
+ /**
+ * The location of the monitor, in CRUSH hierarchy terms
+ */
+ std::map<std::string,std::string> crush_loc;
+
+ // <REMOVE ME>
+ // single-address constructor; weight keeps its default of 0
+ mon_info_t(const std::string& n, const entity_addr_t& p_addr, uint16_t p)
+ : name(n), public_addrs(p_addr), priority(p)
+ {}
+ // </REMOVE ME>
+
+ /// construct with explicit name, addresses, priority and weight
+ mon_info_t(const std::string& n, const entity_addrvec_t& p_addrs,
+ uint16_t p, uint16_t w)
+ : name(n), public_addrs(p_addrs), priority(p), weight(w)
+ {}
+ /// construct with name and addresses; priority and weight default to 0
+ mon_info_t(const std::string &n, const entity_addrvec_t& p_addrs)
+ : name(n), public_addrs(p_addrs)
+ { }
+
+ /// empty record: blank name and addresses, priority and weight 0
+ mon_info_t() { }
+
+
+ void encode(ceph::buffer::list& bl, uint64_t features) const;
+ void decode(ceph::buffer::list::const_iterator& p);
+ void print(std::ostream& out) const;
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<mon_info_t*>& ls);
+ };
+WRITE_CLASS_ENCODER_FEATURES(mon_info_t)
+
+ /// Stream a mon_info_t by delegating to mon_info_t::print().
+ inline std::ostream& operator<<(std::ostream& out, const mon_info_t& mon) {
+ mon.print(out);
+ return out;
+ }
+
+ /**
+  * The monitor map: the set of monitors in the cluster, their addresses
+  * and ranks, plus map-wide state such as features, election strategy and
+  * stretch-mode settings.
+  */
+ class MonMap {
+  public:
+   epoch_t epoch;       // what epoch/version of the monmap
+   uuid_d fsid;
+   utime_t last_changed;
+   utime_t created;
+
+   /// monitor name -> monitor record
+   std::map<std::string, mon_info_t> mon_info;
+   /// public address -> monitor name; rebuilt by calc_addr_mons()
+   std::map<entity_addr_t, std::string> addr_mons;
+
+   /// monitor names indexed by rank
+   std::vector<std::string> ranks;
+   /* ranks which were removed when this map took effect.
+      There should only be one at a time, but leave support
+      for arbitrary numbers just to be safe. */
+   std::set<unsigned> removed_ranks;
+
+   /**
+    * Persistent Features are all those features that once set on a
+    * monmap cannot, and should not, be removed. These will define the
+    * non-negotiable features that a given monitor must support to
+    * properly operate in a given quorum.
+    *
+    * Should be reserved for features that we really want to make sure
+    * are sticky, and are important enough to tolerate not being able
+    * to downgrade a monitor.
+    */
+   mon_feature_t persistent_features;
+   /**
+    * Optional Features are all those features that can be enabled or
+    * disabled following a given criteria -- e.g., user-mandated via the
+    * cli --, and act much like indicators of what the cluster currently
+    * supports.
+    *
+    * They are by no means "optional" in the sense that monitors can
+    * ignore them. Just that they are not persistent.
+    */
+   mon_feature_t optional_features;
+
+   /**
+    * Returns the set of features required by this monmap.
+    *
+    * The features required by this monmap is the union of all the
+    * currently set persistent features and the currently set optional
+    * features.
+    *
+    * @returns the set of features required by this monmap
+    */
+   mon_feature_t get_required_features() const {
+     return (persistent_features | optional_features);
+   }
+
+   // upgrade gate
+   ceph_release_t min_mon_release{ceph_release_t::unknown};
+
+   void _add_ambiguous_addr(const std::string& name,
+                            entity_addr_t addr,
+                            int priority,
+                            int weight,
+                            bool for_mkfs);
+
+   enum election_strategy {
+     // Keep in sync with ElectionLogic.h!
+     CLASSIC = 1,      // the original rank-based one
+     DISALLOW = 2,     // disallow a set from being leader
+     CONNECTIVITY = 3  // includes DISALLOW, extends to prefer stronger connections
+   };
+   election_strategy strategy = CLASSIC;
+   std::set<std::string> disallowed_leaders; // can't be leader under CONNECTIVITY/DISALLOW
+   bool stretch_mode_enabled = false;
+   std::string tiebreaker_mon;
+   std::set<std::string> stretch_marked_down_mons; // can't be leader until fully recovered
+
+ public:
+   void calc_legacy_ranks();
+   /// rebuild addr_mons from the public addresses in mon_info
+   void calc_addr_mons() {
+     // populate addr_mons
+     addr_mons.clear();
+     for (auto& p : mon_info) {
+       for (auto& a : p.second.public_addrs.v) {
+         addr_mons[a] = p.first;
+       }
+     }
+   }
+
+   MonMap()
+     : epoch(0) {
+   }
+
+   uuid_d& get_fsid() { return fsid; }
+
+   /// number of monitors in the map
+   unsigned size() const {
+     return mon_info.size();
+   }
+
+   /**
+    * Smallest majority of @p total_mons monitors.
+    *
+    * @param total_mons monitor count to use; 0 means size()
+    * @returns total_mons / 2 + 1
+    */
+   unsigned min_quorum_size(unsigned total_mons=0) const {
+     if (total_mons == 0) {
+       total_mons = size();
+     }
+     return total_mons / 2 + 1;
+   }
+
+   epoch_t get_epoch() const { return epoch; }
+   void set_epoch(epoch_t e) { epoch = e; }
+
+   /**
+    * Obtain list of public facing addresses
+    *
+    * @param ls list to populate with the monitors' addresses
+    */
+   void list_addrs(std::list<entity_addr_t>& ls) const {
+     for (auto& i : mon_info) {
+       for (auto& j : i.second.public_addrs.v) {
+         ls.push_back(j);
+       }
+     }
+   }
+
+   /**
+    * Add new monitor to the monmap
+    *
+    * Asserts that neither the name nor any of the addresses is already
+    * present in the map.
+    *
+    * @param m monitor info of the new monitor
+    */
+   void add(const mon_info_t& m) {
+     ceph_assert(mon_info.count(m.name) == 0);
+     for (auto& a : m.public_addrs.v) {
+       ceph_assert(addr_mons.count(a) == 0);
+     }
+     mon_info[m.name] = m;
+     if (get_required_features().contains_all(
+           ceph::features::mon::FEATURE_NAUTILUS)) {
+       ranks.push_back(m.name);
+       ceph_assert(ranks.size() == mon_info.size());
+     } else {
+       calc_legacy_ranks();
+     }
+     calc_addr_mons();
+   }
+
+   /**
+    * Add new monitor to the monmap
+    *
+    * @param name Monitor name (i.e., 'foo' in 'mon.foo')
+    * @param addrv Monitor's public address(es)
+    * @param priority Monitor's priority
+    * @param weight Monitor's weight
+    */
+   void add(const std::string &name, const entity_addrvec_t &addrv,
+            uint16_t priority=0, uint16_t weight=0) {
+     add(mon_info_t(name, addrv, priority, weight));
+   }
+
+   /**
+    * Remove monitor from the monmap
+    *
+    * @param name Monitor name (i.e., 'foo' in 'mon.foo')
+    */
+   void remove(const std::string &name) {
+     // this must match what we do in ConnectionTracker::notify_rank_removed
+     ceph_assert(mon_info.count(name));
+     int rank = get_rank(name);
+     mon_info.erase(name);
+     disallowed_leaders.erase(name);
+     ceph_assert(mon_info.count(name) == 0);
+     if (rank >= 0 ) {
+       removed_ranks.insert(rank);
+     }
+     if (get_required_features().contains_all(
+           ceph::features::mon::FEATURE_NAUTILUS)) {
+       ranks.erase(std::find(ranks.begin(), ranks.end(), name));
+       ceph_assert(ranks.size() == mon_info.size());
+     } else {
+       calc_legacy_ranks();
+     }
+     calc_addr_mons();
+   }
+
+   /**
+    * Rename monitor from @p oldname to @p newname
+    *
+    * @param oldname monitor's current name (i.e., 'foo' in 'mon.foo')
+    * @param newname monitor's new name (i.e., 'bar' in 'mon.bar')
+    */
+   void rename(std::string oldname, std::string newname) {
+     ceph_assert(contains(oldname));
+     ceph_assert(!contains(newname));
+     mon_info[newname] = mon_info[oldname];
+     mon_info.erase(oldname);
+     mon_info[newname].name = newname;
+     if (get_required_features().contains_all(
+           ceph::features::mon::FEATURE_NAUTILUS)) {
+       *std::find(ranks.begin(), ranks.end(), oldname) = newname;
+       ceph_assert(ranks.size() == mon_info.size());
+     } else {
+       calc_legacy_ranks();
+     }
+     calc_addr_mons();
+   }
+
+   /**
+    * Move monitor @p name to position @p rank in the rank list
+    *
+    * @returns 0 on success, -ENOENT if @p name has no rank, -EINVAL if
+    *          @p rank is out of range
+    */
+   int set_rank(const std::string& name, int rank) {
+     int oldrank = get_rank(name);
+     if (oldrank < 0) {
+       return -ENOENT;
+     }
+     if (rank < 0 || rank >= (int)ranks.size()) {
+       return -EINVAL;
+     }
+     if (oldrank != rank) {
+       ranks.erase(ranks.begin() + oldrank);
+       ranks.insert(ranks.begin() + rank, name);
+     }
+     return 0;
+   }
+
+   /// true if the monmap contains a monitor named @p name
+   bool contains(const std::string& name) const {
+     return mon_info.count(name);
+   }
+
+   /**
+    * Check if monmap contains a monitor with address @p a
+    *
+    * @note compares against every address in each monitor's public_addrs.
+    *
+    * @param a monitor address
+    * @param name if non-null, receives the name of the matching monitor
+    * @returns true if monmap contains a monitor with address @p a;
+    *          false otherwise.
+    */
+   bool contains(const entity_addr_t &a, std::string *name=nullptr) const {
+     for (auto& i : mon_info) {
+       for (auto& j : i.second.public_addrs.v) {
+         if (j == a) {
+           if (name) {
+             *name = i.first;
+           }
+           return true;
+         }
+       }
+     }
+     return false;
+   }
+   /// as above, but matches if any address in @p av belongs to a monitor
+   bool contains(const entity_addrvec_t &av, std::string *name=nullptr) const {
+     for (auto& i : mon_info) {
+       for (auto& j : i.second.public_addrs.v) {
+         for (auto& k : av.v) {
+           if (j == k) {
+             if (name) {
+               *name = i.first;
+             }
+             return true;
+           }
+         }
+       }
+     }
+     return false;
+   }
+
+   /// name of the monitor at rank @p n (asserts that the rank exists)
+   std::string get_name(unsigned n) const {
+     ceph_assert(n < ranks.size());
+     return ranks[n];
+   }
+   /// name of the monitor owning address @p a, or "" if none
+   std::string get_name(const entity_addr_t& a) const {
+     std::map<entity_addr_t, std::string>::const_iterator p = addr_mons.find(a);
+     if (p == addr_mons.end())
+       return std::string();
+     else
+       return p->second;
+   }
+   /// name of the first monitor owning any address in @p av, or "" if none
+   std::string get_name(const entity_addrvec_t& av) const {
+     for (auto& i : av.v) {
+       std::map<entity_addr_t, std::string>::const_iterator p = addr_mons.find(i);
+       if (p != addr_mons.end())
+         return p->second;
+     }
+     return std::string();
+   }
+
+   /// rank of monitor @p n, or -1 if it has none
+   int get_rank(const std::string& n) const {
+     if (auto found = std::find(ranks.begin(), ranks.end(), n);
+         found != ranks.end()) {
+       return std::distance(ranks.begin(), found);
+     } else {
+       return -1;
+     }
+   }
+   /// rank of the monitor owning address @p a, or -1 if none
+   int get_rank(const entity_addr_t& a) const {
+     std::string n = get_name(a);
+     if (!n.empty()) {
+       return get_rank(n);
+     }
+     return -1;
+   }
+   /// rank of the monitor owning any address in @p av, or -1 if none
+   int get_rank(const entity_addrvec_t& av) const {
+     std::string n = get_name(av);
+     if (!n.empty()) {
+       return get_rank(n);
+     }
+     return -1;
+   }
+   /**
+    * Look up the monitor name for address @p a
+    *
+    * @param a monitor address
+    * @param name receives the monitor name when found; untouched otherwise
+    * @returns true if a monitor owns @p a
+    */
+   bool get_addr_name(const entity_addr_t& a, std::string& name) const {
+     // a single find() instead of count() followed by operator[]
+     auto p = addr_mons.find(a);
+     if (p == addr_mons.end())
+       return false;
+     name = p->second;
+     return true;
+   }
+
+   /// public addresses of monitor @p n (asserts that it exists)
+   const entity_addrvec_t& get_addrs(const std::string& n) const {
+     ceph_assert(mon_info.count(n));
+     std::map<std::string,mon_info_t>::const_iterator p = mon_info.find(n);
+     return p->second.public_addrs;
+   }
+   /// public addresses of the monitor at rank @p m (asserts that it exists)
+   const entity_addrvec_t& get_addrs(unsigned m) const {
+     ceph_assert(m < ranks.size());
+     return get_addrs(ranks[m]);
+   }
+   /// replace the public addresses of monitor @p n and rebuild addr_mons
+   void set_addrvec(const std::string& n, const entity_addrvec_t& a) {
+     ceph_assert(mon_info.count(n));
+     mon_info[n].public_addrs = a;
+     calc_addr_mons();
+   }
+   uint16_t get_priority(const std::string& n) const {
+     auto it = mon_info.find(n);
+     ceph_assert(it != mon_info.end());
+     return it->second.priority;
+   }
+   uint16_t get_weight(const std::string& n) const {
+     auto it = mon_info.find(n);
+     ceph_assert(it != mon_info.end());
+     return it->second.weight;
+   }
+   void set_weight(const std::string& n, uint16_t v) {
+     auto it = mon_info.find(n);
+     ceph_assert(it != mon_info.end());
+     it->second.weight = v;
+   }
+
+   void encode(ceph::buffer::list& blist, uint64_t con_features) const;
+   void decode(ceph::buffer::list& blist) {
+     auto p = std::cbegin(blist);
+     decode(p);
+   }
+   void decode(ceph::buffer::list::const_iterator& p);
+
+   void generate_fsid() {
+     fsid.generate_random();
+   }
+
+   // read from/write to a file
+   int write(const char *fn);
+   int read(const char *fn);
+
+   /**
+    * build an initial bootstrap monmap from conf
+    *
+    * Build an initial bootstrap monmap from the config. This will
+    * try, in this order:
+    *
+    *   1 mon_host_override -- list of monitors that overrides the rest
+    *   2 monitor addresses already held by the context (non-seastar only)
+    *   3 monmap -- an explicitly provided monmap
+    *   4 mon_host -- list of monitors
+    *   5 config [mon.*] sections, and 'mon addr' fields in those sections
+    *   6 DNS SRV records for mon_dns_srv_name
+    *
+    * @param cct context (and associated config); the seastar variant takes
+    *            the config proxy directly
+    * @param for_mkfs forwarded to the address-based initialisers
+    * @param errout std::ostream to send error messages to (non-seastar only)
+    */
+ #ifdef WITH_SEASTAR
+   seastar::future<> build_initial(const crimson::common::ConfigProxy& conf, bool for_mkfs);
+ #else
+   int build_initial(CephContext *cct, bool for_mkfs, std::ostream& errout);
+ #endif
+   /**
+    * filter monmap given a set of initial members.
+    *
+    * Remove mons that aren't in the initial_members list.  Add missing
+    * mons and give them dummy IPs (blank IPv4, with a non-zero
+    * nonce). If the name matches my_name, then my_addrs will be used in
+    * place of a dummy addr.
+    *
+    * @param cct context, used for logging
+    * @param initial_members list of initial member names
+    * @param my_name name of self, can be blank
+    * @param my_addrs my addrs
+    * @param removed optional pointer to set to insert removed mon addrs to
+    */
+   void set_initial_members(CephContext *cct,
+                            std::list<std::string>& initial_members,
+                            std::string my_name,
+                            const entity_addrvec_t& my_addrs,
+                            std::set<entity_addrvec_t> *removed);
+
+   void print(std::ostream& out) const;
+   void print_summary(std::ostream& out) const;
+   void dump(ceph::Formatter *f) const;
+   void dump_summary(ceph::Formatter *f) const;
+
+   void check_health(health_check_map_t *checks) const;
+
+   static void generate_test_instances(std::list<MonMap*>& o);
+ protected:
+   /**
+    * build a monmap from a list of entity_addrvec_t's
+    *
+    * Give mons dummy names.
+    *
+    * @param addrs list of entity_addrvec_t's
+    * @param for_mkfs forwarded to _add_ambiguous_addr for single-address entries
+    * @param prefix prefix to prepend to generated mon names
+    */
+   void init_with_addrs(const std::vector<entity_addrvec_t>& addrs,
+                        bool for_mkfs,
+                        std::string_view prefix);
+   /**
+    * build a monmap from a list of ips
+    *
+    * Give mons dummy names.
+    *
+    * @param ips list of ips, space or comma separated
+    * @param for_mkfs forwarded to init_with_addrs
+    * @param prefix prefix to prepend to generated mon names
+    * @return 0 for success, -errno on error
+    */
+   int init_with_ips(const std::string& ips,
+                     bool for_mkfs,
+                     std::string_view prefix);
+   /**
+    * build a monmap from a list of hostnames
+    *
+    * Give mons dummy names.
+    *
+    * @param hostlist list of hostnames, space or comma separated
+    * @param for_mkfs forwarded to init_with_addrs
+    * @param prefix prefix to prepend to generated mon names
+    * @return 0 for success, -errno on error
+    */
+   int init_with_hosts(const std::string& hostlist,
+                       bool for_mkfs,
+                       std::string_view prefix);
+   /// build a monmap from the [mon.*] sections of the config file
+   int init_with_config_file(const ConfigProxy& conf, std::ostream& errout);
+ #ifdef WITH_SEASTAR
+   seastar::future<> read_monmap(const std::string& monmap);
+   /// try to build monmap with different settings, like
+   /// mon_host, mon* sections, and mon_dns_srv_name
+   seastar::future<> build_monmap(const crimson::common::ConfigProxy& conf, bool for_mkfs);
+   /// initialize monmap by resolving given service name
+   seastar::future<> init_with_dns_srv(bool for_mkfs, const std::string& name);
+   /// initialize monmap with `mon_host` or `mon_host_override`
+   bool maybe_init_with_mon_host(const std::string& mon_host, bool for_mkfs);
+ #else
+   /// read from encoded monmap file
+   int init_with_monmap(const std::string& monmap, std::ostream& errout);
+   /// initialize monmap from the DNS SRV records of the given service name
+   int init_with_dns_srv(CephContext* cct, std::string srv_name, bool for_mkfs,
+                         std::ostream& errout);
+ #endif
+ };
+WRITE_CLASS_ENCODER_FEATURES(MonMap)
+
+ /// Stream a MonMap by delegating to MonMap::print_summary().
+ inline std::ostream& operator<<(std::ostream &out, const MonMap &m) {
+ m.print_summary(out);
+ return out;
+ }
+
+#endif
diff --git a/src/mon/MonOpRequest.h b/src/mon/MonOpRequest.h
new file mode 100644
index 000000000..0c4379910
--- /dev/null
+++ b/src/mon/MonOpRequest.h
@@ -0,0 +1,238 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Red Hat <contact@redhat.com>
+ * Copyright (C) 2015 SUSE LINUX GmbH
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+#ifndef MON_OPREQUEST_H_
+#define MON_OPREQUEST_H_
+#include <iosfwd>
+#include <stdint.h>
+
+#include "common/TrackedOp.h"
+#include "mon/Session.h"
+#include "msg/Message.h"
+
+ /**
+  * A tracked operation wrapping one Message received by the monitor.
+  *
+  * Keeps the message together with the connection and session it arrived
+  * on, and records named events on the underlying TrackedOp as the message
+  * moves through the monitor. The destructor calls put() on the message.
+  */
+ struct MonOpRequest : public TrackedOp {
+   friend class OpTracker;
+
+   void mark_dispatch() {
+     mark_event("monitor_dispatch");
+   }
+   void mark_wait_for_quorum() {
+     mark_event("wait_for_quorum");
+   }
+   void mark_zap() {
+     mark_event("monitor_zap");
+   }
+   /// record the event and remember that the op was forwarded
+   void mark_forwarded() {
+     mark_event("forwarded");
+     forwarded_to_leader = true;
+   }
+
+   /// record an event named "<service>:<event>"
+   void mark_svc_event(const std::string &service, const std::string &event) {
+     std::string s = service;
+     s.append(":").append(event);
+     mark_event(s);
+   }
+
+   void mark_logmon_event(const std::string &event) {
+     mark_svc_event("logm", event);
+   }
+   void mark_osdmon_event(const std::string &event) {
+     mark_svc_event("osdmap", event);
+   }
+   void mark_pgmon_event(const std::string &event) {
+     mark_svc_event("pgmap", event);
+   }
+   void mark_mdsmon_event(const std::string &event) {
+     mark_svc_event("mdsmap", event);
+   }
+   void mark_authmon_event(const std::string &event) {
+     mark_svc_event("auth", event);
+   }
+   void mark_paxos_event(const std::string &event) {
+     mark_svc_event("paxos", event);
+   }
+
+
+   enum op_type_t {
+     OP_TYPE_NONE = 0,      ///< no type defined (default)
+     OP_TYPE_SERVICE,       ///< belongs to a Paxos Service or similar
+     OP_TYPE_MONITOR,       ///< belongs to the Monitor class
+     OP_TYPE_ELECTION,      ///< belongs to the Elector class
+     OP_TYPE_PAXOS,         ///< refers to Paxos messages
+     OP_TYPE_COMMAND,       ///< is a command
+   };
+
+   MonOpRequest(const MonOpRequest &other) = delete;
+   MonOpRequest & operator = (const MonOpRequest &other) = delete;
+
+ private:
+   Message *request;
+   utime_t dequeued_time;
+   RefCountedPtr session;
+   ConnectionRef con;
+   bool forwarded_to_leader;
+   op_type_t op_type;
+
+   /**
+    * @param req the received message; its receive stamp becomes the op's
+    *            initiation time, falling back to the current time when the
+    *            stamp is zero or @p req is null
+    * @param tracker tracker handed to the TrackedOp base
+    */
+   MonOpRequest(Message *req, OpTracker *tracker) :
+     TrackedOp(tracker,
+       // check req before dereferencing it: the body below already
+       // treats a null req as possible
+       (req && !req->get_recv_stamp().is_zero()) ?
+       req->get_recv_stamp() : ceph_clock_now()),
+     request(req),
+     con(nullptr),
+     forwarded_to_leader(false),
+     op_type(OP_TYPE_NONE)
+   {
+     if (req) {
+       con = req->get_connection();
+       if (con) {
+         session = con->get_priv();
+       }
+     }
+   }
+
+   /// dump the recorded events (with durations) and basic info about the op
+   void _dump(ceph::Formatter *f) const override {
+     {
+       f->open_array_section("events");
+       std::lock_guard l(lock);
+       for (auto i = events.begin(); i != events.end(); ++i) {
+         f->open_object_section("event");
+         f->dump_string("event", i->str);
+         f->dump_stream("time") << i->stamp;
+
+         auto i_next = i + 1;
+
+         // every event but the last lasts until the next event; the last
+         // one reports its stamp minus the op's initiation time
+         if (i_next < events.end()) {
+           f->dump_float("duration", i_next->stamp - i->stamp);
+         } else {
+           f->dump_float("duration", events.rbegin()->stamp - get_initiated());
+         }
+
+         f->close_section();
+       }
+       f->close_section();
+       f->open_object_section("info");
+       f->dump_int("seq", seq);
+       f->dump_bool("src_is_mon", is_src_mon());
+       // "source" is omitted when there is no message to take it from
+       if (request) {
+         f->dump_stream("source") << request->get_source_inst();
+       }
+       f->dump_bool("forwarded_to_leader", forwarded_to_leader);
+       f->close_section();
+     }
+   }
+
+ protected:
+   /// describe the op by printing the wrapped message, if there is one
+   void _dump_op_descriptor(std::ostream& stream) const override {
+     if (request) {
+       get_req()->print(stream);
+     }
+   }
+
+ public:
+   ~MonOpRequest() override {
+     if (request) {
+       request->put();
+     }
+   }
+
+   MonSession *get_session() const {
+     return static_cast<MonSession*>(session.get());
+   }
+
+   /// the wrapped message, cast to T without any type check
+   template<class T>
+   T *get_req() const { return static_cast<T*>(request); }
+
+   Message *get_req() const { return get_req<Message>(); }
+
+   /// message type, or 0 when there is no message
+   int get_req_type() const {
+     if (!request)
+       return 0;
+     return request->get_type();
+   }
+
+   ConnectionRef get_connection() const { return con; }
+
+   void set_session(MonSession *s) {
+     session.reset(s);
+   }
+
+   /// true when the op has a connection whose peer type has the MON bit set
+   bool is_src_mon() const {
+     return (con && con->get_peer_type() & CEPH_ENTITY_TYPE_MON);
+   }
+
+   typedef boost::intrusive_ptr<MonOpRequest> Ref;
+
+   void set_op_type(op_type_t t) {
+     op_type = t;
+   }
+   void set_type_service() {
+     set_op_type(OP_TYPE_SERVICE);
+   }
+   void set_type_monitor() {
+     set_op_type(OP_TYPE_MONITOR);
+   }
+   void set_type_paxos() {
+     set_op_type(OP_TYPE_PAXOS);
+   }
+   void set_type_election_or_ping() {
+     set_op_type(OP_TYPE_ELECTION);
+   }
+   void set_type_command() {
+     set_op_type(OP_TYPE_COMMAND);
+   }
+
+   // the accessors below only read state, so they are const and usable
+   // through a const MonOpRequest
+   op_type_t get_op_type() const {
+     return op_type;
+   }
+
+   bool is_type_service() const {
+     return (get_op_type() == OP_TYPE_SERVICE);
+   }
+   bool is_type_monitor() const {
+     return (get_op_type() == OP_TYPE_MONITOR);
+   }
+   bool is_type_paxos() const {
+     return (get_op_type() == OP_TYPE_PAXOS);
+   }
+   bool is_type_election_or_ping() const {
+     return (get_op_type() == OP_TYPE_ELECTION);
+   }
+   bool is_type_command() const {
+     return (get_op_type() == OP_TYPE_COMMAND);
+   }
+ };
+
+typedef MonOpRequest::Ref MonOpRequestRef;
+
+ /**
+  * Context bound to a monitor op. finish() records how the callback ended
+  * on the op (when there is one) and then hands over to _finish().
+  */
+ struct C_MonOp : public Context
+ {
+   MonOpRequestRef op;
+
+   explicit C_MonOp(MonOpRequestRef o) :
+     op(o) { }
+
+   void finish(int r) override {
+     if (op) {
+       // only these three results leave a trace on the op
+       switch (r) {
+       case -ECANCELED:
+         op->mark_event("callback canceled");
+         break;
+       case -EAGAIN:
+         op->mark_event("callback retry");
+         break;
+       case 0:
+         op->mark_event("callback finished");
+         break;
+       default:
+         break;
+       }
+     }
+     _finish(r);
+   }
+
+   /// record @p event on the op, if there is one
+   void mark_op_event(const std::string &event) {
+     if (!op) {
+       return;
+     }
+     op->mark_event(event);
+   }
+
+   /// the actual completion logic, supplied by the subclass
+   virtual void _finish(int r) = 0;
+ };
+
+#endif /* MON_OPREQUEST_H_ */
diff --git a/src/mon/MonSub.cc b/src/mon/MonSub.cc
new file mode 100644
index 000000000..a2c60ba91
--- /dev/null
+++ b/src/mon/MonSub.cc
@@ -0,0 +1,114 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "MonSub.h"
+
+ // True while at least one subscription is waiting in sub_new.
+ bool MonSub::have_new() const
+ {
+   const bool nothing_pending = sub_new.empty();
+   return !nothing_pending;
+ }
+
+ // True once the current coarse monotonic time has passed renew_after.
+ bool MonSub::need_renew() const
+ {
+   const auto now = ceph::coarse_mono_clock::now();
+   return renew_after < now;
+ }
+
+ // Move every "new" subscription into the "sent" set.
+ void MonSub::renewed()
+ {
+   // only stamp the send time if no earlier renewal is still un-acked
+   if (clock::is_zero(renew_sent)) {
+     renew_sent = clock::now();
+   }
+   // Fold the already-sent entries into sub_new; insert() never overwrites,
+   // so an entry present in both maps keeps its sub_new value. The merged
+   // map then becomes sub_sent and sub_new is left empty.
+   sub_new.insert(sub_sent.begin(), sub_sent.end());
+   sub_new.swap(sub_sent);
+   sub_new.clear();
+ }
+
+ // The peer acknowledged a subscription request: schedule the next renewal
+ // half of `interval` after the time the request was sent.
+ void MonSub::acked(uint32_t interval)
+ {
+   if (clock::is_zero(renew_sent)) {
+     // no renewal outstanding, nothing to schedule
+     return;
+   }
+   // NOTE: this is only needed for legacy (infernalis or older)
+   // mons; see MonClient::tick().
+   renew_after = renew_sent;
+   renew_after += ceph::make_timespan(interval / 2.0);
+   renew_sent = clock::zero();
+ }
+
+ // Copy every sent subscription back into sub_new unless sub_new already
+ // has an entry for the same name; reports whether anything is now pending.
+ bool MonSub::reload()
+ {
+   // range insert() skips keys that already exist, so pending entries win
+   sub_new.insert(sub_sent.begin(), sub_sent.end());
+   return have_new();
+ }
+
+ // We received version `have` of `what`: advance (or, for one-time
+ // subscriptions, drop) the matching entry. sub_new is consulted first and
+ // sub_sent only when sub_new has no entry for `what`.
+ void MonSub::got(const std::string& what, version_t have)
+ {
+   // Applies the update to one map; returns whether the map knew `what`.
+   const auto advance = [&what, have](auto& subs) {
+     const auto it = subs.find(what);
+     if (it == subs.end()) {
+       return false;
+     }
+     auto& sub = it->second;
+     if (sub.start <= have) {
+       if (sub.flags & CEPH_SUBSCRIBE_ONETIME) {
+         subs.erase(it);
+       } else {
+         sub.start = have + 1;
+       }
+     }
+     return true;
+   };
+
+   if (!advance(sub_new)) {
+     advance(sub_sent);
+   }
+ }
+
+ // Request a subscription to `what` from `start` with `flags`.
+ // Returns false when an identical request (same start and flags) is already
+ // pending or sent; otherwise records it in sub_new and returns true.
+ bool MonSub::want(const std::string& what, version_t start, unsigned flags)
+ {
+   const auto already_requested = [&](const auto& subs) {
+     const auto it = subs.find(what);
+     return it != subs.end() &&
+            it->second.start == start &&
+            it->second.flags == flags;
+   };
+
+   if (already_requested(sub_new) || already_requested(sub_sent)) {
+     return false;
+   }
+   sub_new[what].start = start;
+   sub_new[what].flags = flags;
+   return true;
+ }
+
+ // Raise the requested start point of `what` to `start` (applying `flags`
+ // as well) if that is an increase; returns whether anything changed.
+ bool MonSub::inc_want(const std::string& what, version_t start, unsigned flags)
+ {
+   // a pending request decides on its own; sub_sent is not consulted
+   const auto pending = sub_new.find(what);
+   if (pending != sub_new.end()) {
+     if (pending->second.start >= start) {
+       return false;
+     }
+     pending->second.start = start;
+     pending->second.flags = flags;
+     return true;
+   }
+
+   // nothing pending: skip if what was already sent is at least as new
+   const auto sent = sub_sent.find(what);
+   if (sent != sub_sent.end() && sent->second.start >= start) {
+     return false;
+   }
+   auto& item = sub_new[what];
+   item.start = start;
+   item.flags = flags;
+   return true;
+ }
+
+ // Forget `what` entirely, whether it is still pending or already sent.
+ void MonSub::unwant(const std::string& what)
+ {
+   sub_new.erase(what);
+   sub_sent.erase(what);
+ }
diff --git a/src/mon/MonSub.h b/src/mon/MonSub.h
new file mode 100644
index 000000000..8ff5a8f18
--- /dev/null
+++ b/src/mon/MonSub.h
@@ -0,0 +1,46 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <map>
+#include <string>
+
+#include "common/ceph_time.h"
+#include "include/types.h"
+
+ // Bookkeeping for monitor subscriptions: entries that still have to be
+ // sent ("new") and entries that already went out ("sent").
+ class MonSub
+ {
+ public:
+   // @returns true if any "new" subscription is waiting to be sent
+   bool have_new() const;
+   // @returns a copy of the "new" (unsent) subscriptions
+   auto get_subs() const {
+     return sub_new;
+   }
+   // @returns true once the renewal deadline has passed
+   bool need_renew() const;
+   // change the status of all "new" subscriptions to "sent"
+   void renewed();
+   // the peer acked the subscription request
+   void acked(uint32_t interval);
+   // the given version of `what` has been received
+   void got(const std::string& what, version_t version);
+   // revert the status of subscriptions from "sent" to "new"
+   // @returns true if any "new" subscription is pending afterwards
+   bool reload();
+   // add a new subscription
+   // @returns false if an identical request already exists
+   bool want(const std::string& what, version_t start, unsigned flags);
+   // increment the requested subscription start point. If the value is
+   // increased, the passed-in flags are applied as well; otherwise nothing
+   // changes.
+   bool inc_want(const std::string& what, version_t start, unsigned flags);
+   // cancel a subscription
+   void unwant(const std::string& what);
+ private:
+   using sub_map = std::map<std::string,ceph_mon_subscribe_item>;
+   // my subs, and current versions
+   sub_map sub_sent;
+   // unsent new subs
+   sub_map sub_new;
+   using time_point = ceph::coarse_mono_time;
+   using clock = typename time_point::clock;
+   // when the last un-acked renewal was sent (zero if none is outstanding)
+   time_point renew_sent;
+   // need_renew() becomes true after this point in time
+   time_point renew_after;
+ };
diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc
new file mode 100644
index 000000000..05748ac4f
--- /dev/null
+++ b/src/mon/Monitor.cc
@@ -0,0 +1,6961 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#include <iterator>
+#include <sstream>
+#include <tuple>
+#include <stdlib.h>
+#include <signal.h>
+#include <limits.h>
+#include <cstring>
+#include <boost/scope_exit.hpp>
+#include <boost/algorithm/string/predicate.hpp>
+
+#include "json_spirit/json_spirit_reader.h"
+#include "json_spirit/json_spirit_writer.h"
+
+#include "Monitor.h"
+#include "common/version.h"
+#include "common/blkdev.h"
+#include "common/cmdparse.h"
+#include "common/signal.h"
+
+#include "osd/OSDMap.h"
+
+#include "MonitorDBStore.h"
+
+#include "messages/PaxosServiceMessage.h"
+#include "messages/MMonMap.h"
+#include "messages/MMonGetMap.h"
+#include "messages/MMonGetVersion.h"
+#include "messages/MMonGetVersionReply.h"
+#include "messages/MGenericMessage.h"
+#include "messages/MMonCommand.h"
+#include "messages/MMonCommandAck.h"
+#include "messages/MMonSync.h"
+#include "messages/MMonScrub.h"
+#include "messages/MMonProbe.h"
+#include "messages/MMonJoin.h"
+#include "messages/MMonPaxos.h"
+#include "messages/MRoute.h"
+#include "messages/MForward.h"
+
+#include "messages/MMonSubscribe.h"
+#include "messages/MMonSubscribeAck.h"
+
+#include "messages/MCommand.h"
+#include "messages/MCommandReply.h"
+
+#include "messages/MTimeCheck2.h"
+#include "messages/MPing.h"
+
+#include "common/strtol.h"
+#include "common/ceph_argparse.h"
+#include "common/Timer.h"
+#include "common/Clock.h"
+#include "common/errno.h"
+#include "common/perf_counters.h"
+#include "common/admin_socket.h"
+#include "global/signal_handler.h"
+#include "common/Formatter.h"
+#include "include/stringify.h"
+#include "include/color.h"
+#include "include/ceph_fs.h"
+#include "include/str_list.h"
+
+#include "OSDMonitor.h"
+#include "MDSMonitor.h"
+#include "MonmapMonitor.h"
+#include "LogMonitor.h"
+#include "AuthMonitor.h"
+#include "MgrMonitor.h"
+#include "MgrStatMonitor.h"
+#include "ConfigMonitor.h"
+#include "KVMonitor.h"
+#include "mon/HealthMonitor.h"
+#include "common/config.h"
+#include "common/cmdparse.h"
+#include "include/ceph_assert.h"
+#include "include/compat.h"
+#include "perfglue/heap_profiler.h"
+
+#include "auth/none/AuthNoneClientHandler.h"
+
+#define dout_subsys ceph_subsys_mon
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, this)
+using namespace TOPNSPC::common;
+
+using std::cout;
+using std::dec;
+using std::hex;
+using std::list;
+using std::map;
+using std::make_pair;
+using std::ostream;
+using std::ostringstream;
+using std::pair;
+using std::set;
+using std::setfill;
+using std::string;
+using std::stringstream;
+using std::to_string;
+using std::vector;
+using std::unique_ptr;
+
+using ceph::bufferlist;
+using ceph::decode;
+using ceph::encode;
+using ceph::ErasureCodeInterfaceRef;
+using ceph::ErasureCodeProfile;
+using ceph::Formatter;
+using ceph::JSONFormatter;
+using ceph::make_message;
+using ceph::mono_clock;
+using ceph::mono_time;
+using ceph::timespan_str;
+
+
+ // Debug-log prefix: "mon.<name>@<rank>(<state>) e<monmap epoch> ".
+ static ostream& _prefix(std::ostream *_dout, const Monitor *mon) {
+   std::ostream& os = *_dout;
+   os << "mon." << mon->name << "@" << mon->rank;
+   os << "(" << mon->get_state_name() << ") e" << mon->monmap->get_epoch() << " ";
+   return os;
+ }
+
+ // Store prefixes used for the monitor's own keys.
+ const string Monitor::MONITOR_NAME("monitor");
+ const string Monitor::MONITOR_STORE_PREFIX("monitor_store");
+
+ /*
+  * Static table of monitor commands. <mon/MonCommands.h> is included inside
+  * the array initializer with COMMAND()/COMMAND_WITH_FLAG() defined so that
+  * each invocation in that header (presumably one per command) expands to
+  * one MonCommand aggregate initializer followed by a comma.
+  */
+ #undef FLAG
+ #undef COMMAND
+ #undef COMMAND_WITH_FLAG
+ #define FLAG(f) (MonCommand::FLAG_##f) // e.g. FLAG(NONE) -> MonCommand::FLAG_NONE
+ #define COMMAND(parsesig, helptext, modulename, req_perms) \
+ {parsesig, helptext, modulename, req_perms, FLAG(NONE)},
+ #define COMMAND_WITH_FLAG(parsesig, helptext, modulename, req_perms, flags) \
+ {parsesig, helptext, modulename, req_perms, flags},
+ MonCommand mon_commands[] = {
+ #include <mon/MonCommands.h>
+ };
+ #undef COMMAND // only the two table macros are dropped; FLAG stays defined
+ #undef COMMAND_WITH_FLAG
+ /*
+  * Construct a monitor named `nm` on top of store `s`, the messengers
+  * `m` / `mgr_m` and the seed monmap `map`.
+  *
+  * Besides initializing members, the body creates the cluster and audit log
+  * channels, optionally exports KRB5_CLIENT_KTNAME, configures the op
+  * tracker from the mon_op_* options, instantiates Paxos plus one
+  * PaxosService per PAXOS_* slot, and encodes the local command table in
+  * both its current and pre-nautilus forms.
+  */
+ Monitor::Monitor(CephContext* cct_, string nm, MonitorDBStore *s,
+ Messenger *m, Messenger *mgr_m, MonMap *map) :
+ Dispatcher(cct_),
+ AuthServer(cct_),
+ name(nm),
+ rank(-1),
+ messenger(m),
+ con_self(m ? m->get_loopback_connection() : NULL), // tolerates a null messenger
+ timer(cct_, lock),
+ finisher(cct_, "mon_finisher", "fin"),
+ cpu_tp(cct, "Monitor::cpu_tp", "cpu_tp", g_conf()->mon_cpu_threads),
+ has_ever_joined(false),
+ logger(NULL), cluster_logger(NULL), cluster_logger_registered(false),
+ monmap(map),
+ log_client(cct_, messenger, monmap, LogClient::FLAG_MON),
+ key_server(cct, &keyring),
+ auth_cluster_required(cct, // auth_supported, when set, overrides the per-scope option
+ cct->_conf->auth_supported.empty() ?
+ cct->_conf->auth_cluster_required : cct->_conf->auth_supported),
+ auth_service_required(cct,
+ cct->_conf->auth_supported.empty() ?
+ cct->_conf->auth_service_required : cct->_conf->auth_supported),
+ mgr_messenger(mgr_m),
+ mgr_client(cct_, mgr_m, monmap),
+ gss_ktfile_client(cct->_conf.get_val<std::string>("gss_ktab_client_file")),
+ store(s),
+
+ elector(this, map->strategy), // NOTE: dereferences `map`, which therefore must be non-null
+ required_features(0),
+ leader(0),
+ quorum_con_features(0),
+ // scrub state
+ scrub_version(0),
+ scrub_event(NULL),
+ scrub_timeout_event(NULL),
+
+ // sync state
+ sync_provider_count(0),
+ sync_cookie(0),
+ sync_full(false),
+ sync_start_version(0),
+ sync_timeout_event(NULL),
+ sync_last_committed_floor(0),
+
+ timecheck_round(0),
+ timecheck_acks(0),
+ timecheck_rounds_since_clean(0),
+ timecheck_event(NULL),
+
+ admin_hook(NULL),
+ routed_request_tid(0),
+ op_tracker(cct, g_conf().get_val<bool>("mon_enable_op_tracker"), 1)
+ {
+ clog = log_client.create_channel(CLOG_CHANNEL_CLUSTER);
+ audit_clog = log_client.create_channel(CLOG_CHANNEL_AUDIT);
+
+ update_log_clients();
+
+ if (!gss_ktfile_client.empty()) {
+ // Assert we can export environment variable
+ /*
+ The default client keytab is used, if it is present and readable,
+ to automatically obtain initial credentials for GSSAPI client
+ applications. The principal name of the first entry in the client
+ keytab is used by default when obtaining initial credentials.
+ 1. The KRB5_CLIENT_KTNAME environment variable.
+ 2. The default_client_keytab_name profile variable in [libdefaults].
+ 3. The hardcoded default, DEFCKTNAME.
+ */
+ const int32_t set_result(setenv("KRB5_CLIENT_KTNAME",
+ gss_ktfile_client.c_str(), 1)); // last argument 1: overwrite an existing value
+ ceph_assert(set_result == 0);
+ }
+
+ op_tracker.set_complaint_and_threshold(
+ g_conf().get_val<std::chrono::seconds>("mon_op_complaint_time").count(),
+ g_conf().get_val<int64_t>("mon_op_log_threshold"));
+ op_tracker.set_history_size_and_duration(
+ g_conf().get_val<uint64_t>("mon_op_history_size"),
+ g_conf().get_val<std::chrono::seconds>("mon_op_history_duration").count());
+ op_tracker.set_history_slow_op_size_and_threshold(
+ g_conf().get_val<uint64_t>("mon_op_history_slow_op_size"),
+ g_conf().get_val<std::chrono::seconds>("mon_op_history_slow_op_threshold").count());
+
+ paxos = std::make_unique<Paxos>(*this, "paxos");
+
+ // one service per PAXOS_* slot, all sharing the single Paxos instance
+ paxos_service[PAXOS_MDSMAP].reset(new MDSMonitor(*this, *paxos, "mdsmap"));
+ paxos_service[PAXOS_MONMAP].reset(new MonmapMonitor(*this, *paxos, "monmap"));
+ paxos_service[PAXOS_OSDMAP].reset(new OSDMonitor(cct, *this, *paxos, "osdmap"));
+ paxos_service[PAXOS_LOG].reset(new LogMonitor(*this, *paxos, "logm"));
+ paxos_service[PAXOS_AUTH].reset(new AuthMonitor(*this, *paxos, "auth"));
+ paxos_service[PAXOS_MGR].reset(new MgrMonitor(*this, *paxos, "mgr"));
+ paxos_service[PAXOS_MGRSTAT].reset(new MgrStatMonitor(*this, *paxos, "mgrstat"));
+ paxos_service[PAXOS_HEALTH].reset(new HealthMonitor(*this, *paxos, "health"));
+ paxos_service[PAXOS_CONFIG].reset(new ConfigMonitor(*this, *paxos, "config"));
+ paxos_service[PAXOS_KV].reset(new KVMonitor(*this, *paxos, "kv"));
+
+ bool r = mon_caps.parse("allow *", NULL); // parsing this fixed string must succeed
+ ceph_assert(r);
+
+ exited_quorum = ceph_clock_now();
+
+ // prepare local commands: copy the static table and keep an encoded form
+ local_mon_commands.resize(std::size(mon_commands));
+ for (unsigned i = 0; i < std::size(mon_commands); ++i) {
+ local_mon_commands[i] = mon_commands[i];
+ }
+ MonCommand::encode_vector(local_mon_commands, local_mon_commands_bl);
+
+ // second copy with every cmdstring passed through the pre-nautilus
+ // compatibility rewrite
+ prenautilus_local_mon_commands = local_mon_commands;
+ for (auto& i : prenautilus_local_mon_commands) {
+ std::string n = cmddesc_get_prenautilus_compat(i.cmdstring);
+ if (n != i.cmdstring) {
+ dout(20) << " pre-nautilus cmd " << i.cmdstring << " -> " << n << dendl;
+ i.cmdstring = n;
+ }
+ }
+ MonCommand::encode_vector(prenautilus_local_mon_commands, prenautilus_local_mon_commands_bl);
+
+ // assume our commands until we have an election. this only means
+ // we won't reply with EINVAL before the election; any command that
+ // actually matters will wait until we have quorum etc and then
+ // retry (and revalidate).
+ leader_mon_commands = local_mon_commands;
+ }
+ /*
+  * Destructor: shuts the op tracker down, frees the "mon" perf counters
+  * (`logger`, created in preinit()), and asserts that no session is left
+  * in the session map.
+  */
+ Monitor::~Monitor()
+ {
+ op_tracker.on_shutdown();
+ delete logger; // NOTE(review): cluster_logger is not released here — presumably handled elsewhere; confirm
+ ceph_assert(session_map.sessions.empty());
+ }
+
+
+ // Admin-socket hook that forwards every command to
+ // Monitor::do_admin_command() and returns its text output in `out`.
+ class AdminHook : public AdminSocketHook {
+   Monitor *mon;
+ public:
+   explicit AdminHook(Monitor *m) : mon(m) {}
+   int call(std::string_view command, const cmdmap_t& cmdmap,
+            const bufferlist&,
+            Formatter *f,
+            std::ostream& errss,
+            bufferlist& out) override {
+     // the monitor writes plain-text output to a stream; buffer it here
+     stringstream text;
+     const int rc = mon->do_admin_command(command, cmdmap, f, errss, text);
+     out.append(text);
+     return rc;
+   }
+ };
+
+ /**
+  * Execute a command received over the admin socket.
+  *
+  * Holds the monitor lock for the whole call. A "dispatch" audit entry is
+  * written before the command runs and a "finished" or "aborted" entry
+  * afterwards; the read-only commands are audited at debug level, all
+  * others at info level.
+  *
+  * @param command  command name the hook was invoked for
+  * @param cmdmap   parsed arguments; every value except "prefix" is audited
+  * @param f        formatter for structured output
+  * @param err      stream for human-readable error text
+  * @param out      stream for plain-text output
+  * @return 0, or a negative errno on the paths that set one
+  *
+  * An unknown command aborts the process (bad AdminSocket binding).
+  */
+ int Monitor::do_admin_command(
+   std::string_view command,
+   const cmdmap_t& cmdmap,
+   Formatter *f,
+   std::ostream& err,
+   std::ostream& out)
+ {
+   std::lock_guard l(lock);
+
+   int r = 0;
+   // render the arguments as "[a, b, ...]" for the audit log
+   string args;
+   for (auto p = cmdmap.begin();
+        p != cmdmap.end(); ++p) {
+     if (p->first == "prefix")
+       continue;
+     if (!args.empty())
+       args += ", ";
+     args += cmd_vartype_stringify(p->second);
+   }
+   args = "[" + args + "]";
+
+   bool read_only = (command == "mon_status" ||
+                     command == "mon metadata" ||
+                     command == "quorum_status" ||
+                     command == "ops" ||
+                     command == "sessions");
+
+   (read_only ? audit_clog->debug() : audit_clog->info())
+     << "from='admin socket' entity='admin socket' "
+     << "cmd='" << command << "' args=" << args << ": dispatch";
+
+   if (command == "mon_status") {
+     get_mon_status(f);
+   } else if (command == "quorum_status") {
+     _quorum_status(f, out);
+   } else if (command == "sync_force") {
+     // accept either the boolean flag or the legacy string argument
+     bool validate = false;
+     if (!cmd_getval(cmdmap, "yes_i_really_mean_it", validate)) {
+       std::string v;
+       if (cmd_getval(cmdmap, "validate", v) &&
+           v == "--yes-i-really-mean-it") {
+         validate = true;
+       }
+     }
+     if (!validate) {
+       err << "are you SURE? this will mean the monitor store will be erased "
+         "the next time the monitor is restarted. pass "
+         "'--yes-i-really-mean-it' if you really do.";
+       r = -EPERM;
+       goto abort;
+     }
+     sync_force(f);
+   } else if (command.compare(0, 23, "add_bootstrap_peer_hint") == 0 ||
+              command.compare(0, 24, "add_bootstrap_peer_hintv") == 0) {
+     if (!_add_bootstrap_peer_hint(command, cmdmap, out))
+       goto abort;
+   } else if (command == "quorum enter") {
+     elector.start_participating();
+     start_election();
+     out << "started responding to quorum, initiated new election";
+   } else if (command == "quorum exit") {
+     start_election();
+     elector.stop_participating();
+     out << "stopped responding to quorum, initiated new election";
+   } else if (command == "ops") {
+     (void)op_tracker.dump_ops_in_flight(f);
+   } else if (command == "sessions") {
+     f->open_array_section("sessions");
+     for (auto p : session_map.sessions) {
+       f->dump_object("session", *p);
+     }
+     f->close_section();
+   } else if (command == "dump_historic_ops") {
+     // a false result is taken to mean that tracking is disabled
+     if (!op_tracker.dump_historic_ops(f)) {
+       err << "op_tracker tracking is not enabled now, so no ops are tracked currently, even those get stuck. \
+ please enable \"mon_enable_op_tracker\", and the tracker will start to track new ops received afterwards.";
+     }
+   } else if (command == "dump_historic_ops_by_duration" ) {
+     // report only when the dump fails, like "dump_historic_ops" above;
+     // the test used to be inverted and complained on success
+     if (!op_tracker.dump_historic_ops(f, true)) {
+       err << "op_tracker tracking is not enabled now, so no ops are tracked currently, even those get stuck. \
+ please enable \"mon_enable_op_tracker\", and the tracker will start to track new ops received afterwards.";
+     }
+   } else if (command == "dump_historic_slow_ops") {
+     // same failure-only reporting as the two branches above
+     if (!op_tracker.dump_historic_slow_ops(f, {})) {
+       err << "op_tracker tracking is not enabled now, so no ops are tracked currently, even those get stuck. \
+ please enable \"mon_enable_op_tracker\", and the tracker will start to track new ops received afterwards.";
+     }
+   } else if (command == "quorum") {
+     string quorumcmd;
+     cmd_getval(cmdmap, "quorumcmd", quorumcmd);
+     if (quorumcmd == "exit") {
+       start_election();
+       elector.stop_participating();
+       out << "stopped responding to quorum, initiated new election" << std::endl;
+     } else if (quorumcmd == "enter") {
+       elector.start_participating();
+       start_election();
+       out << "started responding to quorum, initiated new election" << std::endl;
+     } else {
+       err << "needs a valid 'quorum' command" << std::endl;
+     }
+   } else if (command == "connection scores dump") {
+     // the missing-feature message is advisory; the dump still runs
+     if (!get_quorum_mon_features().contains_all(
+           ceph::features::mon::FEATURE_PINGING)) {
+       err << "Not all monitors support changing election strategies; \
+ please upgrade them first!";
+     }
+     elector.dump_connection_scores(f);
+   } else if (command == "connection scores reset") {
+     if (!get_quorum_mon_features().contains_all(
+           ceph::features::mon::FEATURE_PINGING)) {
+       err << "Not all monitors support changing election strategies; \
+ please upgrade them first!";
+     }
+     elector.notify_clear_peer_state();
+   } else if (command == "smart") {
+     string want_devid;
+     cmd_getval(cmdmap, "devid", want_devid);
+
+     string devname = store->get_devname();
+     if (devname.empty()) {
+       err << "could not determine device name for " << store->get_path();
+       r = -ENOENT;
+       goto abort;
+     }
+     set<string> devnames;
+     get_raw_devices(devname, &devnames);
+     json_spirit::mObject json_map;
+     uint64_t smart_timeout = cct->_conf.get_val<uint64_t>(
+       "mon_smart_report_timeout");
+     // locals are named so they no longer shadow `devname` and `err`
+     for (auto& raw_dev : devnames) {
+       string devid_err;
+       string devid = get_device_id(raw_dev, &devid_err);
+       // skip devices other than the requested one, if one was requested
+       if (want_devid.size() && want_devid != devid) {
+         derr << "get_device_id failed on " << raw_dev << ": " << devid_err << dendl;
+         continue;
+       }
+       json_spirit::mValue smart_json;
+       if (block_device_get_metrics(raw_dev, smart_timeout,
+                                    &smart_json)) {
+         dout(10) << "block_device_get_metrics failed for /dev/" << raw_dev
+                  << dendl;
+         continue;
+       }
+       json_map[devid] = smart_json;
+     }
+     json_spirit::write(json_map, out, json_spirit::pretty_print);
+   } else if (command == "heap") {
+     if (!ceph_using_tcmalloc()) {
+       err << "could not issue heap profiler command -- not using tcmalloc!";
+       r = -EOPNOTSUPP;
+       goto abort;
+     }
+     string cmd;
+     if (!cmd_getval(cmdmap, "heapcmd", cmd)) {
+       err << "unable to get value for command \"" << cmd << "\"";
+       r = -EINVAL;
+       goto abort;
+     }
+     std::vector<std::string> cmd_vec;
+     get_str_vec(cmd, cmd_vec);
+     string val;
+     if (cmd_getval(cmdmap, "value", val)) {
+       cmd_vec.push_back(val);
+     }
+     ceph_heap_profiler_handle_command(cmd_vec, out);
+   } else if (command == "compact") {
+     dout(1) << "triggering manual compaction" << dendl;
+     auto start = ceph::coarse_mono_clock::now();
+     store->compact_async();
+     auto end = ceph::coarse_mono_clock::now();
+     auto duration = ceph::to_seconds<double>(end - start);
+     dout(1) << "finished manual compaction in "
+             << duration << " seconds" << dendl;
+     out << "compacted " << g_conf().get_val<std::string>("mon_keyvaluedb")
+         << " in " << duration << " seconds";
+   } else {
+     ceph_abort_msg("bad AdminSocket command binding");
+   }
+   (read_only ? audit_clog->debug() : audit_clog->info())
+     << "from='admin socket' "
+     << "entity='admin socket' "
+     << "cmd=" << command << " "
+     << "args=" << args << ": finished";
+   return r;
+
+ abort:
+   (read_only ? audit_clog->debug() : audit_clog->info())
+     << "from='admin socket' "
+     << "entity='admin socket' "
+     << "cmd=" << command << " "
+     << "args=" << args << ": aborted";
+   return r;
+ }
+
+ // React to a signal: SIGHUP reopens logs, SIGINT/SIGTERM shut the monitor
+ // down; anything else trips the assertion.
+ void Monitor::handle_signal(int signum)
+ {
+   derr << "*** Got Signal " << sig_str(signum) << " ***" << dendl;
+   switch (signum) {
+   case SIGHUP:
+     sighup_handler(signum);
+     logmon()->reopen_logs();
+     break;
+   default:
+     ceph_assert(signum == SIGINT || signum == SIGTERM);
+     shutdown();
+     break;
+   }
+ }
+
+ // Baseline feature set: empty compat and ro_compat sets, with BASE and
+ // SINGLE_PAXOS in the incompat set.
+ CompatSet Monitor::get_initial_supported_features()
+ {
+   CompatSet::FeatureSet compat;
+   CompatSet::FeatureSet ro_compat;
+   CompatSet::FeatureSet incompat;
+   incompat.insert(CEPH_MON_FEATURE_INCOMPAT_BASE);
+   incompat.insert(CEPH_MON_FEATURE_INCOMPAT_SINGLE_PAXOS);
+   return CompatSet(compat, ro_compat, incompat);
+ }
+
+ // Everything this build supports: the initial set plus each incompat
+ // feature listed below.
+ CompatSet Monitor::get_supported_features()
+ {
+   CompatSet supported = get_initial_supported_features();
+   auto& incompat = supported.incompat;
+   incompat.insert(CEPH_MON_FEATURE_INCOMPAT_OSD_ERASURE_CODES);
+   incompat.insert(CEPH_MON_FEATURE_INCOMPAT_OSDMAP_ENC);
+   incompat.insert(CEPH_MON_FEATURE_INCOMPAT_ERASURE_CODE_PLUGINS_V2);
+   incompat.insert(CEPH_MON_FEATURE_INCOMPAT_ERASURE_CODE_PLUGINS_V3);
+   incompat.insert(CEPH_MON_FEATURE_INCOMPAT_KRAKEN);
+   incompat.insert(CEPH_MON_FEATURE_INCOMPAT_LUMINOUS);
+   incompat.insert(CEPH_MON_FEATURE_INCOMPAT_MIMIC);
+   incompat.insert(CEPH_MON_FEATURE_INCOMPAT_NAUTILUS);
+   incompat.insert(CEPH_MON_FEATURE_INCOMPAT_OCTOPUS);
+   incompat.insert(CEPH_MON_FEATURE_INCOMPAT_PACIFIC);
+   incompat.insert(CEPH_MON_FEATURE_INCOMPAT_QUINCY);
+   incompat.insert(CEPH_MON_FEATURE_INCOMPAT_REEF);
+   incompat.insert(CEPH_MON_FEATURE_INCOMPAT_SQUID);
+   return supported;
+ }
+
+ // Feature set assumed for a store that has no feature list on disk:
+ // only BASE in the incompat set.
+ CompatSet Monitor::get_legacy_features()
+ {
+   CompatSet::FeatureSet compat;
+   CompatSet::FeatureSet ro_compat;
+   CompatSet::FeatureSet incompat;
+   incompat.insert(CEPH_MON_FEATURE_INCOMPAT_BASE);
+   return CompatSet(compat, ro_compat, incompat);
+ }
+
+ // Compare the on-disk feature set with what this build supports.
+ // @returns 0 if the store is writeable by this build, -EPERM otherwise
+ int Monitor::check_features(MonitorDBStore *store)
+ {
+   CompatSet supported = get_supported_features();
+   CompatSet ondisk;
+   read_features_off_disk(store, &ondisk);
+
+   if (supported.writeable(ondisk)) {
+     return 0;
+   }
+
+   CompatSet diff = supported.unsupported(ondisk);
+   generic_derr << "ERROR: on disk data includes unsupported features: " << diff << dendl;
+   return -EPERM;
+ }
+
+ // Load the stored CompatSet into *features. When the store has none, the
+ // legacy feature set is assumed and written to the store.
+ void Monitor::read_features_off_disk(MonitorDBStore *store, CompatSet *features)
+ {
+   bufferlist featuresbl;
+   store->get(MONITOR_NAME, COMPAT_SET_LOC, featuresbl);
+
+   if (featuresbl.length() != 0) {
+     auto it = featuresbl.cbegin();
+     features->decode(it);
+     return;
+   }
+
+   generic_dout(0) << "WARNING: mon fs missing feature list.\n"
+                   << "Assuming it is old-style and introducing one." << dendl;
+   // We only want the baseline ~v.18 features assumed to be on disk.
+   // If new features are introduced this code needs to disappear or
+   // be made smarter.
+   *features = get_legacy_features();
+
+   features->encode(featuresbl);
+   auto txn(std::make_shared<MonitorDBStore::Transaction>());
+   txn->put(MONITOR_NAME, COMPAT_SET_LOC, featuresbl);
+   store->apply_transaction(txn);
+ }
+ /*
+  * Load the feature set from this monitor's store into `features`, then
+  * recompute the quorum requirements; both results are logged at level 10.
+  */
+ void Monitor::read_features()
+ {
+ read_features_off_disk(store, &features);
+ dout(10) << "features " << features << dendl;
+
+ calc_quorum_requirements(); // required_features is logged right after this call
+ dout(10) << "required_features " << required_features << dendl;
+ }
+
+ // Queue the current feature set for writing in transaction `t`, under the
+ // same key that read_features_off_disk() reads.
+ void Monitor::write_features(MonitorDBStore::TransactionRef t)
+ {
+   bufferlist encoded;
+   features.encode(encoded);
+   t->put(MONITOR_NAME, COMPAT_SET_LOC, encoded);
+ }
+ /*
+  * Return the NULL-terminated list of config option names this monitor
+  * tracks (presumably the set for which handle_conf_change() is invoked;
+  * the caller is not visible here). The array is a function-local static,
+  * so the returned pointer stays valid.
+  */
+ const char** Monitor::get_tracked_conf_keys() const
+ {
+ static const char* KEYS[] = {
+ "crushtool", // helpful for testing
+ "mon_election_timeout",
+ "mon_lease",
+ "mon_lease_renew_interval_factor",
+ "mon_lease_ack_timeout_factor",
+ "mon_accept_timeout_factor",
+ // clog & admin clog
+ "clog_to_monitors",
+ "clog_to_syslog",
+ "clog_to_syslog_facility",
+ "clog_to_syslog_level",
+ "clog_to_graylog",
+ "clog_to_graylog_host",
+ "clog_to_graylog_port",
+ "mon_cluster_log_to_file",
+ "host",
+ "fsid",
+ // periodic health to clog
+ "mon_health_to_clog",
+ "mon_health_to_clog_interval",
+ "mon_health_to_clog_tick_interval",
+ // scrub interval
+ "mon_scrub_interval",
+ "mon_allow_pool_delete",
+ // osdmap pruning - observed, not handled.
+ "mon_osdmap_full_prune_enabled",
+ "mon_osdmap_full_prune_min",
+ "mon_osdmap_full_prune_interval",
+ "mon_osdmap_full_prune_txsize",
+ // debug options - observed, not handled
+ "mon_debug_extra_checks",
+ "mon_debug_block_osdmap_trim",
+ NULL // terminator
+ };
+ return KEYS;
+ }
+
+ // Config-change callback. Re-validates the lease options, refreshes the
+ // log clients when a clog/host/fsid option changed, and queues the
+ // health-to-clog and scrub-interval updates on the finisher, where they
+ // run under the monitor lock.
+ void Monitor::handle_conf_change(const ConfigProxy& conf,
+                                  const std::set<std::string> &changed)
+ {
+   sanitize_options();
+
+   dout(10) << __func__ << " " << changed << dendl;
+
+   const bool log_clients_changed =
+     changed.count("clog_to_monitors") ||
+     changed.count("clog_to_syslog") ||
+     changed.count("clog_to_syslog_level") ||
+     changed.count("clog_to_syslog_facility") ||
+     changed.count("clog_to_graylog") ||
+     changed.count("clog_to_graylog_host") ||
+     changed.count("clog_to_graylog_port") ||
+     changed.count("host") ||
+     changed.count("fsid");
+   if (log_clients_changed) {
+     update_log_clients();
+   }
+
+   const bool health_to_clog_changed =
+     changed.count("mon_health_to_clog") ||
+     changed.count("mon_health_to_clog_interval") ||
+     changed.count("mon_health_to_clog_tick_interval");
+   if (health_to_clog_changed) {
+     // `changed` is captured by copy because the callback runs later
+     finisher.queue(new C_MonContext{this, [this, changed](int) {
+       std::lock_guard l{lock};
+       health_to_clog_update_conf(changed);
+     }});
+   }
+
+   if (changed.count("mon_scrub_interval")) {
+     auto scrub_interval =
+       conf.get_val<std::chrono::seconds>("mon_scrub_interval");
+     finisher.queue(new C_MonContext{this, [this, scrub_interval](int) {
+       std::lock_guard l{lock};
+       scrub_update_interval(scrub_interval);
+     }});
+   }
+ }
+
+ // Have both log channels (cluster and audit) re-read their client options.
+ void Monitor::update_log_clients()
+ {
+   auto ctx = g_ceph_context;
+   clog->parse_client_options(ctx);
+   audit_clog->parse_client_options(ctx);
+ }
+
+ // Validate the lease-related factors. Every violation is reported to the
+ // cluster log.
+ // @returns 0 when both options are sane, -EINVAL otherwise
+ int Monitor::sanitize_options()
+ {
+   int rc = 0;
+
+   // mon_lease must be greater than mon_lease_renewal; otherwise we
+   // may incur in leases expiring before they are renewed.
+   const auto renew_factor = g_conf()->mon_lease_renew_interval_factor;
+   if (renew_factor >= 1.0) {
+     clog->error() << "mon_lease_renew_interval_factor ("
+                   << renew_factor
+                   << ") must be less than 1.0";
+     rc = -EINVAL;
+   }
+
+   // mon_lease_ack_timeout must be greater than mon_lease to make sure we've
+   // got time to renew the lease and get an ack for it. Having both options
+   // with the same value, for a given small value, could mean timing out if
+   // the monitors happened to be overloaded -- or even under normal load for
+   // a small enough value.
+   const auto ack_factor = g_conf()->mon_lease_ack_timeout_factor;
+   if (ack_factor <= 1.0) {
+     clog->error() << "mon_lease_ack_timeout_factor ("
+                   << ack_factor
+                   << ") must be greater than 1.0";
+     rc = -EINVAL;
+   }
+
+   return rc;
+ }
+
+int Monitor::preinit()
+{
+ std::unique_lock l(lock);
+
+ dout(1) << "preinit fsid " << monmap->fsid << dendl;
+
+ int r = sanitize_options();
+ if (r < 0) {
+ derr << "option sanitization failed!" << dendl;
+ return r;
+ }
+
+ ceph_assert(!logger);
+ {
+ PerfCountersBuilder pcb(g_ceph_context, "mon", l_mon_first, l_mon_last);
+ pcb.add_u64(l_mon_num_sessions, "num_sessions", "Open sessions", "sess",
+ PerfCountersBuilder::PRIO_USEFUL);
+ pcb.add_u64_counter(l_mon_session_add, "session_add", "Created sessions",
+ "sadd", PerfCountersBuilder::PRIO_INTERESTING);
+ pcb.add_u64_counter(l_mon_session_rm, "session_rm", "Removed sessions",
+ "srm", PerfCountersBuilder::PRIO_INTERESTING);
+ pcb.add_u64_counter(l_mon_session_trim, "session_trim", "Trimmed sessions",
+ "strm", PerfCountersBuilder::PRIO_USEFUL);
+ pcb.add_u64_counter(l_mon_num_elections, "num_elections", "Elections participated in",
+ "ecnt", PerfCountersBuilder::PRIO_USEFUL);
+ pcb.add_u64_counter(l_mon_election_call, "election_call", "Elections started",
+ "estt", PerfCountersBuilder::PRIO_INTERESTING);
+ pcb.add_u64_counter(l_mon_election_win, "election_win", "Elections won",
+ "ewon", PerfCountersBuilder::PRIO_INTERESTING);
+ pcb.add_u64_counter(l_mon_election_lose, "election_lose", "Elections lost",
+ "elst", PerfCountersBuilder::PRIO_INTERESTING);
+ logger = pcb.create_perf_counters();
+ cct->get_perfcounters_collection()->add(logger);
+ }
+
+ ceph_assert(!cluster_logger);
+ {
+ PerfCountersBuilder pcb(g_ceph_context, "cluster", l_cluster_first, l_cluster_last);
+ pcb.add_u64(l_cluster_num_mon, "num_mon", "Monitors");
+ pcb.add_u64(l_cluster_num_mon_quorum, "num_mon_quorum", "Monitors in quorum");
+ pcb.add_u64(l_cluster_num_osd, "num_osd", "OSDs");
+ pcb.add_u64(l_cluster_num_osd_up, "num_osd_up", "OSDs that are up");
+ pcb.add_u64(l_cluster_num_osd_in, "num_osd_in", "OSD in state \"in\" (they are in cluster)");
+ pcb.add_u64(l_cluster_osd_epoch, "osd_epoch", "Current epoch of OSD map");
+ pcb.add_u64(l_cluster_osd_bytes, "osd_bytes", "Total capacity of cluster", NULL, 0, unit_t(UNIT_BYTES));
+ pcb.add_u64(l_cluster_osd_bytes_used, "osd_bytes_used", "Used space", NULL, 0, unit_t(UNIT_BYTES));
+ pcb.add_u64(l_cluster_osd_bytes_avail, "osd_bytes_avail", "Available space", NULL, 0, unit_t(UNIT_BYTES));
+ pcb.add_u64(l_cluster_num_pool, "num_pool", "Pools");
+ pcb.add_u64(l_cluster_num_pg, "num_pg", "Placement groups");
+ pcb.add_u64(l_cluster_num_pg_active_clean, "num_pg_active_clean", "Placement groups in active+clean state");
+ pcb.add_u64(l_cluster_num_pg_active, "num_pg_active", "Placement groups in active state");
+ pcb.add_u64(l_cluster_num_pg_peering, "num_pg_peering", "Placement groups in peering state");
+ pcb.add_u64(l_cluster_num_object, "num_object", "Objects");
+ pcb.add_u64(l_cluster_num_object_degraded, "num_object_degraded", "Degraded (missing replicas) objects");
+ pcb.add_u64(l_cluster_num_object_misplaced, "num_object_misplaced", "Misplaced (wrong location in the cluster) objects");
+ pcb.add_u64(l_cluster_num_object_unfound, "num_object_unfound", "Unfound objects");
+ pcb.add_u64(l_cluster_num_bytes, "num_bytes", "Size of all objects", NULL, 0, unit_t(UNIT_BYTES));
+ cluster_logger = pcb.create_perf_counters();
+ }
+
+ paxos->init_logger();
+
+ // verify cluster_uuid
+ {
+ int r = check_fsid();
+ if (r == -ENOENT)
+ r = write_fsid();
+ if (r < 0) {
+ return r;
+ }
+ }
+
+ // open compatset
+ read_features();
+
+ // have we ever joined a quorum?
+ has_ever_joined = (store->get(MONITOR_NAME, "joined") != 0);
+ dout(10) << "has_ever_joined = " << (int)has_ever_joined << dendl;
+
+ if (!has_ever_joined) {
+ // impose initial quorum restrictions?
+ list<string> initial_members;
+ get_str_list(g_conf()->mon_initial_members, initial_members);
+
+ if (!initial_members.empty()) {
+ dout(1) << " initial_members " << initial_members << ", filtering seed monmap" << dendl;
+
+ monmap->set_initial_members(
+ g_ceph_context, initial_members, name, messenger->get_myaddrs(),
+ &extra_probe_peers);
+
+ dout(10) << " monmap is " << *monmap << dendl;
+ dout(10) << " extra probe peers " << extra_probe_peers << dendl;
+ }
+ } else if (!monmap->contains(name)) {
+ derr << "not in monmap and have been in a quorum before; "
+ << "must have been removed" << dendl;
+ if (g_conf()->mon_force_quorum_join) {
+ dout(0) << "we should have died but "
+ << "'mon_force_quorum_join' is set -- allowing boot" << dendl;
+ } else {
+ derr << "commit suicide!" << dendl;
+ return -ENOENT;
+ }
+ }
+
+ {
+ // We have a potentially inconsistent store state in hands. Get rid of it
+ // and start fresh.
+ bool clear_store = false;
+ if (store->exists("mon_sync", "in_sync")) {
+ dout(1) << __func__ << " clean up potentially inconsistent store state"
+ << dendl;
+ clear_store = true;
+ }
+
+ if (store->get("mon_sync", "force_sync") > 0) {
+ dout(1) << __func__ << " force sync by clearing store state" << dendl;
+ clear_store = true;
+ }
+
+ if (clear_store) {
+ set<string> sync_prefixes = get_sync_targets_names();
+ store->clear(sync_prefixes);
+ }
+ }
+
+ sync_last_committed_floor = store->get("mon_sync", "last_committed_floor");
+ dout(10) << "sync_last_committed_floor " << sync_last_committed_floor << dendl;
+
+ init_paxos();
+
+ if (is_keyring_required()) {
+ // we need to bootstrap authentication keys so we can form an
+ // initial quorum.
+ if (authmon()->get_last_committed() == 0) {
+ dout(10) << "loading initial keyring to bootstrap authentication for mkfs" << dendl;
+ bufferlist bl;
+ int err = store->get("mkfs", "keyring", bl);
+ if (err == 0 && bl.length() > 0) {
+ // Attempt to decode and extract keyring only if it is found.
+ KeyRing keyring;
+ auto p = bl.cbegin();
+ decode(keyring, p);
+ extract_save_mon_key(keyring);
+ }
+ }
+
+ string keyring_loc = g_conf()->mon_data + "/keyring";
+
+ r = keyring.load(cct, keyring_loc);
+ if (r < 0) {
+ EntityName mon_name;
+ mon_name.set_type(CEPH_ENTITY_TYPE_MON);
+ EntityAuth mon_key;
+ if (key_server.get_auth(mon_name, mon_key)) {
+ dout(1) << "copying mon. key from old db to external keyring" << dendl;
+ keyring.add(mon_name, mon_key);
+ bufferlist bl;
+ keyring.encode_plaintext(bl);
+ write_default_keyring(bl);
+ } else {
+ derr << "unable to load initial keyring " << g_conf()->keyring << dendl;
+ return r;
+ }
+ }
+ }
+
+ admin_hook = new AdminHook(this);
+ AdminSocket* admin_socket = cct->get_admin_socket();
+
+ // unlock while registering to avoid mon_lock -> admin socket lock dependency.
+ l.unlock();
+ // register tell/asock commands
+ for (const auto& command : local_mon_commands) {
+ if (!command.is_tell()) {
+ continue;
+ }
+ const auto prefix = cmddesc_get_prefix(command.cmdstring);
+ if (prefix == "injectargs" ||
+ prefix == "version" ||
+ prefix == "tell") {
+ // not registered by me
+ continue;
+ }
+ r = admin_socket->register_command(command.cmdstring, admin_hook,
+ command.helpstring);
+ ceph_assert(r == 0);
+ }
+ l.lock();
+
+ // add ourselves as a conf observer
+ g_conf().add_observer(this);
+
+ messenger->set_auth_client(this);
+ messenger->set_auth_server(this);
+ mgr_messenger->set_auth_client(this);
+
+ auth_registry.refresh_config();
+
+ return 0;
+}
+
+// Start the monitor's runtime pieces (finisher, timer, cpu thread pool,
+// dispatchers, mgr client) under the monitor lock and run the first
+// bootstrap().  Always returns 0.
+int Monitor::init()
+{
+  dout(2) << "init" << dendl;
+  std::lock_guard locker(lock);
+
+  finisher.start();
+
+  // arm the timer and schedule the first tick
+  timer.init();
+  new_tick();
+
+  cpu_tp.start();
+
+  // register ourselves as a dispatcher on the main messenger
+  messenger->add_dispatcher_tail(this);
+
+  // bring up the embedded mgr client and hook it to the mgr messenger
+  mgr_client.init();
+  mgr_messenger->add_dispatcher_tail(&mgr_client);
+  mgr_messenger->add_dispatcher_tail(this);  // for auth ms_* calls
+  mgrmon()->prime_mgr_client();
+
+  state = STATE_PROBING;
+
+  bootstrap();
+
+  // drop peer tracking state if the elector reports it as unclean
+  const bool tracker_clean = elector.peer_tracker_is_clean();
+  if (!tracker_clean) {
+    dout(10) << "peer_tracker looks inconsistent"
+	     << " previous bad logic, clearing ..." << dendl;
+    elector.notify_clear_peer_state();
+  }
+
+  // record our own feature bits in the session feature map
+  session_map.feature_map.add_mon(con_self->get_features());
+  return 0;
+}
+
+// Initialize paxos, then every paxos service, and finally refresh the
+// in-memory state from the store (without asking for a bootstrap hint).
+void Monitor::init_paxos()
+{
+  dout(10) << __func__ << dendl;
+  paxos->init();
+
+  // each service gets its init() call in container order
+  for (auto& service : paxos_service) {
+    service->init();
+  }
+
+  refresh_from_paxos(nullptr);
+}
+
+// Reload state from the store: the cluster fingerprint (if present and
+// decodable), then refresh() on every paxos service followed by
+// post_refresh() on every service, and finally the monitor metadata.
+//
+// need_bootstrap is handed through unchanged to each service's refresh().
+void Monitor::refresh_from_paxos(bool *need_bootstrap)
+{
+  dout(10) << __func__ << dendl;
+
+  bufferlist fingerprint_bl;
+  const int r = store->get(MONITOR_NAME, "cluster_fingerprint", fingerprint_bl);
+  if (r < 0) {
+    dout(10) << __func__ << " no cluster_fingerprint" << dendl;
+  } else {
+    try {
+      auto it = fingerprint_bl.cbegin();
+      decode(fingerprint, it);
+    }
+    catch (const ceph::buffer::error&) {
+      // a corrupt fingerprint is only logged; it does not stop the refresh
+      dout(10) << __func__ << " failed to decode cluster_fingerprint" << dendl;
+    }
+  }
+
+  // two separate passes: all refresh() calls complete before any
+  // post_refresh() runs
+  for (auto& service : paxos_service) {
+    service->refresh(need_bootstrap);
+  }
+  for (auto& service : paxos_service) {
+    service->post_refresh();
+  }
+  load_metadata();
+}
+
+// Add cluster_logger to the perf counters collection, unless it has
+// already been added.
+void Monitor::register_cluster_logger()
+{
+  if (cluster_logger_registered) {
+    dout(10) << "register_cluster_logger - already registered" << dendl;
+    return;
+  }
+
+  dout(10) << "register_cluster_logger" << dendl;
+  cluster_logger_registered = true;
+  cct->get_perfcounters_collection()->add(cluster_logger);
+}
+
+// Remove cluster_logger from the perf counters collection, if it is
+// currently registered.
+void Monitor::unregister_cluster_logger()
+{
+  if (!cluster_logger_registered) {
+    dout(10) << "unregister_cluster_logger - not registered" << dendl;
+    return;
+  }
+
+  dout(10) << "unregister_cluster_logger" << dendl;
+  cluster_logger_registered = false;
+  cct->get_perfcounters_collection()->remove(cluster_logger);
+}
+
+// Publish the monmap size and the current quorum size to cluster_logger.
+void Monitor::update_logger()
+{
+  // number of monitors known to the monmap
+  cluster_logger->set(l_cluster_num_mon, monmap->size());
+  // number of monitors currently in quorum
+  cluster_logger->set(l_cluster_num_mon_quorum, quorum.size());
+}
+
+// Stop the monitor: flush pending paxos writes, disable auth, detach
+// observers and admin hooks, stop the elector, mgr client, finisher,
+// paxos and its services, cancel quorum waiters, and finally shut down the
+// messengers and perf counters.
+//
+// The monitor lock is taken and released manually because several steps
+// below run with the lock dropped.
+void Monitor::shutdown()
+{
+  dout(1) << "shutdown" << dendl;
+
+  lock.lock();
+
+  wait_for_paxos_write();
+
+  {
+    std::lock_guard auth_locker(auth_lock);
+    authmon()->_set_mon_num_rank(0, 0);
+  }
+
+  state = STATE_SHUTDOWN;
+
+  // the config observer is removed with the monitor lock released
+  lock.unlock();
+  g_conf().remove_observer(this);
+  lock.lock();
+
+  if (admin_hook != nullptr) {
+    cct->get_admin_socket()->unregister_commands(admin_hook);
+    delete admin_hook;
+    admin_hook = nullptr;
+  }
+
+  elector.shutdown();
+
+  mgr_client.shutdown();
+
+  // the finisher is drained and stopped with the monitor lock released
+  lock.unlock();
+  finisher.wait_for_empty();
+  finisher.stop();
+  lock.lock();
+
+  // stop paxos and every service built on top of it
+  paxos->shutdown();
+  for (auto& service : paxos_service) {
+    service->shutdown();
+  }
+
+  // anybody still waiting for a quorum is told the wait was cancelled
+  finish_contexts(g_ceph_context, waitfor_quorum, -ECANCELED);
+  finish_contexts(g_ceph_context, maybe_wait_for_quorum, -ECANCELED);
+
+  timer.shutdown();
+
+  cpu_tp.stop();
+
+  remove_all_sessions();
+
+  log_client.shutdown();
+
+  // release the lock before shutting the messengers down
+  lock.unlock();
+
+  // the messengers go down before the logger leaves the perfcounter
+  // collection; otherwise _ms_dispatch() would try to update a deleted
+  // logger
+  messenger->shutdown();
+  mgr_messenger->shutdown();
+
+  if (logger != nullptr) {
+    cct->get_perfcounters_collection()->remove(logger);
+  }
+  if (cluster_logger != nullptr) {
+    if (cluster_logger_registered) {
+      cct->get_perfcounters_collection()->remove(cluster_logger);
+    }
+    delete cluster_logger;
+    cluster_logger = nullptr;
+  }
+}
+
+// If paxos reports a write in flight, flush the store.  The monitor lock
+// is released for the duration of the flush and re-taken afterwards, so
+// the caller must hold it on entry.
+void Monitor::wait_for_paxos_write()
+{
+  if (!paxos->is_writing() && !paxos->is_writing_previous()) {
+    return;  // nothing pending
+  }
+
+  dout(10) << __func__ << " flushing pending write" << dendl;
+  lock.unlock();
+  store->flush();
+  lock.lock();
+  dout(10) << __func__ << " flushed pending write" << dendl;
+}
+
+// Re-exec the monitor with its original argument vector.  Never returns:
+// either execv() replaces the process image or we abort.
+void Monitor::respawn()
+{
+  // --- WARNING TO FUTURE COPY/PASTERS ---
+  // A call like
+  //
+  //   ceph_pthread_setname(pthread_self(), "ceph-mon");
+  //
+  // must also be added to main() so that /proc/$pid/stat field 2 reads
+  // "(ceph-mon)" rather than "(exe)"; killall (and log rotation) depend
+  // on it.
+
+  dout(0) << __func__ << dendl;
+
+  // rebuild a NULL-terminated argv from the saved arguments
+  char *new_argv[orig_argc+1];
+  dout(1) << " e: '" << orig_argv[0] << "'" << dendl;
+  for (int idx = 0; idx < orig_argc; ++idx) {
+    new_argv[idx] = (char *)orig_argv[idx];
+    dout(1) << " " << idx << ": '" << orig_argv[idx] << "'" << dendl;
+  }
+  new_argv[orig_argc] = NULL;
+
+  /* Work out which executable to exec.  When /proc/self/exe is available
+   * we exec through it, which keeps working even if the binary has been
+   * unlinked since we started.
+   */
+  char exe_path[PATH_MAX] = "";
+#ifdef PROCPREFIX
+  if (readlink(PROCPREFIX "/proc/self/exe", exe_path, PATH_MAX-1) != -1) {
+    dout(1) << "respawning with exe " << exe_path << dendl;
+    strcpy(exe_path, PROCPREFIX "/proc/self/exe");
+  } else {
+#else
+  {
+#endif
+    /* Log the CWD, purely for the operator's benefit */
+    char cwd_buf[PATH_MAX];
+    char *cwd = getcwd(cwd_buf, sizeof(cwd_buf));
+    ceph_assert(cwd);
+    dout(1) << " cwd " << cwd << dendl;
+
+    /* Best effort fallback: exec argv[0] as originally given */
+    strncpy(exe_path, orig_argv[0], PATH_MAX-1);
+  }
+
+  dout(1) << " exe_path " << exe_path << dendl;
+
+  unblock_all_signals(NULL);
+  execv(exe_path, new_argv);
+
+  // only reached when execv() failed
+  dout(0) << "respawn execv " << orig_argv[0]
+	  << " failed with " << cpp_strerror(errno) << dendl;
+
+  // Callers of respawn() expect it never to return, and suicide() does
+  // return, so abort here.
+  ceph_abort();
+}
+
+// (Re)start the probing phase.  Resets sync/probe state, re-evaluates our
+// rank against the current monmap (exiting or respawning if we were
+// removed or our addresses changed), resets paxos and the services via
+// _reset(), and then either wins a standalone election or sends probes to
+// every other monitor and to any extra probe peers.
+void Monitor::bootstrap()
+{
+  dout(10) << "bootstrap" << dendl;
+  wait_for_paxos_write();
+
+  sync_reset_requester();
+  unregister_cluster_logger();
+  cancel_probe_timeout();
+
+  if (monmap->get_epoch() == 0) {
+    dout(10) << "reverting to legacy ranks for seed monmap (epoch 0)" << dendl;
+    monmap->calc_legacy_ranks();
+  }
+  dout(10) << "monmap " << *monmap << dendl;
+  {
+    // refuse to continue if we cannot upgrade from the monmap's release
+    auto monmap_release = monmap->min_mon_release;
+    ostringstream why;
+    if (!can_upgrade_from(monmap_release, "min_mon_release", why)) {
+      derr << "current monmap has " << why.str() << " stopping." << dendl;
+      exit(0);
+    }
+  }
+
+  // work out our rank in the current monmap
+  int new_rank = monmap->get_rank(messenger->get_myaddrs());
+  if (new_rank < 0 && rank >= 0) {
+    // we had a rank and lost it; if we ever joined a quorum, give up
+    if (has_ever_joined) {
+      dout(0) << " removed from monmap, suicide." << dendl;
+      exit(0);
+    }
+    elector.notify_clear_peer_state();
+  }
+  if (new_rank >= 0 &&
+      monmap->get_addrs(new_rank) != messenger->get_myaddrs()) {
+    dout(0) << " monmap addrs for rank " << new_rank << " changed, i am "
+	    << messenger->get_myaddrs()
+	    << ", monmap is " << monmap->get_addrs(new_rank) << ", respawning"
+	    << dendl;
+
+    if (monmap->get_epoch()) {
+      // keep this map under a temporary mon_sync key so that the next
+      // startup picks it up
+      derr << " stashing newest monmap " << monmap->get_epoch()
+	   << " for next startup" << dendl;
+      bufferlist monmap_bl;
+      monmap->encode(monmap_bl, -1);
+      auto txn = std::make_shared<MonitorDBStore::Transaction>();
+      txn->put("mon_sync", "temp_newer_monmap", monmap_bl);
+      store->apply_transaction(txn);
+    }
+
+    respawn();
+  }
+  if (new_rank != rank) {
+    dout(0) << " my rank is now " << new_rank << " (was " << rank << ")" << dendl;
+    messenger->set_myname(entity_name_t::MON(new_rank));
+    rank = new_rank;
+    elector.notify_rank_changed(rank);
+
+    // drop every connection; otherwise peers would take us for someone else
+    messenger->mark_down_all();
+  }
+
+  // back to probing
+  state = STATE_PROBING;
+
+  _reset();
+
+  // optional store compaction
+  if (g_conf()->mon_compact_on_bootstrap) {
+    dout(10) << "bootstrap -- triggering compaction" << dendl;
+    store->compact();
+    dout(10) << "bootstrap -- finished compaction" << dendl;
+  }
+
+  // stretch mode bits
+  set_elector_disallowed_leaders(false);
+
+  // a lone monitor at rank 0 wins without probing anybody
+  if (monmap->size() == 1 && rank == 0) {
+    win_standalone_election();
+    return;
+  }
+
+  reset_probe_timeout();
+
+  // we are outside the quorum for now
+  if (monmap->contains(name)) {
+    outside_quorum.insert(name);
+  }
+
+  // probe every other rank in the monmap ...
+  dout(10) << "probing other monitors" << dendl;
+  for (unsigned peer = 0; peer < monmap->size(); ++peer) {
+    if (static_cast<int>(peer) == rank) {
+      continue;
+    }
+    send_mon_message(
+      new MMonProbe(monmap->fsid, MMonProbe::OP_PROBE, name, has_ever_joined,
+		    ceph_release()),
+      peer);
+  }
+  // ... and every hinted peer that is not ourselves
+  for (auto& peer_addrs : extra_probe_peers) {
+    if (peer_addrs != messenger->get_myaddrs()) {
+      messenger->send_to_mon(
+	new MMonProbe(monmap->fsid, MMonProbe::OP_PROBE, name, has_ever_joined,
+		      ceph_release()),
+	peer_addrs);
+    }
+  }
+}
+
+// Handle the add_bootstrap_peer_hint / add_bootstrap_peer_hintv commands.
+//
+// Parses either an "addr" (single address, port optional) or an "addrv"
+// (address vector) argument from cmdmap and adds the result to
+// extra_probe_peers.  A human-readable status is written to ss.
+//
+// Returns true when the hint was added or was ignored because the monitor
+// is already a leader or peon; false when no usable address was supplied.
+bool Monitor::_add_bootstrap_peer_hint(std::string_view cmd,
+				       const cmdmap_t& cmdmap,
+				       ostream& ss)
+{
+  if (is_leader() || is_peon()) {
+    ss << "mon already active; ignoring bootstrap hint";
+    return true;
+  }
+
+  entity_addrvec_t addrs;
+  string addrstr;
+  if (cmd_getval(cmdmap, "addr", addrstr)) {
+    dout(10) << "_add_bootstrap_peer_hint '" << cmd << "' addr '"
+	     << addrstr << "'" << dendl;
+
+    entity_addr_t parsed;
+    if (!parsed.parse(addrstr, entity_addr_t::TYPE_ANY)) {
+      ss << "failed to parse addrs '" << addrstr
+	 << "'; syntax is 'add_bootstrap_peer_hint ip[:port]'";
+      return false;
+    }
+
+    addrs.v.push_back(parsed);
+    if (parsed.get_port() == 0) {
+      // no port given: hint both the msgr2 and the legacy default ports
+      addrs.v[0].set_type(entity_addr_t::TYPE_MSGR2);
+      addrs.v[0].set_port(CEPH_MON_PORT_IANA);
+      addrs.v.push_back(parsed);
+      addrs.v[1].set_type(entity_addr_t::TYPE_LEGACY);
+      addrs.v[1].set_port(CEPH_MON_PORT_LEGACY);
+    } else if (parsed.get_type() == entity_addr_t::TYPE_ANY) {
+      // port given but no explicit type: pick the type from the port
+      const bool legacy_port = (parsed.get_port() == CEPH_MON_PORT_LEGACY);
+      if (legacy_port) {
+	addrs.v[0].set_type(entity_addr_t::TYPE_LEGACY);
+      } else {
+	addrs.v[0].set_type(entity_addr_t::TYPE_MSGR2);
+      }
+    }
+  } else if (cmd_getval(cmdmap, "addrv", addrstr)) {
+    dout(10) << "_add_bootstrap_peer_hintv '" << cmd << "' addrv '"
+	     << addrstr << "'" << dendl;
+    const char *end = nullptr;
+    if (!addrs.parse(addrstr.c_str(), &end)) {
+      ss << "failed to parse addrs '" << addrstr
+	 << "'; syntax is 'add_bootstrap_peer_hintv v2:ip:port[,v1:ip:port]'";
+      return false;
+    }
+  } else {
+    ss << "no addr or addrv provided";
+    return false;
+  }
+
+  extra_probe_peers.insert(addrs);
+  ss << "adding peer " << addrs << " to list: " << extra_probe_peers;
+  return true;
+}
+
+// Drop all quorum-related state and restart paxos and its services.
+// Called by bootstrap(), or on leader|peon -> electing.
+void Monitor::_reset()
+{
+  dout(10) << __func__ << dendl;
+
+  // disable authentication
+  {
+    std::lock_guard auth_locker(auth_lock);
+    authmon()->_set_mon_num_rank(0, 0);
+  }
+
+  // cancel or clean up everything that is tied to the current quorum
+  cancel_probe_timeout();
+  timecheck_finish();
+  health_events_cleanup();
+  health_check_log_times.clear();
+  scrub_event_cancel();
+
+  leader_since = utime_t();
+  quorum_since = {};
+  // remember when we left, but only if we actually were in a quorum
+  if (!quorum.empty()) {
+    exited_quorum = ceph_clock_now();
+  }
+  quorum.clear();
+  outside_quorum.clear();
+  quorum_feature_map.clear();
+
+  scrub_reset();
+
+  paxos->restart();
+
+  for (auto& service : paxos_service) {
+    service->restart();
+  }
+}
+
+
+// -----------------------------------------------------------
+// sync
+
+// Collect the store prefixes that take part in a sync: the paxos prefix
+// plus whatever each paxos service adds through get_store_prefixes().
+set<string> Monitor::get_sync_targets_names()
+{
+  set<string> prefixes;
+  prefixes.insert(paxos->get_name());
+  for (auto& service : paxos_service) {
+    service->get_store_prefixes(prefixes);
+  }
+  return prefixes;
+}
+
+
+// Timer callback for a stalled sync: must only fire while synchronizing,
+// and restarts from bootstrap().
+void Monitor::sync_timeout()
+{
+  dout(10) << __func__ << dendl;
+  ceph_assert(state == STATE_SYNCHRONIZING);
+  // give up on this sync attempt and start over
+  bootstrap();
+}
+
+// Encode into bl the monmap with the highest epoch among three candidates:
+// the one held by the MonmapMonitor, the backup stored under
+// mon_sync/latest_monmap (if any), and our in-memory monmap.
+//
+// Aborts on any store read error other than -ENOENT from the
+// MonmapMonitor lookup.
+void Monitor::sync_obtain_latest_monmap(bufferlist &bl)
+{
+  dout(1) << __func__ << dendl;
+
+  MonMap latest_monmap;
+
+  // candidate 1: the MonmapMonitor's copy
+  bufferlist monmon_bl;
+  const int monmon_err = monmon()->get_monmap(monmon_bl);
+  if (monmon_err >= 0) {
+    latest_monmap.decode(monmon_bl);
+  } else if (monmon_err != -ENOENT) {
+    derr << __func__
+	 << " something wrong happened while reading the store: "
+	 << cpp_strerror(monmon_err) << dendl;
+    ceph_abort_msg("error reading the store");
+  }
+
+  // candidate 2: the last backed up monmap, if one exists
+  if (store->exists("mon_sync", "latest_monmap")) {
+    bufferlist backup_bl;
+    const int backup_err = store->get("mon_sync", "latest_monmap", backup_bl);
+    if (backup_err < 0) {
+      derr << __func__
+	   << " something wrong happened while reading the store: "
+	   << cpp_strerror(backup_err) << dendl;
+      ceph_abort_msg("error reading the store");
+    }
+    ceph_assert(backup_bl.length() > 0);
+
+    MonMap backup_monmap;
+    backup_monmap.decode(backup_bl);
+
+    if (backup_monmap.epoch > latest_monmap.epoch) {
+      latest_monmap = backup_monmap;
+    }
+  }
+
+  // candidate 3: the monmap we are currently running with
+  if (monmap->epoch > latest_monmap.epoch) {
+    latest_monmap = *monmap;
+  }
+
+  dout(1) << __func__ << " obtained monmap e" << latest_monmap.epoch << dendl;
+
+  latest_monmap.encode(bl, CEPH_FEATURES_ALL);
+}
+
+// Forget everything about a sync we were requesting: cancel the timeout
+// event and clear the provider address, cookie, full flag and start
+// version.
+void Monitor::sync_reset_requester()
+{
+  dout(10) << __func__ << dendl;
+
+  if (sync_timeout_event != nullptr) {
+    timer.cancel_event(sync_timeout_event);
+    sync_timeout_event = nullptr;
+  }
+
+  sync_provider = entity_addrvec_t();
+  sync_cookie = 0;
+  sync_full = false;
+  sync_start_version = 0;
+}
+
+// Drop every sync session we are serving as a provider.
+void Monitor::sync_reset_provider()
+{
+  dout(10) << __func__ << dendl;
+  // all outstanding provider cookies become invalid
+  sync_providers.clear();
+}
+
+// Begin synchronizing from the monitor at addrs.
+//
+// For a full sync the critical state is stashed, an in_sync marker and the
+// last-committed floor are persisted, and the sync target prefixes are
+// wiped from the store before the request goes out.  For a recent sync
+// only our current paxos version is sent along.
+void Monitor::sync_start(entity_addrvec_t &addrs, bool full)
+{
+  dout(10) << __func__ << " " << addrs << (full ? " full" : " recent") << dendl;
+
+  ceph_assert(state == STATE_PROBING ||
+	      state == STATE_SYNCHRONIZING);
+  state = STATE_SYNCHRONIZING;
+
+  // make sure we are not a provider for anyone!
+  sync_reset_provider();
+
+  sync_full = full;
+
+  if (sync_full) {
+    // stash key state, and record that a sync is in progress
+    auto txn = std::make_shared<MonitorDBStore::Transaction>();
+    sync_stash_critical_state(txn);
+    txn->put("mon_sync", "in_sync", 1);
+
+    sync_last_committed_floor = std::max(sync_last_committed_floor, paxos->get_version());
+    dout(10) << __func__ << " marking sync in progress, storing sync_last_committed_floor "
+	     << sync_last_committed_floor << dendl;
+    txn->put("mon_sync", "last_committed_floor", sync_last_committed_floor);
+
+    store->apply_transaction(txn);
+
+    ceph_assert(g_conf()->mon_sync_requester_kill_at != 1);
+
+    // wipe the prefixes that are about to be synced
+    set<string> prefixes = get_sync_targets_names();
+    dout(10) << __func__ << " clearing prefixes " << prefixes << dendl;
+    store->clear(prefixes);
+
+    // let paxos notice the reset.  without this, a bootstrap followed by
+    // a different probe reply order could decide that only a partial
+    // sync, or none at all, is needed.
+    paxos->init();
+
+    ceph_assert(g_conf()->mon_sync_requester_kill_at != 2);
+  }
+
+  // treat 'addrs' as the provider for now; this is updated once the
+  // reply to the sync start arrives.
+  sync_provider = addrs;
+
+  sync_reset_timeout();
+
+  const auto cookie_op =
+    sync_full ? MMonSync::OP_GET_COOKIE_FULL : MMonSync::OP_GET_COOKIE_RECENT;
+  MMonSync *request = new MMonSync(cookie_op);
+  if (!sync_full) {
+    request->last_committed = paxos->get_version();
+  }
+  messenger->send_to_mon(request, sync_provider);
+}
+
+// Queue into transaction t a copy of the newest monmap we can find, under
+// mon_sync/latest_monmap.
+void Monitor::sync_stash_critical_state(MonitorDBStore::TransactionRef t)
+{
+  dout(10) << __func__ << dendl;
+  bufferlist monmap_bl;
+  sync_obtain_latest_monmap(monmap_bl);
+  // an empty encoding would mean we have no monmap at all
+  ceph_assert(monmap_bl.length() > 0);
+  t->put("mon_sync", "latest_monmap", monmap_bl);
+}
+
+// (Re)arm the sync timeout: cancel any pending event and schedule
+// sync_timeout() to run after mon_sync_timeout.
+void Monitor::sync_reset_timeout()
+{
+  dout(10) << __func__ << dendl;
+  if (sync_timeout_event) {
+    timer.cancel_event(sync_timeout_event);
+  }
+  sync_timeout_event = timer.add_event_after(
+    g_conf()->mon_sync_timeout,
+    new C_MonContext{this, [this](int) { sync_timeout(); }});
+}
+
+// Complete a sync that ended at last_committed.
+//
+// After a full sync the paxos transactions from sync_start_version through
+// last_committed are prepared and applied together with the new
+// last_committed value.  In every case the mon_sync markers are erased,
+// paxos is re-initialized and we bootstrap again.
+void Monitor::sync_finish(version_t last_committed)
+{
+  dout(10) << __func__ << " lc " << last_committed << " from " << sync_provider << dendl;
+
+  ceph_assert(g_conf()->mon_sync_requester_kill_at != 7);
+
+  if (sync_full) {
+    // finalize the paxos commits
+    auto final_tx = std::make_shared<MonitorDBStore::Transaction>();
+    paxos->read_and_prepare_transactions(final_tx, sync_start_version,
+					 last_committed);
+    final_tx->put(paxos->get_name(), "last_committed", last_committed);
+
+    dout(30) << __func__ << " final tx dump:\n";
+    JSONFormatter formatter(true);
+    final_tx->dump(&formatter);
+    formatter.flush(*_dout);
+    *_dout << dendl;
+
+    store->apply_transaction(final_tx);
+  }
+
+  ceph_assert(g_conf()->mon_sync_requester_kill_at != 8);
+
+  // the sync is over: remove the markers written while it was running
+  auto cleanup_tx = std::make_shared<MonitorDBStore::Transaction>();
+  cleanup_tx->erase("mon_sync", "in_sync");
+  cleanup_tx->erase("mon_sync", "force_sync");
+  cleanup_tx->erase("mon_sync", "last_committed_floor");
+  store->apply_transaction(cleanup_tx);
+
+  ceph_assert(g_conf()->mon_sync_requester_kill_at != 9);
+
+  init_paxos();
+
+  ceph_assert(g_conf()->mon_sync_requester_kill_at != 10);
+
+  bootstrap();
+}
+
+// Dispatch an MMonSync message to the handler for its op.  Ops we do not
+// know are logged and abort the process.
+void Monitor::handle_sync(MonOpRequestRef op)
+{
+  auto m = op->get_req<MMonSync>();
+  dout(10) << __func__ << " " << *m << dendl;
+  switch (m->op) {
+
+  // --- requests we serve as a provider ---
+
+  case MMonSync::OP_GET_COOKIE_FULL:
+  case MMonSync::OP_GET_COOKIE_RECENT:
+    handle_sync_get_cookie(op);
+    break;
+
+  case MMonSync::OP_GET_CHUNK:
+    handle_sync_get_chunk(op);
+    break;
+
+  // --- replies we receive as a requester ---
+
+  case MMonSync::OP_COOKIE:
+    handle_sync_cookie(op);
+    break;
+
+  case MMonSync::OP_CHUNK:
+  case MMonSync::OP_LAST_CHUNK:
+    handle_sync_chunk(op);
+    break;
+
+  case MMonSync::OP_NO_COOKIE:
+    handle_sync_no_cookie(op);
+    break;
+
+  default:
+    dout(0) << __func__ << " unknown op " << m->op << dendl;
+    ceph_abort_msg("unknown op");
+  }
+}
+
+// leader
+
+// Answer a sync request with OP_NO_COOKIE, echoing the cookie the sender
+// used, on the connection the request came in on.
+void Monitor::_sync_reply_no_cookie(MonOpRequestRef op)
+{
+  auto m = op->get_req<MMonSync>();
+  MMonSync *refusal = new MMonSync(MMonSync::OP_NO_COOKIE, m->cookie);
+  m->get_connection()->send_message(refusal);
+}
+
+// Provider side: a peer asks for a sync cookie.
+//
+// Refused with OP_NO_COOKIE while we are synchronizing ourselves, and
+// silently ignored when the peer lacks features we require.  Otherwise a
+// new SyncProvider entry is created and its cookie is sent back together
+// with the version the sync starts from.
+void Monitor::handle_sync_get_cookie(MonOpRequestRef op)
+{
+  auto m = op->get_req<MMonSync>();
+  if (is_synchronizing()) {
+    _sync_reply_no_cookie(op);
+    return;
+  }
+
+  ceph_assert(g_conf()->mon_sync_provider_kill_at != 1);
+
+  // make sure they can understand us: any required bit the peer does not
+  // have is a reason to ignore the request.
+  const uint64_t lacking =
+    required_features & ~m->get_connection()->get_features();
+  if (lacking) {
+    dout(5) << " ignoring peer mon." << m->get_source().num()
+	    << " has features " << std::hex
+	    << m->get_connection()->get_features()
+	    << " but we require " << required_features << std::dec << dendl;
+    return;
+  }
+
+  // build a unique cookie from the election epoch (which persists across
+  // restarts for the whole cluster) and a counter local to this process
+  // instance.  it does not have to be unique *across* monitors.
+  uint64_t cookie = ((unsigned long long)elector.get_epoch() << 24) + ++sync_provider_count;
+  ceph_assert(sync_providers.count(cookie) == 0);
+
+  dout(10) << __func__ << " cookie " << cookie << " for " << m->get_source_inst() << dendl;
+
+  SyncProvider& provider = sync_providers[cookie];
+  provider.cookie = cookie;
+  provider.addrs = m->get_source_addrs();
+  provider.reset_timeout(g_ceph_context, g_conf()->mon_sync_timeout * 2);
+
+  set<string> sync_targets;
+  if (m->op != MMonSync::OP_GET_COOKIE_FULL) {
+    // recent sync: only paxos needs to catch up
+    provider.last_committed = m->last_committed;
+  } else {
+    // full sync: scan every sync target prefix
+    sync_targets = get_sync_targets_names();
+    provider.last_committed = paxos->get_version();
+    provider.synchronizer = store->get_synchronizer(provider.last_key, sync_targets);
+    provider.full = true;
+    dout(10) << __func__ << " will sync prefixes " << sync_targets << dendl;
+  }
+  dout(10) << __func__ << " will sync from version " << provider.last_committed << dendl;
+
+  MMonSync *reply = new MMonSync(MMonSync::OP_COOKIE, provider.cookie);
+  reply->last_committed = provider.last_committed;
+  m->get_connection()->send_message(reply);
+}
+
+// Provider side: a requester asks for the next chunk of a sync session.
+//
+// Unknown cookies, and sessions whose last_committed has fallen behind our
+// first committed paxos version, are answered with OP_NO_COOKIE.
+// Otherwise the reply carries paxos versions and, for full syncs, store
+// keys, bounded by mon_sync_max_payload_size and
+// mon_sync_max_payload_keys.  The final chunk is marked OP_LAST_CHUNK and
+// the session is dropped.
+void Monitor::handle_sync_get_chunk(MonOpRequestRef op)
+{
+  auto m = op->get_req<MMonSync>();
+  dout(10) << __func__ << " " << *m << dendl;
+
+  auto session = sync_providers.find(m->cookie);
+  if (session == sync_providers.end()) {
+    dout(10) << __func__ << " no cookie " << m->cookie << dendl;
+    _sync_reply_no_cookie(op);
+    return;
+  }
+
+  ceph_assert(g_conf()->mon_sync_provider_kill_at != 2);
+
+  SyncProvider& sp = session->second;
+  sp.reset_timeout(g_ceph_context, g_conf()->mon_sync_timeout * 2);
+
+  if (sp.last_committed < paxos->get_first_committed() &&
+      paxos->get_first_committed() > 1) {
+    dout(10) << __func__ << " sync requester fell behind paxos, their lc " << sp.last_committed
+	     << " < our fc " << paxos->get_first_committed() << dendl;
+    sync_providers.erase(m->cookie);
+    _sync_reply_no_cookie(op);
+    return;
+  }
+
+  MMonSync *reply = new MMonSync(MMonSync::OP_CHUNK, sp.cookie);
+  auto chunk_tx = std::make_shared<MonitorDBStore::Transaction>();
+
+  // paxos versions come first, for as long as the budget allows
+  int bytes_left = g_conf()->mon_sync_max_payload_size;
+  int keys_left = g_conf()->mon_sync_max_payload_keys;
+  while (sp.last_committed < paxos->get_version() &&
+	 bytes_left > 0 &&
+	 keys_left > 0) {
+    bufferlist version_bl;
+    sp.last_committed++;
+
+    int err = store->get(paxos->get_name(), sp.last_committed, version_bl);
+    ceph_assert(err == 0);
+
+    chunk_tx->put(paxos->get_name(), sp.last_committed, version_bl);
+    bytes_left -= version_bl.length();
+    --keys_left;
+    dout(20) << __func__ << " including paxos state " << sp.last_committed
+	     << dendl;
+  }
+  reply->last_committed = sp.last_committed;
+
+  // whatever budget remains goes to store keys on a full sync
+  if (sp.full && bytes_left > 0 && keys_left > 0) {
+    sp.synchronizer->get_chunk_tx(chunk_tx, bytes_left, keys_left);
+    sp.last_key = sp.synchronizer->get_last_key();
+    reply->last_key = sp.last_key;
+  }
+
+  const bool more_to_send =
+    (sp.full && sp.synchronizer->has_next_chunk()) ||
+    sp.last_committed < paxos->get_version();
+  if (!more_to_send) {
+    dout(10) << __func__ << " last chunk, through version " << sp.last_committed
+	     << " key " << sp.last_key << dendl;
+    reply->op = MMonSync::OP_LAST_CHUNK;
+
+    ceph_assert(g_conf()->mon_sync_provider_kill_at != 3);
+
+    // this session is finished; sp must not be used after this line
+    sync_providers.erase(sp.cookie);
+  } else {
+    dout(10) << __func__ << " chunk, through version " << sp.last_committed
+	     << " key " << sp.last_key << dendl;
+  }
+
+  encode(*chunk_tx, reply->chunk_bl);
+
+  m->get_connection()->send_message(reply);
+}
+
+// requester
+
+// Requester side: the provider answered our cookie request.  The reply is
+// ignored if we already hold a cookie or if it does not come from the
+// provider we asked; otherwise we record the cookie and start version and
+// request the first chunk.
+void Monitor::handle_sync_cookie(MonOpRequestRef op)
+{
+  auto m = op->get_req<MMonSync>();
+  dout(10) << __func__ << " " << *m << dendl;
+
+  if (sync_cookie) {
+    dout(10) << __func__ << " already have a cookie, ignoring" << dendl;
+    return;
+  }
+
+  if (m->get_source_addrs() != sync_provider) {
+    dout(10) << __func__ << " source does not match, discarding" << dendl;
+    return;
+  }
+
+  sync_cookie = m->cookie;
+  sync_start_version = m->last_committed;
+
+  sync_reset_timeout();
+  sync_get_next_chunk();
+
+  ceph_assert(g_conf()->mon_sync_requester_kill_at != 3);
+}
+
+// Requester side: ask the sync provider for the next chunk, optionally
+// sleeping first when mon_inject_sync_get_chunk_delay is positive.
+void Monitor::sync_get_next_chunk()
+{
+  dout(20) << __func__ << " cookie " << sync_cookie << " provider " << sync_provider << dendl;
+
+  if (g_conf()->mon_inject_sync_get_chunk_delay > 0) {
+    dout(20) << __func__ << " injecting delay of " << g_conf()->mon_inject_sync_get_chunk_delay << dendl;
+    // the option is multiplied by 1e6 to obtain usleep()'s microseconds
+    usleep((long long)(g_conf()->mon_inject_sync_get_chunk_delay * 1000000.0));
+  }
+
+  MMonSync *request = new MMonSync(MMonSync::OP_GET_CHUNK, sync_cookie);
+  messenger->send_to_mon(request, sync_provider);
+
+  ceph_assert(g_conf()->mon_sync_requester_kill_at != 4);
+}
+
+// Requester side: apply a chunk received from the sync provider.
+//
+// Chunks with the wrong cookie or from the wrong source are discarded.
+// The encoded transaction is applied to the store; for a recent (non-full)
+// sync the paxos transactions up to the message's last_committed are
+// applied right away as well.  OP_CHUNK requests the next chunk,
+// OP_LAST_CHUNK finishes the sync.
+void Monitor::handle_sync_chunk(MonOpRequestRef op)
+{
+  auto m = op->get_req<MMonSync>();
+  dout(10) << __func__ << " " << *m << dendl;
+
+  if (m->cookie != sync_cookie) {
+    dout(10) << __func__ << " cookie does not match, discarding" << dendl;
+    return;
+  }
+  if (m->get_source_addrs() != sync_provider) {
+    dout(10) << __func__ << " source does not match, discarding" << dendl;
+    return;
+  }
+
+  ceph_assert(state == STATE_SYNCHRONIZING);
+  ceph_assert(g_conf()->mon_sync_requester_kill_at != 5);
+
+  auto chunk_tx = std::make_shared<MonitorDBStore::Transaction>();
+  chunk_tx->append_from_encoded(m->chunk_bl);
+
+  dout(30) << __func__ << " tx dump:\n";
+  JSONFormatter chunk_fmt(true);
+  chunk_tx->dump(&chunk_fmt);
+  chunk_fmt.flush(*_dout);
+  *_dout << dendl;
+
+  store->apply_transaction(chunk_tx);
+
+  ceph_assert(g_conf()->mon_sync_requester_kill_at != 6);
+
+  if (!sync_full) {
+    dout(10) << __func__ << " applying recent paxos transactions as we go" << dendl;
+    auto paxos_tx = std::make_shared<MonitorDBStore::Transaction>();
+    paxos->read_and_prepare_transactions(paxos_tx, paxos->get_version() + 1,
+					 m->last_committed);
+    paxos_tx->put(paxos->get_name(), "last_committed", m->last_committed);
+
+    dout(30) << __func__ << " tx dump:\n";
+    JSONFormatter paxos_fmt(true);
+    paxos_tx->dump(&paxos_fmt);
+    paxos_fmt.flush(*_dout);
+    *_dout << dendl;
+
+    store->apply_transaction(paxos_tx);
+    paxos->init();  // to refresh what we just wrote
+  }
+
+  if (m->op == MMonSync::OP_CHUNK) {
+    // more to come
+    sync_reset_timeout();
+    sync_get_next_chunk();
+  } else if (m->op == MMonSync::OP_LAST_CHUNK) {
+    // that was the final chunk
+    sync_finish(m->last_committed);
+  }
+}
+
+// Requester side: the provider refused us (OP_NO_COOKIE); start over from
+// bootstrap().  The message content is not inspected.
+void Monitor::handle_sync_no_cookie(MonOpRequestRef op)
+{
+  dout(10) << __func__ << dendl;
+  // restart probing from scratch
+  bootstrap();
+}
+
+// Expire provider-side sync sessions whose timeout lies in the past.
+void Monitor::sync_trim_providers()
+{
+  dout(20) << __func__ << dendl;
+
+  const utime_t now = ceph_clock_now();
+  for (auto it = sync_providers.begin(); it != sync_providers.end(); ) {
+    if (now > it->second.timeout) {
+      dout(10) << __func__ << " expiring cookie " << it->second.cookie
+	       << " for " << it->second.addrs << dendl;
+      // erase() hands back the iterator following the removed entry
+      it = sync_providers.erase(it);
+    } else {
+      ++it;
+    }
+  }
+}
+
+// ---------------------------------------------------
+// probe
+
+// Cancel the pending probe timeout event, if there is one.
+void Monitor::cancel_probe_timeout()
+{
+  if (!probe_timeout_event) {
+    dout(10) << "cancel_probe_timeout (none scheduled)" << dendl;
+    return;
+  }
+
+  dout(10) << "cancel_probe_timeout " << probe_timeout_event << dendl;
+  timer.cancel_event(probe_timeout_event);
+  probe_timeout_event = nullptr;
+}
+
+// Replace any pending probe timeout with a fresh one that calls
+// probe_timeout() after mon_probe_timeout seconds.  If the timer does not
+// accept the event, no timeout is left pending.
+void Monitor::reset_probe_timeout()
+{
+  cancel_probe_timeout();
+  probe_timeout_event = new C_MonContext{this, [this](int r) {
+      probe_timeout(r);
+    }};
+  const double delay = g_conf()->mon_probe_timeout;
+  if (!timer.add_event_after(delay, probe_timeout_event)) {
+    // the timer did not schedule the event; forget the pointer
+    probe_timeout_event = nullptr;
+    return;
+  }
+  dout(10) << "reset_probe_timeout " << probe_timeout_event
+	   << " after " << delay << " seconds" << dendl;
+}
+
+// Timer callback: probing (or synchronizing) took too long, so clear the
+// event pointer and bootstrap again.  The r argument is unused.
+void Monitor::probe_timeout(int r)
+{
+  dout(4) << "probe_timeout " << probe_timeout_event << dendl;
+  ceph_assert(is_probing() || is_synchronizing());
+  ceph_assert(probe_timeout_event);
+  // the event has fired, so the stored pointer is cleared
+  probe_timeout_event = nullptr;
+  bootstrap();
+}
+
+// Entry point for MMonProbe messages.  Messages whose fsid differs from
+// our monmap's are ignored; the rest are dispatched by op.
+void Monitor::handle_probe(MonOpRequestRef op)
+{
+  auto m = op->get_req<MMonProbe>();
+  dout(10) << "handle_probe " << *m << dendl;
+
+  const bool same_cluster = (m->fsid == monmap->fsid);
+  if (!same_cluster) {
+    dout(0) << "handle_probe ignoring fsid " << m->fsid << " != " << monmap->fsid << dendl;
+    return;
+  }
+
+  switch (m->op) {
+  case MMonProbe::OP_PROBE:
+    handle_probe_probe(op);
+    break;
+
+  case MMonProbe::OP_REPLY:
+    handle_probe_reply(op);
+    break;
+
+  case MMonProbe::OP_MISSING_FEATURES:
+    // the peer turned us down; all we do is report what it told us
+    derr << __func__ << " require release " << (int)m->mon_release << " > "
+	 << (int)ceph_release()
+	 << ", or missing features (have " << CEPH_FEATURES_ALL
+	 << ", required " << m->required_features
+	 << ", missing " << (m->required_features & ~CEPH_FEATURES_ALL) << ")"
+	 << dendl;
+    break;
+  }
+}
+
+// Answer an OP_PROBE from another monitor.
+//
+// - If the peer's release is older than the monmap's min_mon_release, or
+//   its connection lacks features we require, reply with
+//   OP_MISSING_FEATURES carrying our required feature set.
+// - If we are neither probing nor synchronizing and the peer's first
+//   committed paxos version is beyond what we hold, re-bootstrap instead
+//   of replying.
+// - Otherwise reply with OP_REPLY (name, quorum, leader, encoded monmap and
+//   paxos version range), then either remember the sender as an extra
+//   probe peer (when it is not in our monmap) or start pinging it.
+void Monitor::handle_probe_probe(MonOpRequestRef op)
+{
+  auto m = op->get_req<MMonProbe>();
+
+  dout(10) << "handle_probe_probe " << m->get_source_inst() << " " << *m
+	   << " features " << m->get_connection()->get_features() << dendl;
+  uint64_t missing = required_features & ~m->get_connection()->get_features();
+  if ((m->mon_release != ceph_release_t::unknown &&
+       m->mon_release < monmap->min_mon_release) ||
+      missing) {
+    dout(1) << " peer " << m->get_source_addr()
+	    << " release " << m->mon_release
+	    << " < min_mon_release " << monmap->min_mon_release
+	    << ", or missing features " << missing << dendl;
+    MMonProbe *r = new MMonProbe(monmap->fsid, MMonProbe::OP_MISSING_FEATURES,
+				 name, has_ever_joined, monmap->min_mon_release);
+    // The required feature set belongs on the reply: the peer's
+    // OP_MISSING_FEATURES handler reads required_features from the message
+    // it receives.  (This used to be written to the incoming request.)
+    r->required_features = required_features;
+    m->get_connection()->send_message(r);
+    return;
+  }
+
+  if (!is_probing() && !is_synchronizing()) {
+    // If the probing mon is way ahead of us, we need to re-bootstrap.
+    // Normally we capture this case when we initially bootstrap, but
+    // it is possible we pass those checks (we overlap with
+    // quorum-to-be) but fail to join a quorum before it moves past
+    // us.  We need to be kicked back to bootstrap so we can
+    // synchronize, not keep calling elections.
+    if (paxos->get_version() + 1 < m->paxos_first_version) {
+      dout(1) << " peer " << m->get_source_addr() << " has first_committed "
+	      << "ahead of us, re-bootstrapping" << dendl;
+      bootstrap();
+      return;
+    }
+  }
+
+  MMonProbe *r = new MMonProbe(monmap->fsid, MMonProbe::OP_REPLY, name,
+			       has_ever_joined, ceph_release());
+  r->name = name;
+  r->quorum = quorum;
+  r->leader = leader;
+  monmap->encode(r->monmap_bl, m->get_connection()->get_features());
+  r->paxos_first_version = paxos->get_first_committed();
+  r->paxos_last_version = paxos->get_version();
+  m->get_connection()->send_message(r);
+
+  // did we discover a peer here?
+  if (!monmap->contains(m->get_source_addr())) {
+    dout(1) << " adding peer " << m->get_source_addrs()
+	    << " to list of hints" << dendl;
+    extra_probe_peers.insert(m->get_source_addrs());
+  } else {
+    elector.begin_peer_ping(monmap->get_rank(m->get_source_addr()));
+  }
+}
+
+ // Process an MMonProbe reply received from a peer monitor.
+ //
+ // Replies are only acted upon while we are probing or electing.  In order,
+ // this function:
+ //  - adopts the peer's monmap (and re-bootstraps) when it differs from ours
+ //    and is either newer, or the peer has joined a quorum and we never have;
+ //  - learns peer names/addresses while our own monmap is still at epoch 0;
+ //  - while probing only: starts a store sync if the peer's paxos versions
+ //    are too far ahead of ours;
+ //  - if the peer reports an existing quorum, either calls an election or
+ //    sends an MMonJoin to the reported leader; otherwise records the peer in
+ //    outside_quorum and calls an election once enough monitors are known.
+ //
+ // @param op  request wrapping the received MMonProbe message
+ void Monitor::handle_probe_reply(MonOpRequestRef op)
+ {
+   auto m = op->get_req<MMonProbe>();
+   dout(10) << "handle_probe_reply " << m->get_source_inst()
+            << " " << *m << dendl;
+   dout(10) << " monmap is " << *monmap << dendl;
+
+   // discover name and addrs during probing or electing states.
+   if (!is_probing() && !is_electing()) {
+     return;
+   }
+
+   // newer map, or they've joined a quorum and we haven't?
+   bufferlist mybl;
+   monmap->encode(mybl, m->get_connection()->get_features());
+   // make sure it's actually different; the checks below err toward
+   // taking the other guy's map, which could cause us to loop.
+   if (!mybl.contents_equal(m->monmap_bl)) {
+     // Scratch copy of the peer's map.  It lives on the stack so that it is
+     // released on every exit path (including an exception escaping from
+     // decode(), should it throw) without paired delete calls.
+     MonMap newmap;
+     newmap.decode(m->monmap_bl);
+     if (m->has_ever_joined && (newmap.get_epoch() > monmap->get_epoch() ||
+                                !has_ever_joined)) {
+       dout(10) << " got newer/committed monmap epoch " << newmap.get_epoch()
+                << ", mine was " << monmap->get_epoch() << dendl;
+       // computed before our own monmap is overwritten below
+       int epoch_diff = newmap.get_epoch() - monmap->get_epoch();
+       dout(20) << " new monmap is " << newmap << dendl;
+       monmap->decode(m->monmap_bl);
+       dout(20) << "has_ever_joined: " << has_ever_joined << dendl;
+       if (epoch_diff == 1 && has_ever_joined) {
+         notify_new_monmap(false);
+       } else {
+         notify_new_monmap(false, false);
+         elector.notify_clear_peer_state();
+       }
+       bootstrap();
+       return;
+     }
+   }
+
+   // rename peer?
+   string peer_name = monmap->get_name(m->get_source_addr());
+   if (monmap->get_epoch() == 0 && peer_name.compare(0, 7, "noname-") == 0) {
+     dout(10) << " renaming peer " << m->get_source_addr() << " "
+              << peer_name << " -> " << m->name << " in my monmap"
+              << dendl;
+     monmap->rename(peer_name, m->name);
+
+     if (is_electing()) {
+       bootstrap();
+       return;
+     }
+   } else if (peer_name.size()) {
+     dout(10) << " peer name is " << peer_name << dendl;
+   } else {
+     dout(10) << " peer " << m->get_source_addr() << " not in map" << dendl;
+   }
+
+   // new initial peer?
+   if (monmap->get_epoch() == 0 &&
+       monmap->contains(m->name) &&
+       monmap->get_addrs(m->name).front().is_blank_ip()) {
+     dout(1) << " learned initial mon " << m->name
+             << " addrs " << m->get_source_addrs() << dendl;
+     monmap->set_addrvec(m->name, m->get_source_addrs());
+
+     bootstrap();
+     return;
+   }
+
+   // end discover phase
+   if (!is_probing()) {
+     return;
+   }
+
+   ceph_assert(paxos != NULL);
+
+   if (is_synchronizing()) {
+     dout(10) << " currently syncing" << dendl;
+     return;
+   }
+
+   entity_addrvec_t other = m->get_source_addrs();
+
+   if (m->paxos_last_version < sync_last_committed_floor) {
+     dout(10) << " peer paxos versions [" << m->paxos_first_version
+              << "," << m->paxos_last_version << "] < my sync_last_committed_floor "
+              << sync_last_committed_floor << ", ignoring"
+              << dendl;
+   } else {
+     if (paxos->get_version() < m->paxos_first_version &&
+         m->paxos_first_version > 1) { // no need to sync if we're 0 and they start at 1.
+       dout(10) << " peer paxos first versions [" << m->paxos_first_version
+                << "," << m->paxos_last_version << "]"
+                << " vs my version " << paxos->get_version()
+                << " (too far ahead)"
+                << dendl;
+       cancel_probe_timeout();
+       sync_start(other, true);
+       return;
+     }
+     if (paxos->get_version() + g_conf()->paxos_max_join_drift < m->paxos_last_version) {
+       dout(10) << " peer paxos last version " << m->paxos_last_version
+                << " vs my version " << paxos->get_version()
+                << " (too far ahead)"
+                << dendl;
+       cancel_probe_timeout();
+       sync_start(other, false);
+       return;
+     }
+   }
+
+   // did the existing cluster complete upgrade to luminous?
+   if (osdmon()->osdmap.get_epoch()) {
+     if (osdmon()->osdmap.require_osd_release < ceph_release_t::luminous) {
+       derr << __func__ << " existing cluster has not completed upgrade to"
+            << " luminous; 'ceph osd require_osd_release luminous' before"
+            << " upgrading" << dendl;
+       exit(0);
+     }
+     if (!osdmon()->osdmap.test_flag(CEPH_OSDMAP_PURGED_SNAPDIRS) ||
+         !osdmon()->osdmap.test_flag(CEPH_OSDMAP_RECOVERY_DELETES)) {
+       derr << __func__ << " existing cluster has not completed a full luminous"
+            << " scrub to purge legacy snapdir objects; please scrub before"
+            << " upgrading beyond luminous." << dendl;
+       exit(0);
+     }
+   }
+
+   // is there an existing quorum?
+   if (m->quorum.size()) {
+     dout(10) << " existing quorum " << m->quorum << dendl;
+
+     dout(10) << " peer paxos version " << m->paxos_last_version
+              << " vs my version " << paxos->get_version()
+              << " (ok)"
+              << dendl;
+     // look ourselves up by name; map_crush_loc stays null when we are absent
+     bool in_map = false;
+     const auto my_info = monmap->mon_info.find(name);
+     const map<string,string> *map_crush_loc{nullptr};
+     if (my_info != monmap->mon_info.end()) {
+       in_map = true;
+       map_crush_loc = &my_info->second.crush_loc;
+     }
+     if (in_map &&
+         !monmap->get_addrs(name).front().is_blank_ip() &&
+         (!need_set_crush_loc || (*map_crush_loc == crush_loc))) {
+       // i'm part of the cluster; just initiate a new election
+       start_election();
+     } else {
+       dout(10) << " ready to join, but i'm not in the monmap/"
+         "my addr is blank/location is wrong, trying to join" << dendl;
+       send_mon_message(new MMonJoin(monmap->fsid, name,
+                                     messenger->get_myaddrs(), crush_loc,
+                                     need_set_crush_loc),
+                        m->leader);
+     }
+   } else {
+     if (monmap->contains(m->name)) {
+       dout(10) << " mon." << m->name << " is outside the quorum" << dendl;
+       outside_quorum.insert(m->name);
+     } else {
+       dout(10) << " mostly ignoring mon." << m->name << ", not part of monmap" << dendl;
+       return;
+     }
+
+     unsigned need = monmap->min_quorum_size();
+     dout(10) << " outside_quorum now " << outside_quorum << ", need " << need << dendl;
+     if (outside_quorum.size() >= need) {
+       if (outside_quorum.count(name)) {
+         dout(10) << " that's enough to form a new quorum, calling election" << dendl;
+         start_election();
+       } else {
+         dout(10) << " that's enough to form a new quorum, but it does not include me; waiting" << dendl;
+       }
+     } else {
+       dout(10) << " that's not yet enough for a new quorum, waiting" << dendl;
+     }
+   }
+ }
+
+ // Move this monitor into STATE_ELECTING and count the election, without
+ // asking the elector to call one (compare start_election()).
+ void Monitor::join_election()
+ {
+   dout(10) << __func__ << dendl;
+
+   // let any in-flight paxos write finish, then drop per-quorum state
+   wait_for_paxos_write();
+   _reset();
+
+   state = STATE_ELECTING;
+   logger->inc(l_mon_num_elections);
+ }
+
+ // Move this monitor into STATE_ELECTING, bump the election counters,
+ // announce it on the cluster log and ask the elector to call an election.
+ void Monitor::start_election()
+ {
+   dout(10) << "start_election" << dendl;
+
+   // let any in-flight paxos write finish, then drop per-quorum state
+   wait_for_paxos_write();
+   _reset();
+
+   state = STATE_ELECTING;
+   logger->inc(l_mon_num_elections);
+   logger->inc(l_mon_election_call);
+
+   clog->info() << "mon." << name << " calling monitor election";
+   elector.call_election();
+ }
+
+ // Declare victory in an election where we are the only participant:
+ // the quorum consists solely of our own rank, which must be 0.
+ void Monitor::win_standalone_election()
+ {
+   dout(1) << "win_standalone_election" << dendl;
+
+   // The previous epoch may have included other monitors; bumping the
+   // election epoch lets us tell the two situations apart.
+   elector.declare_standalone_victory();
+
+   rank = monmap->get_rank(name);
+   ceph_assert(rank == 0);
+
+   // our own metadata is the only entry, stored under rank 0
+   map<int,Metadata> solo_metadata;
+   collect_metadata(&solo_metadata[0]);
+
+   set<int> solo_quorum;
+   solo_quorum.insert(rank);
+
+   win_election(elector.get_epoch(),
+                solo_quorum,
+                CEPH_FEATURES_ALL,
+                ceph::features::mon::get_supported(),
+                ceph_release(),
+                solo_metadata);
+ }
+
+ // Return the stored leader_since timestamp.
+ // Asserts that this monitor is currently in STATE_LEADER.
+ const utime_t& Monitor::get_leader_since() const
+ {
+   // only meaningful while we are the leader
+   ceph_assert(state == STATE_LEADER);
+
+   return leader_since;
+ }
+
+ // Return the election epoch as reported by the elector.
+ epoch_t Monitor::get_epoch()
+ {
+   const epoch_t election_epoch = elector.get_epoch();
+   return election_epoch;
+ }
+
+ // Tell every paxos service that the election has finished.
+ // Must be called as leader or peon.  When we are the leader, monmon() is
+ // skipped because win_election() already notified it.
+ void Monitor::_finish_svc_election()
+ {
+   ceph_assert(state == STATE_LEADER || state == STATE_PEON);
+
+   for (auto& service : paxos_service) {
+     // election_finished() was already called on monmon() when leading;
+     // avoid calling it twice
+     if (state != STATE_LEADER || service.get() != monmon()) {
+       service->election_finished();
+     }
+   }
+ }
+
+ // Become leader of a newly elected quorum.
+ //
+ // Records the quorum state, initialises paxos as leader, notifies the
+ // paxos services (monmap service first), queues the merged monitor
+ // metadata into the pending paxos transaction and finishes the election.
+ // For multi-monitor maps past epoch 0 it also starts time checks, health
+ // ticks and scrubbing.
+ //
+ // @param epoch            election epoch (only logged here)
+ // @param active           ranks forming the quorum
+ // @param features         connection features shared by the quorum
+ // @param mon_features     monitor features shared by the quorum
+ // @param min_mon_release  lowest release among quorum members
+ // @param metadata         per-rank metadata gathered during the election
+ void Monitor::win_election(epoch_t epoch, const set<int>& active, uint64_t features,
+                            const mon_feature_t& mon_features,
+                            ceph_release_t min_mon_release,
+                            const map<int,Metadata>& metadata)
+ {
+   dout(10) << __func__ << " epoch " << epoch << " quorum " << active
+            << " features " << features
+            << " mon_features " << mon_features
+            << " min_mon_release " << min_mon_release
+            << dendl;
+   ceph_assert(is_electing());
+
+   // record the outcome of the election
+   state = STATE_LEADER;
+   leader_since = ceph_clock_now();
+   quorum_since = mono_clock::now();
+   leader = rank;
+   quorum = active;
+   quorum_con_features = features;
+   quorum_mon_features = mon_features;
+   quorum_min_mon_release = min_mon_release;
+   pending_metadata = metadata;
+   outside_quorum.clear();
+
+   clog->info() << "mon." << name << " is new leader, mons " << get_quorum_names()
+                << " in quorum (ranks " << quorum << ")";
+
+   set_leader_commands(get_local_commands(mon_features));
+
+   paxos->leader_init();
+   // NOTE: the monmap monitor must hear about the election first.  In the
+   // bootstrap case this ensures the very first paxos proposal codifies
+   // the monmap; otherwise monitors could call elections or take part in a
+   // paxos round without agreeing on who the participants are.
+   monmon()->election_finished();
+   _finish_svc_election();
+
+   logger->inc(l_mon_election_win);
+
+   // inject new metadata in first transaction.
+   {
+     // Start from the freshly collected metadata and fill in the previously
+     // known metadata for mons that are not part of the current quorum.
+     map<int,Metadata> merged = metadata;
+     for (unsigned r = 0; r < monmap->size(); ++r) {
+       if (merged.count(r) != 0 || !mon_metadata.count(r)) {
+         continue;
+       }
+       merged[r] = mon_metadata[r];
+     }
+
+     // FIXME: This is a bit sloppy because we aren't guaranteed to submit
+     // a new transaction immediately after the election finishes.  We should
+     // do that anyway for other reasons, though.
+     bufferlist encoded;
+     encode(merged, encoded);
+     MonitorDBStore::TransactionRef pending = paxos->get_pending_transaction();
+     pending->put(MONITOR_STORE_PREFIX, "last_metadata", encoded);
+   }
+
+   finish_election();
+
+   // nothing further to start for a single-mon map or an epoch-0 map
+   if (monmap->size() <= 1 || monmap->get_epoch() <= 0) {
+     return;
+   }
+
+   timecheck_start();
+   health_tick_start();
+
+   // Freshen the health status before doing health_to_clog in case
+   // our just-completed election changed the health
+   healthmon()->wait_for_active_ctx(new LambdaContext([this](int r){
+     dout(20) << "healthmon now active" << dendl;
+     healthmon()->tick();
+     if (!healthmon()->is_proposing()) {
+       do_health_to_clog_interval();
+       return;
+     }
+     // a proposal is in flight: log only once it has finished
+     dout(20) << __func__ << " healthmon proposing, waiting" << dendl;
+     healthmon()->wait_for_finished_proposal(nullptr, new C_MonContext{this,
+       [this](int r){
+         ceph_assert(ceph_mutex_is_locked_by_me(lock));
+         do_health_to_clog_interval();
+       }});
+   }));
+
+   scrub_event_start();
+ }
+
+ // Become a peon of a newly elected quorum.
+ //
+ // @param epoch            election epoch (only logged here)
+ // @param q                ranks forming the quorum
+ // @param l                rank of the elected leader
+ // @param features         connection features shared by the quorum
+ // @param mon_features     monitor features shared by the quorum
+ // @param min_mon_release  lowest release among quorum members
+ void Monitor::lose_election(epoch_t epoch, set<int> &q, int l,
+                             uint64_t features,
+                             const mon_feature_t& mon_features,
+                             ceph_release_t min_mon_release)
+ {
+   // record the outcome of the election
+   state = STATE_PEON;
+   leader_since = utime_t();   // reset: we are not the leader
+   quorum_since = mono_clock::now();
+   leader = l;
+   quorum = q;
+   outside_quorum.clear();
+   quorum_con_features = features;
+   quorum_mon_features = mon_features;
+   quorum_min_mon_release = min_mon_release;
+
+   dout(10) << "lose_election, epoch " << epoch
+            << " leader is mon" << leader
+            << " quorum is " << quorum
+            << " features are " << quorum_con_features
+            << " mon_features are " << quorum_mon_features
+            << " min_mon_release " << min_mon_release
+            << dendl;
+
+   paxos->peon_init();
+   _finish_svc_election();
+
+   logger->inc(l_mon_election_lose);
+
+   finish_election();
+ }
+
+ namespace {
+ // Build a ", "-separated list of the names found in
+ // Compressor::compression_algorithms, in iteration order.
+ std::string collect_compression_algorithms()
+ {
+   ostringstream joined;
+   bool first = true;
+   for (const auto& [algo_name, algo_id] : Compressor::compression_algorithms) {
+     std::ignore = algo_id;   // only the name is reported
+     if (!first) {
+       joined << ", ";
+     }
+     first = false;
+     joined << algo_name;
+   }
+   return joined.str();
+ }
+ }
+
+ // Fill *m with this monitor's metadata: system info, our addresses, the
+ // available compression algorithms and metadata of the devices backing
+ // the store.  Device metadata errors are logged, not returned.
+ //
+ // @param m  metadata map to populate
+ void Monitor::collect_metadata(Metadata *m)
+ {
+   collect_sys_info(m, g_ceph_context);
+   (*m)["addrs"] = stringify(messenger->get_myaddrs());
+   (*m)["compression_algorithms"] = collect_compression_algorithms();
+
+   // infer storage device
+   set<string> raw_devices;
+   string store_dev = store->get_devname();
+   get_raw_devices(store_dev, &raw_devices);
+
+   map<string,string> device_errors;
+   get_device_metadata(raw_devices, m, &device_errors);
+   for (auto& [what, why] : device_errors) {
+     dout(1) << __func__ << " " << what << ": " << why << dendl;
+   }
+ }
+
+ // Common tail of win_election() and lose_election().
+ //
+ // Applies quorum/monmap features to the compatset, finishes any time
+ // check, wakes contexts waiting for a quorum, resends routed requests,
+ // refreshes the loggers and hands the monmap size and our rank to the
+ // auth monitor.  If the monmap lists us under a different name, or a
+ // crush location must be set and the recorded one differs, an MMonJoin is
+ // sent to the leader and stretch-mode election work is skipped.
+ void Monitor::finish_election()
+ {
+   apply_quorum_to_compatset_features();
+   apply_monmap_to_compatset_features();
+   timecheck_finish();
+   exited_quorum = utime_t();
+   finish_contexts(g_ceph_context, waitfor_quorum);
+   finish_contexts(g_ceph_context, maybe_wait_for_quorum);
+   resend_routed_requests();
+   update_logger();
+   register_cluster_logger();
+
+   // enable authentication
+   {
+     std::lock_guard l(auth_lock);
+     authmon()->_set_mon_num_rank(monmap->size(), rank);
+   }
+
+   // am i named and located properly?
+   string cur_name = monmap->get_name(messenger->get_myaddrs());
+   const auto my_infop = monmap->mon_info.find(cur_name);
+   // If the monmap has no entry under cur_name (presumably when our
+   // addresses are not in the map and get_name() comes back empty -- TODO
+   // confirm), fall back to an empty location instead of dereferencing the
+   // end iterator.
+   const map<string,string> no_crush_loc;
+   const map<string,string> *map_crush_loc = &no_crush_loc;
+   if (my_infop != monmap->mon_info.end()) {
+     map_crush_loc = &my_infop->second.crush_loc;
+   }
+
+   if (cur_name != name ||
+       (need_set_crush_loc && *map_crush_loc != crush_loc)) {
+     dout(10) << " renaming/moving myself from " << cur_name << "/"
+              << *map_crush_loc <<" -> " << name << "/" << crush_loc << dendl;
+     send_mon_message(new MMonJoin(monmap->fsid, name, messenger->get_myaddrs(),
+                                   crush_loc, need_set_crush_loc),
+                      leader);
+     return;
+   }
+   do_stretch_mode_election_work();
+ }
+
+ // Adopt new_features as our compatset if it differs from the current one:
+ // the new set is written to the store and the quorum requirements are
+ // recalculated.  Does nothing when the two sets compare equal.
+ //
+ // @param new_features  candidate compatset
+ void Monitor::_apply_compatset_features(CompatSet &new_features)
+ {
+   if (new_features.compare(features) == 0) {
+     return;   // nothing changed
+   }
+
+   CompatSet added = features.unsupported(new_features);
+   dout(1) << __func__ << " enabling new quorum features: " << added << dendl;
+   features = new_features;
+
+   // persist the updated compatset
+   auto txn = std::make_shared<MonitorDBStore::Transaction>();
+   write_features(txn);
+   store->apply_transaction(txn);
+
+   calc_quorum_requirements();
+ }
+
+ // Extend a copy of the current compatset with the incompat features
+ // implied by the quorum's connection features and apply the result.
+ // The erasure-code features are added unconditionally; OSDMAP_ENC only
+ // when quorum_con_features carries CEPH_FEATURE_OSDMAP_ENC.
+ void Monitor::apply_quorum_to_compatset_features()
+ {
+   CompatSet wanted(features);
+   auto& incompat = wanted.incompat;
+
+   incompat.insert(CEPH_MON_FEATURE_INCOMPAT_OSD_ERASURE_CODES);
+   if (quorum_con_features & CEPH_FEATURE_OSDMAP_ENC) {
+     incompat.insert(CEPH_MON_FEATURE_INCOMPAT_OSDMAP_ENC);
+   }
+   incompat.insert(CEPH_MON_FEATURE_INCOMPAT_ERASURE_CODE_PLUGINS_V2);
+   incompat.insert(CEPH_MON_FEATURE_INCOMPAT_ERASURE_CODE_PLUGINS_V3);
+
+   dout(5) << __func__ << dendl;
+   _apply_compatset_features(wanted);
+ }
+
+ // Extend a copy of the current compatset with one incompat feature per
+ // release feature required by the monmap (kraken through squid) and apply
+ // the result.  For each such feature we assert that it is a persistent
+ // monitor feature and that the quorum's connection features support it.
+ void Monitor::apply_monmap_to_compatset_features()
+ {
+   const mon_feature_t required = monmap->get_required_features();
+   CompatSet wanted(features);
+
+   /* Persistent monmap features may go into the compatset; optional
+    * monmap features may not.  Optional features can be set/unset by the
+    * admin (and possibly by other means that haven't yet been thought
+    * out), so the monitor cannot enforce them on start - they may go away.
+    * This does not rule out setting a compatset feature for an optional
+    * feature, as long as it is cleaned up once the feature is unset.
+    */
+   if (required.contains_all(ceph::features::mon::FEATURE_KRAKEN)) {
+     ceph_assert(ceph::features::mon::get_persistent().contains_all(
+                   ceph::features::mon::FEATURE_KRAKEN));
+     // this feature should only ever be set if the quorum supports it.
+     ceph_assert(HAVE_FEATURE(quorum_con_features, SERVER_KRAKEN));
+     wanted.incompat.insert(CEPH_MON_FEATURE_INCOMPAT_KRAKEN);
+   }
+   if (required.contains_all(ceph::features::mon::FEATURE_LUMINOUS)) {
+     ceph_assert(ceph::features::mon::get_persistent().contains_all(
+                   ceph::features::mon::FEATURE_LUMINOUS));
+     // this feature should only ever be set if the quorum supports it.
+     ceph_assert(HAVE_FEATURE(quorum_con_features, SERVER_LUMINOUS));
+     wanted.incompat.insert(CEPH_MON_FEATURE_INCOMPAT_LUMINOUS);
+   }
+   if (required.contains_all(ceph::features::mon::FEATURE_MIMIC)) {
+     ceph_assert(ceph::features::mon::get_persistent().contains_all(
+                   ceph::features::mon::FEATURE_MIMIC));
+     // this feature should only ever be set if the quorum supports it.
+     ceph_assert(HAVE_FEATURE(quorum_con_features, SERVER_MIMIC));
+     wanted.incompat.insert(CEPH_MON_FEATURE_INCOMPAT_MIMIC);
+   }
+   if (required.contains_all(ceph::features::mon::FEATURE_NAUTILUS)) {
+     ceph_assert(ceph::features::mon::get_persistent().contains_all(
+                   ceph::features::mon::FEATURE_NAUTILUS));
+     // this feature should only ever be set if the quorum supports it.
+     ceph_assert(HAVE_FEATURE(quorum_con_features, SERVER_NAUTILUS));
+     wanted.incompat.insert(CEPH_MON_FEATURE_INCOMPAT_NAUTILUS);
+   }
+   if (required.contains_all(ceph::features::mon::FEATURE_OCTOPUS)) {
+     ceph_assert(ceph::features::mon::get_persistent().contains_all(
+                   ceph::features::mon::FEATURE_OCTOPUS));
+     // this feature should only ever be set if the quorum supports it.
+     ceph_assert(HAVE_FEATURE(quorum_con_features, SERVER_OCTOPUS));
+     wanted.incompat.insert(CEPH_MON_FEATURE_INCOMPAT_OCTOPUS);
+   }
+   if (required.contains_all(ceph::features::mon::FEATURE_PACIFIC)) {
+     ceph_assert(ceph::features::mon::get_persistent().contains_all(
+                   ceph::features::mon::FEATURE_PACIFIC));
+     // this feature should only ever be set if the quorum supports it.
+     ceph_assert(HAVE_FEATURE(quorum_con_features, SERVER_PACIFIC));
+     wanted.incompat.insert(CEPH_MON_FEATURE_INCOMPAT_PACIFIC);
+   }
+   if (required.contains_all(ceph::features::mon::FEATURE_QUINCY)) {
+     ceph_assert(ceph::features::mon::get_persistent().contains_all(
+                   ceph::features::mon::FEATURE_QUINCY));
+     // this feature should only ever be set if the quorum supports it.
+     ceph_assert(HAVE_FEATURE(quorum_con_features, SERVER_QUINCY));
+     wanted.incompat.insert(CEPH_MON_FEATURE_INCOMPAT_QUINCY);
+   }
+   if (required.contains_all(ceph::features::mon::FEATURE_REEF)) {
+     ceph_assert(ceph::features::mon::get_persistent().contains_all(
+                   ceph::features::mon::FEATURE_REEF));
+     // this feature should only ever be set if the quorum supports it.
+     ceph_assert(HAVE_FEATURE(quorum_con_features, SERVER_REEF));
+     wanted.incompat.insert(CEPH_MON_FEATURE_INCOMPAT_REEF);
+   }
+   if (required.contains_all(ceph::features::mon::FEATURE_SQUID)) {
+     ceph_assert(ceph::features::mon::get_persistent().contains_all(
+                   ceph::features::mon::FEATURE_SQUID));
+     // this feature should only ever be set if the quorum supports it.
+     ceph_assert(HAVE_FEATURE(quorum_con_features, SERVER_SQUID));
+     wanted.incompat.insert(CEPH_MON_FEATURE_INCOMPAT_SQUID);
+   }
+
+   dout(5) << __func__ << dendl;
+   _apply_compatset_features(wanted);
+ }
+
+ // Recompute required_features from scratch: the union of the connection
+ // feature masks implied by the compatset's incompat features and by the
+ // monmap's required monitor features.
+ void Monitor::calc_quorum_requirements()
+ {
+   required_features = 0;
+
+   // OR `mask` into required_features when `wanted` holds
+   const auto require_if = [this](bool wanted, uint64_t mask) {
+     if (wanted) {
+       required_features |= mask;
+     }
+   };
+
+   // compatset
+   const auto& incompat = features.incompat;
+   require_if(incompat.contains(CEPH_MON_FEATURE_INCOMPAT_OSDMAP_ENC),
+              CEPH_FEATURE_OSDMAP_ENC);
+   require_if(incompat.contains(CEPH_MON_FEATURE_INCOMPAT_KRAKEN),
+              CEPH_FEATUREMASK_SERVER_KRAKEN);
+   require_if(incompat.contains(CEPH_MON_FEATURE_INCOMPAT_LUMINOUS),
+              CEPH_FEATUREMASK_SERVER_LUMINOUS);
+   require_if(incompat.contains(CEPH_MON_FEATURE_INCOMPAT_MIMIC),
+              CEPH_FEATUREMASK_SERVER_MIMIC);
+   require_if(incompat.contains(CEPH_MON_FEATURE_INCOMPAT_NAUTILUS),
+              CEPH_FEATUREMASK_SERVER_NAUTILUS |
+              CEPH_FEATUREMASK_CEPHX_V2);
+   require_if(incompat.contains(CEPH_MON_FEATURE_INCOMPAT_OCTOPUS),
+              CEPH_FEATUREMASK_SERVER_OCTOPUS);
+   require_if(incompat.contains(CEPH_MON_FEATURE_INCOMPAT_PACIFIC),
+              CEPH_FEATUREMASK_SERVER_PACIFIC);
+   require_if(incompat.contains(CEPH_MON_FEATURE_INCOMPAT_QUINCY),
+              CEPH_FEATUREMASK_SERVER_QUINCY);
+   require_if(incompat.contains(CEPH_MON_FEATURE_INCOMPAT_REEF),
+              CEPH_FEATUREMASK_SERVER_REEF);
+   require_if(incompat.contains(CEPH_MON_FEATURE_INCOMPAT_SQUID),
+              CEPH_FEATUREMASK_SERVER_SQUID);
+
+   // monmap
+   const mon_feature_t monmap_required = monmap->get_required_features();
+   require_if(monmap_required.contains_all(ceph::features::mon::FEATURE_KRAKEN),
+              CEPH_FEATUREMASK_SERVER_KRAKEN);
+   require_if(monmap_required.contains_all(ceph::features::mon::FEATURE_LUMINOUS),
+              CEPH_FEATUREMASK_SERVER_LUMINOUS);
+   require_if(monmap_required.contains_all(ceph::features::mon::FEATURE_MIMIC),
+              CEPH_FEATUREMASK_SERVER_MIMIC);
+   require_if(monmap_required.contains_all(ceph::features::mon::FEATURE_NAUTILUS),
+              CEPH_FEATUREMASK_SERVER_NAUTILUS |
+              CEPH_FEATUREMASK_CEPHX_V2);
+
+   dout(10) << __func__ << " required_features " << required_features << dendl;
+ }
+
+ // Add our own session feature map, plus the stored feature map of every
+ // other quorum member, into *fm.
+ //
+ // @param fm  accumulator; existing contents are kept and added to
+ void Monitor::get_combined_feature_map(FeatureMap *fm)
+ {
+   *fm += session_map.feature_map;
+
+   for (auto member : quorum) {
+     if (member == rank) {
+       continue;   // our own sessions were added above
+     }
+     *fm += quorum_feature_map[member];
+   }
+ }
+
+ // Persist a "force_sync" marker (together with the stashed critical
+ // state) in the store and report the outcome through the formatter.
+ //
+ // @param f  formatter receiving the "sync_force" result object
+ void Monitor::sync_force(Formatter *f)
+ {
+   // stash critical state and set the marker in a single transaction
+   auto txn = std::make_shared<MonitorDBStore::Transaction>();
+   sync_stash_critical_state(txn);
+   txn->put("mon_sync", "force_sync", 1);
+   store->apply_transaction(txn);
+
+   f->open_object_section("sync_force");
+   f->dump_int("ret", 0);
+   f->dump_stream("msg") << "forcing store sync the next time the monitor starts";
+   f->close_section(); // sync_force
+ }
+
+ // Dump the quorum status (election epoch, quorum ranks and names, leader
+ // name, quorum age, features and the monmap) and flush it to ss.
+ //
+ // @param f   formatter to use; when null a JSON formatter is created for
+ //            the duration of the call
+ // @param ss  stream the formatted output is flushed to
+ void Monitor::_quorum_status(Formatter *f, ostream& ss)
+ {
+   // louzy/lazy hack: default to json if no formatter has been defined.
+   // The fallback is owned here so it is released on every exit path.
+   std::unique_ptr<JSONFormatter> fallback;
+   if (!f) {
+     fallback = std::make_unique<JSONFormatter>();
+     f = fallback.get();
+   }
+
+   f->open_object_section("quorum_status");
+   f->dump_int("election_epoch", get_epoch());
+
+   f->open_array_section("quorum");
+   for (const int member : quorum) {
+     f->dump_int("mon", member);
+   }
+   f->close_section(); // quorum
+
+   list<string> quorum_names = get_quorum_names();
+   f->open_array_section("quorum_names");
+   for (const auto& member_name : quorum_names) {
+     f->dump_string("mon", member_name);
+   }
+   f->close_section(); // quorum_names
+
+   // without a quorum there is no leader to name
+   f->dump_string("quorum_leader_name", quorum.empty() ? string() : monmap->get_name(leader));
+
+   if (!quorum.empty()) {
+     f->dump_int(
+       "quorum_age",
+       quorum_age());
+   }
+
+   f->open_object_section("features");
+   f->dump_stream("quorum_con") << quorum_con_features;
+   quorum_mon_features.dump(f, "quorum_mon");
+   f->close_section();
+
+   f->open_object_section("monmap");
+   monmap->dump(f);
+   f->close_section(); // monmap
+
+   f->close_section(); // quorum_status
+   f->flush(ss);
+ }
+
+ // Dump this monitor's status into a "mon_status" object: identity and
+ // state, quorum, feature sets, monitors known to be outside the quorum,
+ // extra probe peers, sync providers and sync progress, debug kill
+ // points, the monmap, the session feature map and the stretch-mode flag.
+ //
+ // @param f  formatter to write to (must not be null)
+ void Monitor::get_mon_status(Formatter *f)
+ {
+   f->open_object_section("mon_status");
+   f->dump_string("name", name);
+   f->dump_int("rank", rank);
+   f->dump_string("state", get_state_name());
+   f->dump_int("election_epoch", get_epoch());
+   f->dump_int("uptime", get_uptime().count());
+
+   f->open_array_section("quorum");
+   for (const int member : quorum) {
+     f->dump_int("mon", member);
+   }
+   f->close_section(); // quorum
+
+   if (!quorum.empty()) {
+     f->dump_int(
+       "quorum_age",
+       quorum_age());
+   }
+
+   f->open_object_section("features");
+   f->dump_stream("required_con") << required_features;
+   mon_feature_t required_mon = get_required_mon_features();
+   required_mon.dump(f, "required_mon");
+   f->dump_stream("quorum_con") << quorum_con_features;
+   quorum_mon_features.dump(f, "quorum_mon");
+   f->close_section(); // features
+
+   f->open_array_section("outside_quorum");
+   for (const auto& outsider : outside_quorum) {
+     f->dump_string("mon", outsider);
+   }
+   f->close_section(); // outside_quorum
+
+   f->open_array_section("extra_probe_peers");
+   for (const auto& peer : extra_probe_peers) {
+     f->dump_object("peer", peer);
+   }
+   f->close_section(); // extra_probe_peers
+
+   f->open_array_section("sync_provider");
+   for (const auto& entry : sync_providers) {
+     const SyncProvider& provider = entry.second;
+     f->dump_unsigned("cookie", provider.cookie);
+     f->dump_object("addrs", provider.addrs);
+     f->dump_stream("timeout") << provider.timeout;
+     f->dump_unsigned("last_committed", provider.last_committed);
+     f->dump_stream("last_key") << provider.last_key;
+   }
+   f->close_section(); // sync_provider
+
+   // sync progress is only reported while a sync is running
+   if (is_synchronizing()) {
+     f->open_object_section("sync");
+     f->dump_stream("sync_provider") << sync_provider;
+     f->dump_unsigned("sync_cookie", sync_cookie);
+     f->dump_unsigned("sync_start_version", sync_start_version);
+     f->close_section(); // sync
+   }
+
+   if (g_conf()->mon_sync_provider_kill_at > 0) {
+     f->dump_int("provider_kill_at", g_conf()->mon_sync_provider_kill_at);
+   }
+   if (g_conf()->mon_sync_requester_kill_at > 0) {
+     f->dump_int("requester_kill_at", g_conf()->mon_sync_requester_kill_at);
+   }
+
+   f->open_object_section("monmap");
+   monmap->dump(f);
+   f->close_section(); // monmap
+
+   f->dump_object("feature_map", session_map.feature_map);
+   f->dump_bool("stretch_mode", stretch_mode_engaged);
+   f->close_section(); // mon_status
+ }
+
+
+// health status to clog
+
+ // (Re)arm the health tick timer.  Does nothing when mon_health_to_clog is
+ // off or the tick interval is not positive.  When the event fires with a
+ // non-negative result it simply re-arms itself.
+ void Monitor::health_tick_start()
+ {
+   const bool enabled = cct->_conf->mon_health_to_clog;
+   if (!enabled ||
+       cct->_conf->mon_health_to_clog_tick_interval <= 0) {
+     return;
+   }
+
+   dout(15) << __func__ << dendl;
+
+   // drop any previously scheduled tick before scheduling a new one
+   health_tick_stop();
+
+   auto *on_tick = new C_MonContext{this, [this](int r) {
+       if (r >= 0) {
+         health_tick_start();
+       }
+     }};
+   health_tick_event = timer.add_event_after(
+     cct->_conf->mon_health_to_clog_tick_interval,
+     on_tick);
+ }
+
+ // Cancel the pending health tick event, if there is one.
+ void Monitor::health_tick_stop()
+ {
+   dout(15) << __func__ << dendl;
+
+   if (!health_tick_event) {
+     return;   // nothing scheduled
+   }
+   timer.cancel_event(health_tick_event);
+   health_tick_event = nullptr;
+ }
+
+ // Compute the next wall-clock time, strictly after now, whose seconds
+ // since the epoch are a multiple of mon_health_to_clog_interval.
+ // Callers in this file only invoke it with a positive interval.
+ //
+ // @return the time point of the next scheduled health-to-clog update
+ ceph::real_clock::time_point Monitor::health_interval_calc_next_update()
+ {
+   auto now = ceph::real_clock::now();
+   auto now_secs = std::chrono::duration_cast<std::chrono::seconds>(now.time_since_epoch());
+
+   // seconds already elapsed in the current interval, and those remaining
+   int elapsed = now_secs.count() % cct->_conf->mon_health_to_clog_interval;
+   int remaining = cct->_conf->mon_health_to_clog_interval - elapsed;
+   auto next = now_secs + std::chrono::seconds(remaining);
+
+   dout(20) << __func__
+            << " now: " << now << ","
+            << " next: " << next << ","
+            << " interval: " << cct->_conf->mon_health_to_clog_interval
+            << dendl;
+
+   return ceph::real_clock::time_point{next};
+ }
+
+ // (Re)schedule the interval-aligned health-to-clog event.  Does nothing
+ // when mon_health_to_clog is off or the interval is not positive.
+ void Monitor::health_interval_start()
+ {
+   dout(15) << __func__ << dendl;
+
+   const bool enabled = cct->_conf->mon_health_to_clog;
+   if (!enabled ||
+       cct->_conf->mon_health_to_clog_interval <= 0) {
+     return;
+   }
+
+   // drop any previously scheduled event before scheduling a new one
+   health_interval_stop();
+
+   auto when = health_interval_calc_next_update();
+   health_interval_event = new C_MonContext{this, [this](int r) {
+       if (r >= 0) {
+         do_health_to_clog_interval();
+       }
+     }};
+   // forget the event if the timer did not accept it
+   if (!timer.add_event_at(when, health_interval_event)) {
+     health_interval_event = nullptr;
+   }
+ }
+
+ // Cancel the pending interval health event (if any) and clear the handle.
+ void Monitor::health_interval_stop()
+ {
+   dout(15) << __func__ << dendl;
+
+   if (health_interval_event != nullptr) {
+     timer.cancel_event(health_interval_event);
+   }
+   // cleared unconditionally
+   health_interval_event = nullptr;
+ }
+
+ // Stop both health timers and reset the cached health status.
+ void Monitor::health_events_cleanup()
+ {
+   // timers first
+   health_tick_stop();
+   health_interval_stop();
+
+   // then the cached status used by do_health_to_clog()
+   health_status_cache.reset();
+ }
+
+ // React to changes of the health-to-clog configuration options by
+ // starting or stopping the corresponding timers.
+ //
+ // @param changed  names of the configuration options that changed
+ void Monitor::health_to_clog_update_conf(const std::set<std::string> &changed)
+ {
+   dout(20) << __func__ << dendl;
+
+   if (changed.count("mon_health_to_clog")) {
+     if (!cct->_conf->mon_health_to_clog) {
+       // feature switched off: tear everything down, nothing else applies
+       health_events_cleanup();
+       return;
+     }
+     // feature switched on: arm whichever timers are not running yet
+     if (!health_tick_event) {
+       health_tick_start();
+     }
+     if (!health_interval_event) {
+       health_interval_start();
+     }
+   }
+
+   if (changed.count("mon_health_to_clog_interval")) {
+     if (cct->_conf->mon_health_to_clog_interval <= 0) {
+       health_interval_stop();
+     } else {
+       health_interval_start();
+     }
+   }
+
+   if (changed.count("mon_health_to_clog_tick_interval")) {
+     if (cct->_conf->mon_health_to_clog_tick_interval <= 0) {
+       health_tick_stop();
+     } else {
+       health_tick_start();
+     }
+   }
+ }
+
+ // Timer callback body: force a health summary to the cluster log and
+ // schedule the next interval event.
+ void Monitor::do_health_to_clog_interval()
+ {
+   // Logging to clog may have been disabled in the conf since this event
+   // was scheduled.
+   const bool enabled = cct->_conf->mon_health_to_clog;
+   if (!enabled ||
+       cct->_conf->mon_health_to_clog_interval <= 0) {
+     return;
+   }
+
+   dout(10) << __func__ << dendl;
+
+   // do we have a cached value for next_clog_update? if not,
+   // do we know when the last update was?
+
+   do_health_to_clog(true);
+   health_interval_start();
+ }
+
+ // Write the current health status to the cluster log and remember it in
+ // health_status_cache.
+ //
+ // @param force  log even when summary and level match the cached values
+ void Monitor::do_health_to_clog(bool force)
+ {
+   // Logging to clog may have been disabled in the conf since this event
+   // was scheduled.
+   const bool enabled = cct->_conf->mon_health_to_clog;
+   if (!enabled ||
+       cct->_conf->mon_health_to_clog_interval <= 0) {
+     return;
+   }
+
+   dout(10) << __func__ << (force ? " (force)" : "") << dendl;
+
+   string summary;
+   health_status_t level = healthmon()->get_health_status(false, nullptr, &summary);
+
+   const bool summary_changed = (summary != health_status_cache.summary);
+   if (!force && !summary_changed && level == health_status_cache.overall) {
+     return;   // nothing new to report
+   }
+
+   // full detail only for a changed, non-OK status (and when configured)
+   const bool want_detail = g_conf()->mon_health_detail_to_clog &&
+                            summary_changed &&
+                            level != HEALTH_OK;
+   if (want_detail) {
+     string details;
+     level = healthmon()->get_health_status(true, nullptr, &details);
+     clog->health(level) << "Health detail: " << details;
+   } else {
+     clog->health(level) << "overall " << summary;
+   }
+
+   health_status_cache.summary = summary;
+   health_status_cache.overall = level;
+ }
+
+ // Log health check transitions to the cluster log.
+ //
+ // Compares `updated` against `previous` and emits a cluster log line for
+ // every check that is new, whose summary or severity changed, or that was
+ // cleared.  Summary-only updates are rate limited by the
+ // mon_health_log_update_period option.  Does nothing when
+ // mon_health_to_clog is off.
+ //
+ // @param updated   health checks after the change
+ // @param previous  health checks before the change
+ // @param t         transaction the change belongs to (not used here; see
+ //                  the FIXME below)
+ void Monitor::log_health(
+   const health_check_map_t& updated,
+   const health_check_map_t& previous,
+   MonitorDBStore::TransactionRef t)
+ {
+   if (!g_conf()->mon_health_to_clog) {
+     return;
+   }
+
+   const utime_t now = ceph_clock_now();
+
+   // FIXME: log atomically as part of @t instead of using clog.
+   dout(10) << __func__ << " updated " << updated.checks.size()
+            << " previous " << previous.checks.size()
+            << dendl;
+   const auto min_log_period = g_conf().get_val<int64_t>(
+     "mon_health_log_update_period");
+   for (auto& p : updated.checks) {
+     auto q = previous.checks.find(p.first);
+     bool logged = false;
+     if (q == previous.checks.end()) {
+       // new
+       ostringstream ss;
+       ss << "Health check failed: " << p.second.summary << " ("
+          << p.first << ")";
+       clog->health(p.second.severity) << ss.str();
+
+       logged = true;
+     } else {
+       if (p.second.summary != q->second.summary ||
+           p.second.severity != q->second.severity) {
+
+         auto status_iter = health_check_log_times.find(p.first);
+         if (status_iter != health_check_log_times.end()) {
+           if (p.second.severity == q->second.severity &&
+               now - status_iter->second.updated_at < min_log_period) {
+             // We already logged this recently and the severity is unchanged,
+             // so skip emitting an update of the summary string.
+             // We'll get an update out of tick() later if the check
+             // is still failing.
+             continue;
+           }
+         }
+
+         // summary or severity changed (ignore detail changes at this level)
+         ostringstream ss;
+         ss << "Health check update: " << p.second.summary << " (" << p.first << ")";
+         clog->health(p.second.severity) << ss.str();
+
+         logged = true;
+       }
+     }
+     // Record the time at which we last logged, so that we can check this
+     // when considering whether/when to print update messages.
+     if (logged) {
+       // one lookup: inserts a new record or overwrites the existing one
+       health_check_log_times.insert_or_assign(
+         p.first,
+         HealthCheckLogStatus(p.second.severity, p.second.summary, now));
+     }
+   }
+   for (auto& p : previous.checks) {
+     if (!updated.checks.count(p.first)) {
+       // cleared
+       if (p.first == "DEGRADED_OBJECTS") {
+         clog->info() << "All degraded objects recovered";
+       } else if (p.first == "OSD_FLAGS") {
+         clog->info() << "OSD flags cleared";
+       } else {
+         clog->info() << "Health check cleared: " << p.first << " (was: "
+                      << p.second.summary << ")";
+       }
+
+       // erasing a missing key is a no-op, so no prior lookup is needed
+       health_check_log_times.erase(p.first);
+     }
+   }
+
+   if (previous.checks.size() && updated.checks.size() == 0) {
+     // We might be going into a fully healthy state, check
+     // other subsystems
+     bool any_checks = false;
+     for (auto& svc : paxos_service) {
+       if (&(svc->get_health_checks()) == &(previous)) {
+         // Ignore the ones we're clearing right now
+         continue;
+       }
+
+       if (svc->get_health_checks().checks.size() > 0) {
+         any_checks = true;
+         break;
+       }
+     }
+     if (!any_checks) {
+       clog->info() << "Cluster is now healthy";
+     }
+   }
+ }
+
+ // Collect fresh metadata for this monitor and, if its
+ // "ceph_version_short" differs from the one stored in mon_metadata for
+ // our rank, push the fresh metadata to the mgr client.
+ void Monitor::update_pending_metadata()
+ {
+   Metadata fresh;
+   collect_metadata(&fresh);
+
+   // operator[] on purpose: a missing entry reads as an empty string
+   const std::string& current_version = mon_metadata[rank]["ceph_version_short"];
+   const std::string& pending_version = fresh["ceph_version_short"];
+
+   if (current_version != pending_version) {
+     mgr_client.update_daemon_metadata("mon", name, fresh);
+   }
+ }
+
+// Produce the cluster status report.
+//
+// With a formatter, a single "status" object is dumped into f and ss is left
+// untouched. Without one, a plain-text report is written to ss. In both
+// cases the filesystem map is a copy that gets filtered by the names returned
+// from session->get_allowed_fs_names() whenever that list is not empty.
+void Monitor::get_cluster_status(stringstream &ss, Formatter *f,
+                                 MonSession *session)
+{
+  if (f)
+    f->open_object_section("status");
+
+  const auto&& fs_names = session->get_allowed_fs_names();
+
+  if (f) {
+    // ---- structured output ----
+    f->dump_stream("fsid") << monmap->get_fsid();
+    healthmon()->get_health_status(false, f, nullptr);
+    f->dump_unsigned("election_epoch", get_epoch());
+
+    f->open_array_section("quorum");
+    for (const int member : quorum)
+      f->dump_int("rank", member);
+    f->close_section();
+
+    f->open_array_section("quorum_names");
+    for (const int member : quorum)
+      f->dump_string("id", monmap->get_name(member));
+    f->close_section();
+
+    f->dump_int("quorum_age", quorum_age());
+
+    f->open_object_section("monmap");
+    monmap->dump_summary(f);
+    f->close_section();
+
+    f->open_object_section("osdmap");
+    osdmon()->osdmap.print_summary(f, cout, string(12, ' '));
+    f->close_section();
+
+    f->open_object_section("pgmap");
+    mgrstatmon()->print_summary(f, NULL);
+    f->close_section();
+
+    f->open_object_section("fsmap");
+    // Work on a private copy so filtering never touches the monitor's map.
+    FSMap visible_fsmap = mdsmon()->get_fsmap();
+    if (!fs_names.empty()) {
+      visible_fsmap.filter(fs_names);
+    }
+    const FSMap& ro_fsmap = visible_fsmap;
+    ro_fsmap.print_summary(f, NULL);
+    f->close_section();
+
+    f->open_object_section("mgrmap");
+    mgrmon()->get_map().print_summary(f, nullptr);
+    f->close_section();
+
+    f->dump_object("servicemap", mgrstatmon()->get_service_map());
+
+    f->open_object_section("progress_events");
+    for (auto& [event_id, event] : mgrstatmon()->get_progress_events()) {
+      f->dump_object(event_id.c_str(), event);
+    }
+    f->close_section();
+
+    f->close_section();  // "status"
+    return;
+  }
+
+  // ---- plain-text output ----
+  ss << " cluster:\n";
+  ss << " id: " << monmap->get_fsid() << "\n";
+
+  string health;
+  healthmon()->get_health_status(false, nullptr, &health,
+                                 "\n ", "\n ");
+  ss << " health: " << health << "\n";
+
+  ss << "\n \n services:\n";
+  {
+    // Width of the longest service label, never less than 3 ("mon", "mgr",
+    // "mds" and "osd" are all three characters long).
+    size_t label_width = 3;
+    auto& service_map = mgrstatmon()->get_service_map();
+    for (auto& entry : service_map.services) {
+      label_width = std::max(label_width, entry.first.size());
+    }
+    string pad(label_width - 3, ' ');
+    const auto quorum_names = get_quorum_names();
+    const auto mon_count = monmap->mon_info.size();
+    auto mnow = ceph::mono_clock::now();
+    ss << " mon: " << pad << mon_count << " daemons, quorum "
+       << quorum_names << " (age " << timespan_str(mnow - quorum_since) << ")";
+    if (quorum_names.size() != mon_count) {
+      // List the monitors whose rank is not part of the quorum set.
+      std::list<std::string> missing;
+      for (size_t r = 0; r < monmap->ranks.size(); ++r) {
+        if (!quorum.count(r)) {
+          missing.push_back(monmap->ranks[r]);
+        }
+      }
+      ss << ", out of quorum: " << joinify(missing.begin(),
+                                           missing.end(), std::string(", "));
+    }
+    ss << "\n";
+
+    if (mgrmon()->in_use()) {
+      ss << " mgr: " << pad;
+      mgrmon()->get_map().print_summary(nullptr, &ss);
+      ss << "\n";
+    }
+
+    FSMap visible_fsmap = mdsmon()->get_fsmap();
+    if (!fs_names.empty()) {
+      visible_fsmap.filter(fs_names);
+    }
+    const FSMap& ro_fsmap = visible_fsmap;
+
+    if (ro_fsmap.filesystem_count() > 0 and mdsmon()->should_print_status()) {
+      ss << " mds: " << pad;
+      ro_fsmap.print_daemon_summary(ss);
+      ss << "\n";
+    }
+
+    ss << " osd: " << pad;
+    osdmon()->osdmap.print_summary(NULL, ss, string(label_width + 6, ' '));
+    ss << "\n";
+
+    for (auto& [svc_name, svc] : service_map.services) {
+      // normal ceph entity types are filtered out here
+      if (ServiceMap::is_normal_ceph_entity(svc_name)) {
+        continue;
+      }
+      ss << " " << svc_name << ": " << string(label_width - svc_name.size(), ' ')
+         << svc.get_summary() << "\n";
+    }
+  }
+
+  {
+    // The task section is only printed when at least one service reports
+    // running tasks.
+    auto& service_map = mgrstatmon()->get_service_map();
+    const bool any_running =
+      std::any_of(service_map.services.begin(),
+                  service_map.services.end(),
+                  [](auto& entry) {
+                    return entry.second.has_running_tasks();
+                  });
+    if (any_running) {
+      ss << "\n \n task status:\n";
+      for (auto& [name, service] : service_map.services) {
+        ss << service.get_task_summary(name);
+      }
+    }
+  }
+
+  ss << "\n \n data:\n";
+  mdsmon()->print_fs_summary(ss);
+  mgrstatmon()->print_summary(NULL, &ss);
+
+  auto& progress = mgrstatmon()->get_progress_events();
+  if (!progress.empty()) {
+    ss << "\n \n progress:\n";
+    for (auto& [event_id, event] : progress) {
+      if (event.add_to_ceph_s) {
+        ss << " " << event.message << "\n";
+      }
+    }
+  }
+  ss << "\n ";
+}
+
+// Flatten a parsed command map into string key/value pairs.
+//
+// "prefix" is skipped. A "caps" entry holding an even number of strings is
+// expanded pairwise into "caps_<name>" -> <value>; any other entry (including
+// a "caps" entry that cannot be expanded that way) is stored under its own
+// key in stringified form.
+void Monitor::_generate_command_map(cmdmap_t& cmdmap,
+                                    map<string,string> &param_str_map)
+{
+  for (auto& [key, value] : cmdmap) {
+    if (key == "prefix")
+      continue;
+
+    if (key == "caps") {
+      vector<string> caps_list;
+      const bool pairwise =
+        cmd_getval(cmdmap, "caps", caps_list) && (caps_list.size() % 2 == 0);
+      if (pairwise) {
+        for (size_t idx = 0; idx < caps_list.size(); idx += 2) {
+          param_str_map[string("caps_") + caps_list[idx]] = caps_list[idx + 1];
+        }
+        continue;
+      }
+    }
+
+    param_str_map[key] = cmd_vartype_stringify(value);
+  }
+}
+
+// Return the first command in cmds whose cmdstring begins with cmd_prefix,
+// or nullptr when none does. The returned pointer refers to an element of
+// cmds.
+const MonCommand *Monitor::_get_moncommand(
+  const string &cmd_prefix,
+  const vector<MonCommand>& cmds)
+{
+  const auto starts_with_prefix = [&cmd_prefix](const MonCommand& candidate) {
+    return candidate.cmdstring.compare(0, cmd_prefix.size(), cmd_prefix) == 0;
+  };
+  const auto hit = std::find_if(cmds.begin(), cmds.end(), starts_with_prefix);
+  return hit == cmds.end() ? nullptr : &*hit;
+}
+
+// Check whether session s may run this_cmd: the r/w/x permissions the command
+// requires are passed, together with the module, prefix and flattened
+// parameters, to the session's capability check. The outcome is logged at
+// debug level 10 and returned.
+bool Monitor::_allowed_command(MonSession *s, const string &module,
+                               const string &prefix, const cmdmap_t& cmdmap,
+                               const map<string,string>& param_str_map,
+                               const MonCommand *this_cmd) {
+
+  const bool needs_read = this_cmd->requires_perm('r');
+  const bool needs_write = this_cmd->requires_perm('w');
+  const bool needs_exec = this_cmd->requires_perm('x');
+
+  const bool granted = s->caps.is_capable(
+    g_ceph_context,
+    s->entity_name,
+    module, prefix, param_str_map,
+    needs_read, needs_write, needs_exec,
+    s->get_peer_socket_addr());
+
+  const char *negation = granted ? "" : "not ";
+  dout(10) << __func__ << " " << negation << "capable" << dendl;
+  return granted;
+}
+
+// Dump every command in `commands` into a "command_descriptions" object on f,
+// one section per command named "cmdNNN" (zero-padded running index), then
+// flush the formatter into *rdata.
+void Monitor::format_command_descriptions(const std::vector<MonCommand> &commands,
+                                          Formatter *f,
+                                          uint64_t features,
+                                          bufferlist *rdata)
+{
+  f->open_object_section("command_descriptions");
+
+  int seq = 0;
+  for (auto it = commands.cbegin(); it != commands.cend(); ++it, ++seq) {
+    ostringstream section;
+    section << "cmd" << setfill('0') << std::setw(3) << seq;
+
+    const unsigned flags = it->flags;
+    dump_cmddesc_to_json(f, features, section.str(),
+                         it->cmdstring, it->helpstring, it->module,
+                         it->req_perms, flags);
+  }
+
+  f->close_section();  // command_descriptions
+
+  f->flush(*rdata);
+}
+
+// A keyring is required as soon as cephx or GSS is a supported auth method
+// for either the cluster or the service side.
+bool Monitor::is_keyring_required()
+{
+  if (auth_cluster_required.is_supported_auth(CEPH_AUTH_CEPHX))
+    return true;
+  if (auth_service_required.is_supported_auth(CEPH_AUTH_CEPHX))
+    return true;
+  if (auth_cluster_required.is_supported_auth(CEPH_AUTH_GSS))
+    return true;
+  return auth_service_required.is_supported_auth(CEPH_AUTH_GSS);
+}
+
+// Completion for a command proxied to the mgr. When it fires it takes the
+// monitor lock, gives back the bytes that were accounted in
+// mgr_proxy_bytes for this command, and replies to the original op with the
+// result code plus the output string/buffer filled in by the proxied call.
+struct C_MgrProxyCommand : public Context {
+  Monitor *mon;
+  MonOpRequestRef op;
+  uint64_t size;      // amount to subtract from mon->mgr_proxy_bytes
+  bufferlist outbl;   // output data, sent back with the reply
+  string outs;        // output status string, sent back with the reply
+
+  C_MgrProxyCommand(Monitor *monitor, MonOpRequestRef request, uint64_t nbytes)
+    : mon(monitor), op(request), size(nbytes) {}
+
+  void finish(int r) override {
+    std::lock_guard l(mon->lock);
+    mon->mgr_proxy_bytes -= size;
+    mon->reply_command(op, r, outs, outbl, 0);
+  }
+};
+
+/**
+ * Handle an MCommand ("tell") addressed to this monitor.
+ *
+ * The message is checked in this order: fsid, session, JSON parsing,
+ * parameter flattening, presence of a prefix, obsolescence of a locally
+ * known command, and finally the session's capabilities (r, w and x are all
+ * required on module "mon"). A command that passes is queued on the admin
+ * socket.
+ *
+ * Every rejection is answered through reply_tell_command(); the only
+ * exception is a message without a session, which is dropped.
+ */
+void Monitor::handle_tell_command(MonOpRequestRef op)
+{
+  ceph_assert(op->is_type_command());
+  MCommand *m = static_cast<MCommand*>(op->get_req());
+  if (m->fsid != monmap->fsid) {
+    dout(0) << "handle_command on fsid " << m->fsid << " != " << monmap->fsid << dendl;
+    return reply_tell_command(op, -EACCES, "wrong fsid");
+  }
+  MonSession *session = op->get_session();
+  if (!session) {
+    dout(5) << __func__ << " dropping stray message " << *m << dendl;
+    return;
+  }
+  cmdmap_t cmdmap;
+  if (stringstream ss; !cmdmap_from_json(m->cmd, &cmdmap, ss)) {
+    // ss holds the parser's explanation
+    return reply_tell_command(op, -EINVAL, ss.str());
+  }
+  map<string,string> param_str_map;
+  // _generate_command_map() may throw bad_cmd_get. handle_command() turns
+  // that into an -EINVAL reply; do the same here so a malformed tell command
+  // cannot let the exception escape this handler.
+  try {
+    _generate_command_map(cmdmap, param_str_map);
+  } catch (const bad_cmd_get& e) {
+    return reply_tell_command(op, -EINVAL, e.what());
+  }
+  string prefix;
+  if (!cmd_getval(cmdmap, "prefix", prefix)) {
+    return reply_tell_command(op, -EINVAL, "no prefix");
+  }
+  // Only commands found in the local command table are checked for
+  // obsolescence; an unknown prefix falls through to the caps check.
+  if (auto cmd = _get_moncommand(prefix,
+                                 get_local_commands(quorum_mon_features));
+      cmd) {
+    if (cmd->is_obsolete() ||
+        (cct->_conf->mon_debug_deprecated_as_obsolete &&
+         cmd->is_deprecated())) {
+      return reply_tell_command(op, -ENOTSUP,
+                                "command is obsolete; "
+                                "please check usage and/or man page");
+    }
+  }
+  // see if command is allowed
+  if (!session->caps.is_capable(
+      g_ceph_context,
+      session->entity_name,
+      "mon", prefix, param_str_map,
+      true, true, true,
+      session->get_peer_socket_addr())) {
+    return reply_tell_command(op, -EACCES, "insufficient caps");
+  }
+  // pass it to asok
+  cct->get_admin_socket()->queue_tell_command(m);
+}
+
+/**
+ * Handle an MMonCommand.
+ *
+ * Stages, in order:
+ *  1. basic validation (fsid, session, non-empty command, JSON, prefix);
+ *  2. "get_command_descriptions" is answered immediately;
+ *  3. the command is looked up in the leader's, the mgr's and the local
+ *     command tables; a non-leader forwards commands it does not know or that
+ *     are not compatible with the leader's definition (unless noforward);
+ *  4. obsolete commands and forwarded noforward commands are rejected;
+ *  5. the session's capabilities are checked and the attempt is audit-logged
+ *     (except for "config set" / "config-key set");
+ *  6. the command is dispatched: legacy tell commands go to the admin socket,
+ *     mgr commands are proxied to the mgr, service commands go to the
+ *     matching PaxosService, and the remainder is handled inline below.
+ *
+ * Commands handled inline finish at the `out` label, where the reply is sent
+ * unless the request came from another monitor.
+ */
+void Monitor::handle_command(MonOpRequestRef op)
+{
+  ceph_assert(op->is_type_command());
+  auto m = op->get_req<MMonCommand>();
+  if (m->fsid != monmap->fsid) {
+    dout(0) << "handle_command on fsid " << m->fsid << " != " << monmap->fsid
+            << dendl;
+    reply_command(op, -EPERM, "wrong fsid", 0);
+    return;
+  }
+
+  MonSession *session = op->get_session();
+  if (!session) {
+    dout(5) << __func__ << " dropping stray message " << *m << dendl;
+    return;
+  }
+
+  if (m->cmd.empty()) {
+    reply_command(op, -EINVAL, "no command specified", 0);
+    return;
+  }
+
+  string prefix;
+  vector<string> fullcmd;
+  cmdmap_t cmdmap;
+  stringstream ss, ds;
+  bufferlist rdata;
+  string rs;
+  // Defaults used when no inline branch below recognizes the prefix.
+  int r = -EINVAL;
+  rs = "unrecognized command";
+
+  if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
+    // ss has reason for failure
+    r = -EINVAL;
+    rs = ss.str();
+    if (!m->get_source().is_mon())  // don't reply to mon->mon commands
+      reply_command(op, r, rs, 0);
+    return;
+  }
+
+  // check return value. If no prefix parameter provided,
+  // return value will be false, then return error info.
+  if (!cmd_getval(cmdmap, "prefix", prefix)) {
+    reply_command(op, -EINVAL, "command prefix not found", 0);
+    return;
+  }
+
+  // check prefix is empty
+  if (prefix.empty()) {
+    reply_command(op, -EINVAL, "command prefix must not be empty", 0);
+    return;
+  }
+
+  if (prefix == "get_command_descriptions") {
+    // Answered before any table lookup or caps check: the mgr's command
+    // descriptions followed by the leader's, always as JSON.
+    bufferlist rdata;
+    Formatter *f = Formatter::create("json");
+
+    std::vector<MonCommand> commands = static_cast<MgrMonitor*>(
+      paxos_service[PAXOS_MGR].get())->get_command_descs();
+
+    for (auto& c : leader_mon_commands) {
+      commands.push_back(c);
+    }
+
+    auto features = m->get_connection()->get_features();
+    format_command_descriptions(commands, f, features, &rdata);
+    delete f;
+    reply_command(op, 0, "", rdata, 0);
+    return;
+  }
+
+  dout(0) << "handle_command " << *m << dendl;
+
+  // f may be null here; several branches below check it or create a
+  // "json-pretty" formatter themselves.
+  string format = cmd_getval_or<string>(cmdmap, "format", "plain");
+  boost::scoped_ptr<Formatter> f(Formatter::create(format));
+
+  get_str_vec(prefix, fullcmd);
+
+  // make sure fullcmd is not empty.
+  // invalid prefix will cause empty vector fullcmd.
+  // such as, prefix=";,,;"
+  if (fullcmd.empty()) {
+    reply_command(op, -EINVAL, "command requires a prefix to be valid", 0);
+    return;
+  }
+
+  std::string_view module = fullcmd[0];
+
+  // validate command is in leader map
+
+  const MonCommand *leader_cmd;
+  const auto& mgr_cmds = mgrmon()->get_command_descs();
+  const MonCommand *mgr_cmd = nullptr;
+  if (!mgr_cmds.empty()) {
+    mgr_cmd = _get_moncommand(prefix, mgr_cmds);
+  }
+  leader_cmd = _get_moncommand(prefix, leader_mon_commands);
+  if (!leader_cmd) {
+    leader_cmd = mgr_cmd;
+    if (!leader_cmd) {
+      reply_command(op, -EINVAL, "command not known", 0);
+      return;
+    }
+  }
+  // validate command is in our map & matches, or forward if it is allowed
+  const MonCommand *mon_cmd = _get_moncommand(
+    prefix,
+    get_local_commands(quorum_mon_features));
+  if (!mon_cmd) {
+    mon_cmd = mgr_cmd;
+  }
+  if (!is_leader()) {
+    if (!mon_cmd) {
+      if (leader_cmd->is_noforward()) {
+        reply_command(op, -EINVAL,
+                      "command not locally supported and not allowed to forward",
+                      0);
+        return;
+      }
+      dout(10) << "Command not locally supported, forwarding request "
+               << m << dendl;
+      forward_request_leader(op);
+      return;
+    } else if (!mon_cmd->is_compat(leader_cmd)) {
+      if (mon_cmd->is_noforward()) {
+        reply_command(op, -EINVAL,
+                      "command not compatible with leader and not allowed to forward",
+                      0);
+        return;
+      }
+      dout(10) << "Command not compatible with leader, forwarding request "
+               << m << dendl;
+      forward_request_leader(op);
+      return;
+    }
+  }
+
+  if (mon_cmd->is_obsolete() ||
+      (cct->_conf->mon_debug_deprecated_as_obsolete
+       && mon_cmd->is_deprecated())) {
+    reply_command(op, -ENOTSUP,
+                  "command is obsolete; please check usage and/or man page",
+                  0);
+    return;
+  }
+
+  if (session->proxy_con && mon_cmd->is_noforward()) {
+    dout(10) << "Got forward for noforward command " << m << dendl;
+    reply_command(op, -EINVAL, "forward for noforward command", rdata, 0);
+    return;
+  }
+
+  /* what we perceive as being the service the command falls under */
+  string service(mon_cmd->module);
+
+  dout(25) << __func__ << " prefix='" << prefix
+           << "' module='" << module
+           << "' service='" << service << "'" << dendl;
+
+  // Commands needing 'w' or 'x' are audit-logged at info level, read-only
+  // ones at debug level.
+  bool cmd_is_rw =
+    (mon_cmd->requires_perm('w') || mon_cmd->requires_perm('x'));
+
+  // validate user's permissions for requested command
+  map<string,string> param_str_map;
+
+  // Catch bad_cmd_get exception if _generate_command_map() throws it
+  try {
+    _generate_command_map(cmdmap, param_str_map);
+  } catch (const bad_cmd_get& e) {
+    reply_command(op, -EINVAL, e.what(), 0);
+    return;
+  }
+
+  if (!_allowed_command(session, service, prefix, cmdmap,
+                        param_str_map, mon_cmd)) {
+    dout(1) << __func__ << " access denied" << dendl;
+    if (prefix != "config set" && prefix != "config-key set")
+      (cmd_is_rw ? audit_clog->info() : audit_clog->debug())
+        << "from='" << session->name << " " << session->addrs << "' "
+        << "entity='" << session->entity_name << "' "
+        << "cmd=" << m->cmd << ": access denied";
+    reply_command(op, -EACCES, "access denied", 0);
+    return;
+  }
+
+  if (prefix != "config set" && prefix != "config-key set")
+    (cmd_is_rw ? audit_clog->info() : audit_clog->debug())
+      << "from='" << session->name << " " << session->addrs << "' "
+      << "entity='" << session->entity_name << "' "
+      << "cmd=" << m->cmd << ": dispatch";
+
+  // compat kludge for legacy clients trying to tell commands that are
+  // new. see bottom of MonCommands.h. we need to handle both (1)
+  // pre-octopus clients and (2) octopus clients with a mix of pre-octopus
+  // and octopus mons.
+  if ((!HAVE_FEATURE(m->get_connection()->get_features(), SERVER_OCTOPUS) ||
+       monmap->min_mon_release < ceph_release_t::octopus) &&
+      (prefix == "injectargs" ||
+       prefix == "smart" ||
+       prefix == "mon_status" ||
+       prefix == "heap")) {
+    if (m->get_connection()->get_messenger() == 0) {
+      // Prior to octopus, monitors might forward these messages
+      // around. that was broken at baseline, and if we try to process
+      // this message now, it will assert out when we try to send a
+      // message in reply from the asok/tell worker (see
+      // AnonConnection). Just reply with an error.
+      dout(5) << __func__ << " failing forwarded command from a (presumably) "
+              << "pre-octopus peer" << dendl;
+      reply_command(
+        op, -EBUSY,
+        "failing forwarded tell command in mixed-version mon cluster", 0);
+      return;
+    }
+    dout(5) << __func__ << " passing command to tell/asok" << dendl;
+    cct->get_admin_socket()->queue_tell_command(m);
+    return;
+  }
+
+  if (mon_cmd->is_mgr()) {
+    // Proxy to the mgr, bounded by a byte budget: the message's front,
+    // middle and data lengths are counted in mgr_proxy_bytes until
+    // C_MgrProxyCommand::finish() gives them back.
+    const auto& hdr = m->get_header();
+    uint64_t size = hdr.front_len + hdr.middle_len + hdr.data_len;
+    uint64_t max = g_conf().get_val<Option::size_t>("mon_client_bytes")
+                 * g_conf().get_val<double>("mon_mgr_proxy_client_bytes_ratio");
+    if (mgr_proxy_bytes + size > max) {
+      dout(10) << __func__ << " current mgr proxy bytes " << mgr_proxy_bytes
+               << " + " << size << " > max " << max << dendl;
+      reply_command(op, -EAGAIN, "hit limit on proxied mgr commands", rdata, 0);
+      return;
+    }
+    mgr_proxy_bytes += size;
+    dout(10) << __func__ << " proxying mgr command (+" << size
+             << " -> " << mgr_proxy_bytes << ")" << dendl;
+    C_MgrProxyCommand *fin = new C_MgrProxyCommand(this, op, size);
+    mgr_client.start_command(m->cmd,
+                             m->get_data(),
+                             &fin->outbl,
+                             &fin->outs,
+                             new C_OnFinisher(fin, &finisher));
+    return;
+  }
+
+  // ---- dispatch to the owning service ----
+
+  if ((module == "mds" || module == "fs") &&
+      prefix != "fs authorize") {
+    mdsmon()->dispatch(op);
+    return;
+  }
+  if ((module == "osd" ||
+       prefix == "pg map" ||
+       prefix == "pg repeer") &&
+      prefix != "osd last-stat-seq") {
+    osdmon()->dispatch(op);
+    return;
+  }
+  if (module == "config") {
+    configmon()->dispatch(op);
+    return;
+  }
+
+  if (module == "mon" &&
+      /* Let the Monitor class handle the following commands:
+       *  'mon scrub'
+       */
+      prefix != "mon scrub" &&
+      prefix != "mon metadata" &&
+      prefix != "mon versions" &&
+      prefix != "mon count-metadata" &&
+      prefix != "mon ok-to-stop" &&
+      prefix != "mon ok-to-add-offline" &&
+      prefix != "mon ok-to-rm") {
+    monmon()->dispatch(op);
+    return;
+  }
+  if (module == "health" && prefix != "health") {
+    healthmon()->dispatch(op);
+    return;
+  }
+  if (module == "auth" || prefix == "fs authorize") {
+    authmon()->dispatch(op);
+    return;
+  }
+  if (module == "log") {
+    logmon()->dispatch(op);
+    return;
+  }
+
+  if (module == "config-key") {
+    kvmon()->dispatch(op);
+    return;
+  }
+
+  if (module == "mgr") {
+    mgrmon()->dispatch(op);
+    return;
+  }
+
+  // ---- commands handled by the Monitor itself ----
+
+  if (prefix == "fsid") {
+    if (f) {
+      f->open_object_section("fsid");
+      f->dump_stream("fsid") << monmap->fsid;
+      f->close_section();
+      f->flush(rdata);
+    } else {
+      ds << monmap->fsid;
+      rdata.append(ds);
+    }
+    reply_command(op, 0, "", rdata, 0);
+    return;
+  }
+
+  if (prefix == "mon scrub") {
+    wait_for_paxos_write();
+    if (is_leader()) {
+      int r = scrub_start();
+      reply_command(op, r, "", rdata, 0);
+    } else if (is_peon()) {
+      forward_request_leader(op);
+    } else {
+      reply_command(op, -EAGAIN, "no quorum", rdata, 0);
+    }
+    return;
+  }
+
+  if (prefix == "time-sync-status") {
+    if (!f)
+      f.reset(Formatter::create("json-pretty"));
+    f->open_object_section("time_sync");
+    if (!timecheck_skews.empty()) {
+      f->open_object_section("time_skew_status");
+      for (auto& i : timecheck_skews) {
+        double skew = i.second;
+        double latency = timecheck_latencies[i.first];
+        string name = monmap->get_name(i.first);
+        ostringstream tcss;
+        health_status_t tcstatus = timecheck_status(tcss, skew, latency);
+        f->open_object_section(name.c_str());
+        f->dump_float("skew", skew);
+        f->dump_float("latency", latency);
+        f->dump_stream("health") << tcstatus;
+        if (tcstatus != HEALTH_OK) {
+          f->dump_stream("details") << tcss.str();
+        }
+        f->close_section();
+      }
+      f->close_section();
+    }
+    f->open_object_section("timechecks");
+    f->dump_unsigned("epoch", get_epoch());
+    f->dump_int("round", timecheck_round);
+    // an odd round number is reported as still running
+    f->dump_stream("round_status") << ((timecheck_round%2) ?
+                                       "on-going" : "finished");
+    f->close_section();
+    f->close_section();
+    f->flush(rdata);
+    r = 0;
+    rs = "";
+  } else if (prefix == "status" ||
+             prefix == "health" ||
+             prefix == "df") {
+    string detail;
+    cmd_getval(cmdmap, "detail", detail);
+
+    if (prefix == "status") {
+      // get_cluster_status handles f == NULL
+      get_cluster_status(ds, f.get(), session);
+
+      if (f) {
+        f->flush(ds);
+        ds << '\n';
+      }
+      rdata.append(ds);
+    } else if (prefix == "health") {
+      string plain;
+      healthmon()->get_health_status(detail == "detail", f.get(), f ? nullptr : &plain);
+      if (f) {
+        f->flush(ds);
+        rdata.append(ds);
+      } else {
+        rdata.append(plain);
+      }
+    } else if (prefix == "df") {
+      bool verbose = (detail == "detail");
+      if (f)
+        f->open_object_section("stats");
+
+      mgrstatmon()->dump_cluster_stats(&ds, f.get(), verbose);
+      if (!f) {
+        ds << "\n \n";
+      }
+      mgrstatmon()->dump_pool_stats(osdmon()->osdmap, &ds, f.get(), verbose);
+
+      if (f) {
+        f->close_section();
+        f->flush(ds);
+        ds << '\n';
+      }
+    } else {
+      ceph_abort_msg("We should never get here!");
+      return;
+    }
+    rdata.append(ds);
+    rs = "";
+    r = 0;
+  } else if (prefix == "report") {
+    // some of the report data is only known by leader, e.g. osdmap_clean_epochs
+    if (!is_leader() && !is_peon()) {
+      dout(10) << " waiting for quorum" << dendl;
+      waitfor_quorum.push_back(new C_RetryMessage(this, op));
+      return;
+    }
+    if (!is_leader()) {
+      forward_request_leader(op);
+      return;
+    }
+    // this must be formatted, in its current form
+    if (!f)
+      f.reset(Formatter::create("json-pretty"));
+    f->open_object_section("report");
+    f->dump_stream("cluster_fingerprint") << fingerprint;
+    f->dump_string("version", ceph_version_to_str());
+    f->dump_string("commit", git_version_to_str());
+    f->dump_stream("timestamp") << ceph_clock_now();
+
+    vector<string> tagsvec;
+    cmd_getval(cmdmap, "tags", tagsvec);
+    string tagstr = str_join(tagsvec, " ");
+    // NOTE(review): everything from the last space onwards is cut off. If
+    // str_join() leaves no trailing separator this drops the last of several
+    // tags - confirm whether that is intended.
+    if (!tagstr.empty())
+      tagstr = tagstr.substr(0, tagstr.find_last_of(' '));
+    f->dump_string("tag", tagstr);
+
+    healthmon()->get_health_status(true, f.get(), nullptr);
+
+    monmon()->dump_info(f.get());
+    osdmon()->dump_info(f.get());
+    mdsmon()->dump_info(f.get());
+    authmon()->dump_info(f.get());
+    mgrstatmon()->dump_info(f.get());
+    logmon()->dump_info(f.get());
+
+    paxos->dump_info(f.get());
+
+    f->close_section();
+    f->flush(rdata);
+
+    // the status string carries a crc32c of the report payload
+    ostringstream ss2;
+    ss2 << "report " << rdata.crc32c(CEPH_MON_PORT_LEGACY);
+    rs = ss2.str();
+    r = 0;
+  } else if (prefix == "osd last-stat-seq") {
+    int64_t osd = 0;
+    cmd_getval(cmdmap, "id", osd);
+    uint64_t seq = mgrstatmon()->get_last_osd_stat_seq(osd);
+    if (f) {
+      f->dump_unsigned("seq", seq);
+      f->flush(ds);
+    } else {
+      ds << seq;
+    }
+    // Append for both output modes. The formatter path used to flush into ds
+    // without ever copying it into rdata, so a formatted reply was empty.
+    rdata.append(ds);
+    rs = "";
+    r = 0;
+  } else if (prefix == "node ls") {
+    string node_type("all");
+    cmd_getval(cmdmap, "type", node_type);
+    if (!f)
+      f.reset(Formatter::create("json-pretty"));
+    if (node_type == "all") {
+      f->open_object_section("nodes");
+      print_nodes(f.get(), ds);
+      osdmon()->print_nodes(f.get());
+      mdsmon()->print_nodes(f.get());
+      mgrmon()->print_nodes(f.get());
+      f->close_section();
+    } else if (node_type == "mon") {
+      print_nodes(f.get(), ds);
+    } else if (node_type == "osd") {
+      osdmon()->print_nodes(f.get());
+    } else if (node_type == "mds") {
+      mdsmon()->print_nodes(f.get());
+    } else if (node_type == "mgr") {
+      mgrmon()->print_nodes(f.get());
+    }
+    f->flush(ds);
+    rdata.append(ds);
+    rs = "";
+    r = 0;
+  } else if (prefix == "features") {
+    if (!is_leader() && !is_peon()) {
+      dout(10) << " waiting for quorum" << dendl;
+      waitfor_quorum.push_back(new C_RetryMessage(this, op));
+      return;
+    }
+    if (!is_leader()) {
+      forward_request_leader(op);
+      return;
+    }
+    if (!f)
+      f.reset(Formatter::create("json-pretty"));
+    FeatureMap fm;
+    get_combined_feature_map(&fm);
+    f->dump_object("features", fm);
+    f->flush(rdata);
+    rs = "";
+    r = 0;
+  } else if (prefix == "mon metadata") {
+    if (!f)
+      f.reset(Formatter::create("json-pretty"));
+
+    string name;
+    bool all = !cmd_getval(cmdmap, "id", name);
+    if (!all) {
+      // Dump a single mon's metadata
+      int mon = monmap->get_rank(name);
+      if (mon < 0) {
+        rs = "requested mon not found";
+        r = -ENOENT;
+        goto out;
+      }
+      f->open_object_section("mon_metadata");
+      r = get_mon_metadata(mon, f.get(), ds);
+      f->close_section();
+    } else {
+      // Dump all mons' metadata
+      r = 0;
+      f->open_array_section("mon_metadata");
+      for (unsigned int rank = 0; rank < monmap->size(); ++rank) {
+        std::ostringstream get_err;
+        f->open_object_section("mon");
+        f->dump_string("name", monmap->get_name(rank));
+        r = get_mon_metadata(rank, f.get(), get_err);
+        f->close_section();
+        if (r == -ENOENT || r == -EINVAL) {
+          dout(1) << get_err.str() << dendl;
+          // Drop error, list what metadata we do have
+          r = 0;
+        } else if (r != 0) {
+          derr << "Unexpected error from get_mon_metadata: "
+               << cpp_strerror(r) << dendl;
+          ds << get_err.str();
+          break;
+        }
+      }
+      f->close_section();
+    }
+
+    f->flush(ds);
+    rdata.append(ds);
+    rs = "";
+  } else if (prefix == "mon versions") {
+    if (!f)
+      f.reset(Formatter::create("json-pretty"));
+    count_metadata("ceph_version", f.get());
+    f->flush(ds);
+    rdata.append(ds);
+    rs = "";
+    r = 0;
+  } else if (prefix == "mon count-metadata") {
+    if (!f)
+      f.reset(Formatter::create("json-pretty"));
+    string field;
+    cmd_getval(cmdmap, "property", field);
+    count_metadata(field, f.get());
+    f->flush(ds);
+    rdata.append(ds);
+    rs = "";
+    r = 0;
+  } else if (prefix == "quorum_status") {
+    // make sure our map is readable and up to date
+    if (!is_leader() && !is_peon()) {
+      dout(10) << " waiting for quorum" << dendl;
+      waitfor_quorum.push_back(new C_RetryMessage(this, op));
+      return;
+    }
+    _quorum_status(f.get(), ds);
+    rdata.append(ds);
+    rs = "";
+    r = 0;
+  } else if (prefix == "mon ok-to-stop") {
+    vector<string> ids, invalid_ids;
+    if (!cmd_getval(cmdmap, "ids", ids)) {
+      r = -EINVAL;
+      goto out;
+    }
+    // Start from the names currently in quorum and remove the ones that
+    // would be stopped.
+    set<string> wouldbe;
+    for (auto rank : quorum) {
+      wouldbe.insert(monmap->get_name(rank));
+    }
+    for (auto& n : ids) {
+      if (monmap->contains(n)) {
+        wouldbe.erase(n);
+      } else {
+        invalid_ids.push_back(n);
+      }
+    }
+    if (!invalid_ids.empty()) {
+      r = 0;
+      rs = "invalid mon(s) specified: " + stringify(invalid_ids);
+      goto out;
+    }
+
+    if (wouldbe.size() < monmap->min_quorum_size()) {
+      r = -EBUSY;
+      rs = "not enough monitors would be available (" + stringify(wouldbe) +
+        ") after stopping mons " + stringify(ids);
+      goto out;
+    }
+    r = 0;
+    rs = "quorum should be preserved (" + stringify(wouldbe) +
+      ") after stopping " + stringify(ids);
+  } else if (prefix == "mon ok-to-add-offline") {
+    if (quorum.size() < monmap->min_quorum_size(monmap->size() + 1)) {
+      rs = "adding a monitor may break quorum (until that monitor starts)";
+      r = -EBUSY;
+      goto out;
+    }
+    rs = "adding another mon that is not yet online will not break quorum";
+    r = 0;
+  } else if (prefix == "mon ok-to-rm") {
+    string id;
+    if (!cmd_getval(cmdmap, "id", id)) {
+      r = -EINVAL;
+      rs = "must specify a monitor id";
+      goto out;
+    }
+    if (!monmap->contains(id)) {
+      r = 0;
+      rs = "mon." + id + " does not exist";
+      goto out;
+    }
+    int rank = monmap->get_rank(id);
+    if (quorum.count(rank) &&
+        quorum.size() - 1 < monmap->min_quorum_size(monmap->size() - 1)) {
+      r = -EBUSY;
+      rs = "removing mon." + id + " would break quorum";
+      goto out;
+    }
+    r = 0;
+    rs = "safe to remove mon." + id;
+  } else if (prefix == "version") {
+    if (f) {
+      f->open_object_section("version");
+      f->dump_string("version", pretty_version_to_str());
+      f->close_section();
+      f->flush(ds);
+    } else {
+      ds << pretty_version_to_str();
+    }
+    rdata.append(ds);
+    rs = "";
+    r = 0;
+  } else if (prefix == "versions") {
+    if (!f)
+      f.reset(Formatter::create("json-pretty"));
+    // per-daemon-type "ceph_version" counts, summed up into `overall`
+    map<string,int> overall;
+    f->open_object_section("version");
+    map<string,int> mon, mgr, osd, mds;
+
+    count_metadata("ceph_version", &mon);
+    f->open_object_section("mon");
+    for (auto& p : mon) {
+      f->dump_int(p.first.c_str(), p.second);
+      overall[p.first] += p.second;
+    }
+    f->close_section();
+
+    mgrmon()->count_metadata("ceph_version", &mgr);
+    if (!mgr.empty()) {
+      f->open_object_section("mgr");
+      for (auto& p : mgr) {
+        f->dump_int(p.first.c_str(), p.second);
+        overall[p.first] += p.second;
+      }
+      f->close_section();
+    }
+
+    osdmon()->count_metadata("ceph_version", &osd);
+    if (!osd.empty()) {
+      f->open_object_section("osd");
+      for (auto& p : osd) {
+        f->dump_int(p.first.c_str(), p.second);
+        overall[p.first] += p.second;
+      }
+      f->close_section();
+    }
+
+    mdsmon()->count_metadata("ceph_version", &mds);
+    if (!mds.empty()) {
+      f->open_object_section("mds");
+      for (auto& p : mds) {
+        f->dump_int(p.first.c_str(), p.second);
+        overall[p.first] += p.second;
+      }
+      f->close_section();
+    }
+
+    for (auto& p : mgrstatmon()->get_service_map().services) {
+      auto &service = p.first;
+      if (ServiceMap::is_normal_ceph_entity(service)) {
+        continue;
+      }
+      f->open_object_section(service.c_str());
+      map<string,int> m;
+      p.second.count_metadata("ceph_version", &m);
+      for (auto& q : m) {
+        f->dump_int(q.first.c_str(), q.second);
+        overall[q.first] += q.second;
+      }
+      f->close_section();
+    }
+
+    f->open_object_section("overall");
+    for (auto& p : overall) {
+      f->dump_int(p.first.c_str(), p.second);
+    }
+    f->close_section();
+    f->close_section();
+    f->flush(rdata);
+    rs = "";
+    r = 0;
+  }
+
+ out:
+  if (!m->get_source().is_mon())  // don't reply to mon->mon commands
+    reply_command(op, r, rs, rdata, 0);
+}
+
+// Convenience overload: reply to a command without any payload data.
+void Monitor::reply_command(MonOpRequestRef op, int rc, const string &rs, version_t version)
+{
+  bufferlist no_payload;
+  reply_command(op, rc, rs, no_payload, version);
+}
+
+// Build an MMonCommandAck for the MMonCommand carried by op (same cmd and
+// tid), attach rdata as its data payload and send it via send_reply().
+void Monitor::reply_command(MonOpRequestRef op, int rc, const string &rs,
+                            bufferlist& rdata, version_t version)
+{
+  auto request = op->get_req<MMonCommand>();
+  ceph_assert(request->get_type() == MSG_MON_COMMAND);
+
+  auto *ack = new MMonCommandAck(request->cmd, rc, rs, version);
+  ack->set_tid(request->get_tid());
+  ack->set_data(rdata);
+
+  send_reply(op, ack);
+}
+
+// Answer an MCommand ("tell") with an MCommandReply carrying rc and rs. The
+// reply reuses the request's tid and is sent on the request's own connection.
+void Monitor::reply_tell_command(
+  MonOpRequestRef op, int rc, const string &rs)
+{
+  MCommand *request = static_cast<MCommand*>(op->get_req());
+  ceph_assert(request->get_type() == MSG_COMMAND);
+
+  auto *answer = new MCommandReply(rc, rs);
+  answer->set_tid(request->get_tid());
+  request->get_connection()->send_message(answer);
+}
+
+
+// ------------------------
+// request/reply routing
+//
+// a client/mds/osd will connect to a random monitor. we need to forward any
+// messages requiring state updates to the leader, and then route any replies
+// back via the correct monitor and back to them. (the monitor will not
+// initiate any connections.)
+
+// Forward the request carried by op to the current leader.
+//
+// Nothing is forwarded (only a debug line is logged) when the request comes
+// from a monitor with addresses other than ours, when the session is itself
+// a proxied one, or when the session is closed. Otherwise a RoutedRequest is
+// recorded under a fresh tid, so the reply can be routed back later, and an
+// MForward wrapping the request is sent to the leader.
+void Monitor::forward_request_leader(MonOpRequestRef op)
+{
+  op->mark_event(__func__);
+
+  int leader_rank = get_leader();
+  MonSession *session = op->get_session();
+  PaxosServiceMessage *req = op->get_req<PaxosServiceMessage>();
+
+  if (req->get_source().is_mon() && req->get_source_addrs() != messenger->get_myaddrs()) {
+    dout(10) << "forward_request won't forward (non-local) mon request " << *req << dendl;
+    return;
+  }
+  if (session->proxy_con) {
+    dout(10) << "forward_request won't double fwd request " << *req << dendl;
+    return;
+  }
+  if (session->closed) {
+    dout(10) << "forward_request no session for request " << *req << dendl;
+    return;
+  }
+
+  // Remember the request so the reply can find its way back.
+  RoutedRequest *routed = new RoutedRequest;
+  routed->tid = ++routed_request_tid;
+  routed->con = req->get_connection();
+  routed->con_features = routed->con->get_features();
+  encode_message(req, CEPH_FEATURES_ALL, routed->request_bl);   // for my use only; use all features
+  routed->session = static_cast<MonSession *>(session->get());
+  routed->op = op;
+  routed_requests[routed->tid] = routed;
+  session->routed_request_tids.insert(routed->tid);
+
+  dout(10) << "forward_request " << routed->tid << " request " << *req
+           << " features " << routed->con_features << dendl;
+
+  MForward *fwd = new MForward(routed->tid,
+                               req,
+                               routed->con_features,
+                               routed->session->caps);
+  fwd->set_priority(req->get_priority());
+  if (session->auth_handler) {
+    fwd->entity_name = session->entity_name;
+  } else if (req->get_source().is_mon()) {
+    fwd->entity_name.set_type(CEPH_ENTITY_TYPE_MON);
+  }
+  send_mon_message(fwd, leader_rank);
+  op->mark_forwarded();
+  ceph_assert(op->get_req()->get_type() != 0);
+}
+
+// Placeholder connection attached to forwarded messages. It is constructed
+// with a null messenger, reports itself as not connected, asserts on any
+// attempt to send through it, and only remembers the original peer's socket
+// address.
+struct AnonConnection : public Connection {
+  entity_addr_t socket_addr;
+
+  int send_message(Message *m) override
+  {
+    ceph_assert(!"send_message on anonymous connection");
+  }
+
+  void send_keepalive() override
+  {
+    ceph_assert(!"send_keepalive on anonymous connection");
+  }
+
+  // silently ignore
+  void mark_down() override {}
+
+  // silently ignore
+  void mark_disposable() override {}
+
+  bool is_connected() override
+  {
+    return false;
+  }
+
+  entity_addr_t get_peer_socket_addr() const override
+  {
+    return socket_addr;
+  }
+
+private:
+  FRIEND_MAKE_REF(AnonConnection);
+
+  explicit AnonConnection(CephContext *cct, const entity_addr_t& peer_socket_addr)
+    : Connection(cct, nullptr),
+      socket_addr(peer_socket_addr)
+  {}
+};
+
+// Unwrap an MForward: take the original message out of it, attach it to an
+// anonymous connection plus a session rebuilt from the forwarded identity and
+// caps, and feed it to the regular dispatch path. A forward coming from a
+// session without MON_CAP_X on "mon" is only logged.
+void Monitor::handle_forward(MonOpRequestRef op)
+{
+  auto m = op->get_req<MForward>();
+  dout(10) << "received forwarded message from "
+           << ceph_entity_type_name(m->client_type)
+           << " " << m->client_addrs
+           << " via " << m->get_source_inst() << dendl;
+  MonSession *session = op->get_session();
+  ceph_assert(session);
+
+  if (!session->is_capable("mon", MON_CAP_X)) {
+    dout(0) << "forward from entity with insufficient caps! "
+            << session->caps << dendl;
+    return;
+  }
+
+  // see PaxosService::dispatch(); we rely on this being anon
+  // (c->msgr == NULL)
+  PaxosServiceMessage *req = m->claim_message();
+  ceph_assert(req != NULL);
+
+  auto anon = ceph::make_ref<AnonConnection>(cct, m->client_socket_addr);
+  MonSession *fwd_session = new MonSession(static_cast<Connection*>(anon.get()));
+  fwd_session->_ident(req->get_source(),
+                      req->get_source_addrs());
+  anon->set_priv(RefCountedPtr{fwd_session, false});
+  anon->set_peer_addrs(m->client_addrs);
+  anon->set_peer_type(m->client_type);
+  anon->set_features(m->con_features);
+
+  // The forwarding monitor supplies the caps and entity name.
+  fwd_session->authenticated = true;
+  fwd_session->caps = m->client_caps;
+  dout(10) << " caps are " << fwd_session->caps << dendl;
+  fwd_session->entity_name = m->entity_name;
+  dout(10) << " entity name '" << fwd_session->entity_name << "' type "
+           << fwd_session->entity_name.get_type() << dendl;
+  fwd_session->proxy_con = m->get_connection();
+  fwd_session->proxy_tid = m->tid;
+
+  req->set_connection(anon);
+
+  // not super accurate, but better than nothing.
+  req->set_recv_stamp(m->get_recv_stamp());
+
+  /*
+   * note which election epoch this is; we will drop the message if
+   * there is a future election since our peers will resend routed
+   * requests in that case.
+   */
+  req->rx_election_epoch = get_epoch();
+
+  dout(10) << " mesg " << req << " from " << m->get_source_addr() << dendl;
+  _ms_dispatch(req);
+
+  // break the session <-> con ref loop by removing the con->session
+  // reference, which is no longer needed once the MonOpRequest is
+  // set up.
+  anon->set_priv(NULL);
+}
+
+ // Send reply for op: directly on the session's connection, or wrapped in an
+ // MRoute through the proxying monitor when the request was forwarded to us.
+ // The reply is dropped (and its ref released) when there is nowhere to send
+ // it.
+ void Monitor::send_reply(MonOpRequestRef op, Message *reply)
+ {
+   op->mark_event(__func__);
+
+   MonSession *session = op->get_session();
+   ceph_assert(session);
+   Message *req = op->get_req();
+   ConnectionRef con = op->get_connection();
+
+   reply->set_cct(g_ceph_context);
+   dout(2) << __func__ << " " << op << " " << reply << " " << *reply << dendl;
+
+   // undeliverable: the op has no connection, or the session has neither a
+   // direct nor a proxy connection
+   if (!con || (!session->con && !session->proxy_con)) {
+     dout(2) << "send_reply no connection, dropping reply " << *reply
+             << " to " << req << " " << *req << dendl;
+     reply->put();
+     op->mark_event("reply: no connection");
+     return;
+   }
+
+   if (!session->proxy_con) {
+     // plain, directly connected session
+     session->con->send_message(reply);
+     op->mark_event("reply: send");
+     return;
+   }
+
+   // forwarded request: hand the reply back to the proxy, tagged with its tid
+   dout(15) << "send_reply routing reply to " << con->get_peer_addr()
+            << " via " << session->proxy_con->get_peer_addr()
+            << " for request " << *req << dendl;
+   session->proxy_con->send_message(new MRoute(session->proxy_tid, reply));
+   op->mark_event("reply: send routed request");
+ }
+
+ // Record that op will get no reply.  For a forwarded request an MRoute with
+ // a null message is sent to the proxying monitor; otherwise this only logs
+ // and marks the op.
+ void Monitor::no_reply(MonOpRequestRef op)
+ {
+   MonSession *session = op->get_session();
+   Message *req = op->get_req();
+
+   if (!session->proxy_con) {
+     dout(10) << "no_reply to " << req->get_source_inst()
+              << " " << *req << dendl;
+     op->mark_event("no_reply");
+     return;
+   }
+
+   dout(10) << "no_reply to " << req->get_source_inst()
+            << " via " << session->proxy_con->get_peer_addr()
+            << " for request " << *req << dendl;
+   session->proxy_con->send_message(new MRoute(session->proxy_tid, nullptr));
+   op->mark_event("no_reply: send routed request");
+ }
+
+ // Handle an MRoute: look up the routed request it answers, pass the carried
+ // reply (if any) on to the original requester's connection, optionally send
+ // incremental osdmaps, and retire the routed request.
+ void Monitor::handle_route(MonOpRequestRef op)
+ {
+   auto m = op->get_req<MRoute>();
+   MonSession *session = op->get_session();
+   //check privileges
+   if (!session->is_capable("mon", MON_CAP_X)) {
+     dout(0) << "MRoute received from entity without appropriate perms! "
+             << dendl;
+     return;
+   }
+   if (m->msg) {
+     dout(10) << "handle_route tid " << m->session_mon_tid << " " << *m->msg
+              << dendl;
+   } else {
+     dout(10) << "handle_route tid " << m->session_mon_tid << " null" << dendl;
+   }
+
+   // look it up
+   const auto tid = m->session_mon_tid;
+   if (!tid) {
+     dout(10) << " not a routed request, ignoring" << dendl;
+     return;
+   }
+   auto found = routed_requests.find(tid);
+   if (found == routed_requests.end()) {
+     dout(10) << " don't have routed request tid " << tid << dendl;
+     return;
+   }
+
+   // take ownership; the RoutedRequest is freed when we leave this function
+   std::unique_ptr<RoutedRequest> rr{found->second};
+   if (m->msg) {
+     // reset payload, in case encoding is dependent on target features
+     m->msg->clear_payload();
+     rr->con->send_message(m->msg);
+     m->msg = nullptr;
+   }
+   if (m->send_osdmap_first) {
+     dout(10) << " sending osdmaps from " << m->send_osdmap_first << dendl;
+     osdmon()->send_incremental(m->send_osdmap_first, rr->session,
+                                true, MonOpRequestRef());
+   }
+   ceph_assert(rr->tid == tid && rr->session->routed_request_tids.count(tid));
+   routed_requests.erase(found);
+   rr->session->routed_request_tids.erase(tid);
+ }
+
+ // Re-drive every outstanding routed request.  If we are the leader
+ // ourselves, the requests are requeued locally and the routed_requests table
+ // is emptied; otherwise each request is re-decoded from its saved buffer and
+ // forwarded to the leader again (the table is kept).
+ void Monitor::resend_routed_requests()
+ {
+   dout(10) << "resend_routed_requests" << dendl;
+   const int leader = get_leader();
+
+   if (leader == rank) {
+     list<Context*> retry;
+     for (auto& entry : routed_requests) {
+       RoutedRequest *rr = entry.second;
+       dout(10) << " requeue for self tid " << rr->tid << dendl;
+       rr->op->mark_event("retry routed request");
+       retry.push_back(new C_RetryMessage(this, rr->op));
+       if (rr->session) {
+         ceph_assert(rr->session->routed_request_tids.count(entry.first));
+         rr->session->routed_request_tids.erase(entry.first);
+       }
+       delete rr;
+     }
+     routed_requests.clear();
+     finish_contexts(g_ceph_context, retry);
+     return;
+   }
+
+   for (auto& entry : routed_requests) {
+     RoutedRequest *rr = entry.second;
+     auto q = rr->request_bl.cbegin();
+     PaxosServiceMessage *req =
+       (PaxosServiceMessage *)decode_message(cct, 0, q);
+     rr->op->mark_event("resend forwarded message to leader");
+     dout(10) << " resend to mon." << leader << " tid " << rr->tid << " " << *req
+              << dendl;
+     MForward *forward = new MForward(rr->tid,
+                                      req,
+                                      rr->con_features,
+                                      rr->session->caps);
+     req->put();  // forward takes its own ref; drop ours.
+     forward->client_type = rr->con->get_peer_type();
+     forward->client_addrs = rr->con->get_peer_addrs();
+     forward->client_socket_addr = rr->con->get_peer_socket_addr();
+     forward->set_priority(req->get_priority());
+     send_mon_message(forward, leader);
+   }
+ }
+
+ // Tear down session s: free every routed request it still owns, detach it
+ // from its connection, remove it from session_map and update the session
+ // perf counters.  The session must have a connection and must not already
+ // be closed.
+ void Monitor::remove_session(MonSession *s)
+ {
+   dout(10) << "remove_session " << s << " " << s->name << " " << s->addrs
+            << " features 0x" << std::hex << s->con_features << std::dec << dendl;
+   ceph_assert(s->con);
+   ceph_assert(!s->closed);
+
+   for (const uint64_t tid : s->routed_request_tids) {
+     auto found = routed_requests.find(tid);
+     ceph_assert(found != routed_requests.end());
+     RoutedRequest *rr = found->second;
+     dout(10) << " dropping routed request " << rr->tid << dendl;
+     delete rr;
+     routed_requests.erase(found);
+   }
+   s->routed_request_tids.clear();
+
+   s->con->set_priv(nullptr);
+   session_map.remove_session(s);
+   logger->set(l_mon_num_sessions, session_map.get_size());
+   logger->inc(l_mon_session_rm);
+ }
+
+ // Remove every session in session_map while holding session_map_lock, then
+ // refresh the session-count perf counter.
+ void Monitor::remove_all_sessions()
+ {
+   std::lock_guard l(session_map_lock);
+   while (!session_map.sessions.empty()) {
+     // remove_session() calls session_map.remove_session() and already
+     // increments l_mon_session_rm; incrementing it here as well would
+     // count every removed session twice.
+     MonSession *s = session_map.sessions.front();
+     remove_session(s);
+   }
+   if (logger)
+     logger->set(l_mon_num_sessions, session_map.get_size());
+ }
+
+ // Send m to the monitor with the given rank, using the addresses the
+ // current monmap lists for that rank.
+ void Monitor::send_mon_message(Message *m, int rank)
+ {
+   // note: the 'rank' parameter shadows the Monitor::rank member here
+   const auto& peer_addrs = monmap->get_addrs(rank);
+   messenger->send_to_mon(m, peer_addrs);
+ }
+
+ /**
+  * Park or discard a client op that cannot be served right now.
+  *
+  * If the message was received within the last mon_lease seconds and its
+  * connection is still up, the op is queued on maybe_wait_for_quorum to be
+  * retried later.  Otherwise the connection is marked down so the client
+  * goes elsewhere, and the session is removed unless it is a proxied one.
+  *
+  * Which messages end up here is decided by the caller (_ms_dispatch()).
+  */
+ void Monitor::waitlist_or_zap_client(MonOpRequestRef op)
+ {
+   Message *m = op->get_req();
+   MonSession *s = op->get_session();
+   ConnectionRef con = op->get_connection();
+
+   // anything received before this point in time is considered stale
+   utime_t too_old = ceph_clock_now();
+   too_old -= g_ceph_context->_conf->mon_lease;
+
+   const bool recent = m->get_recv_stamp() > too_old;
+   if (recent && con->is_connected()) {
+     dout(5) << "waitlisting message " << *m << dendl;
+     maybe_wait_for_quorum.push_back(new C_RetryMessage(this, op));
+     op->mark_wait_for_quorum();
+     return;
+   }
+
+   dout(5) << "discarding message " << *m << " and sending client elsewhere" << dendl;
+   con->mark_down();
+   // proxied sessions aren't registered and don't have a con; don't remove
+   // those.
+   if (!s->proxy_con) {
+     std::lock_guard l(session_map_lock);
+     remove_session(s);
+   }
+   op->mark_zap();
+ }
+
+ void Monitor::_ms_dispatch(Message *m)
+ { /*
+ * Entry point for a message handed to this monitor; handle_forward() also
+ * re-injects unwrapped client requests through here.
+ *
+ * Wraps m in a MonOpRequest, finds the MonSession attached to the op (or
+ * creates and registers a new one), refreshes the session timeout and the
+ * identity fields taken from the auth handler, and finally either
+ * waitlists/zaps the client or passes the op on to dispatch_op().
+ *
+ * The message is dropped if the monitor is shutting down, if the session is
+ * already closed, or if a non-monitor sender without a session sends
+ * anything other than auth, mon_getmap or ping.
+ */
+ if (is_shutdown()) {
+ m->put();
+ return;
+ }
+
+ MonOpRequestRef op = op_tracker.create_request<MonOpRequest>(m);
+ bool src_is_mon = op->is_src_mon();
+ op->mark_event("mon:_ms_dispatch");
+ MonSession *s = op->get_session();
+ if (s && s->closed) {
+ return;
+ }
+
+ if (src_is_mon && s) {
+ ConnectionRef con = m->get_connection();
+ if (con->get_messenger() && con->get_features() != s->con_features) {
+ // only update features if this is a non-anonymous connection
+ dout(10) << __func__ << " feature change for " << m->get_source_inst()
+ << " (was " << s->con_features
+ << ", now " << con->get_features() << ")" << dendl;
+ // connection features changed - recreate session.
+ if (s->con && s->con != con) {
+ dout(10) << __func__ << " connection for " << m->get_source_inst()
+ << " changed from session; mark down and replace" << dendl;
+ s->con->mark_down();
+ }
+ if (s->item.is_on_list()) {
+ // forwarded messages' sessions are not in the sessions map and
+ // exist only while the op is being handled.
+ std::lock_guard l(session_map_lock);
+ remove_session(s);
+ }
+ s = nullptr; // forces the new-session path below
+ }
+ }
+
+ if (!s) {
+ // if the sender is not a monitor, make sure their first message for a
+ // session is an MAuth. If it is not, assume it's a stray message,
+ // and considering that we are creating a new session it is safe to
+ // assume that the sender hasn't authenticated yet, so we have no way
+ // of assessing whether we should handle it or not.
+ // (mon_getmap and ping are let through here as well.)
+ if (!src_is_mon && (m->get_type() != CEPH_MSG_AUTH &&
+ m->get_type() != CEPH_MSG_MON_GET_MAP &&
+ m->get_type() != CEPH_MSG_PING)) {
+ dout(1) << __func__ << " dropping stray message " << *m
+ << " from " << m->get_source_inst() << dendl;
+ return;
+ }
+
+ ConnectionRef con = m->get_connection();
+ { // session_map is only touched under session_map_lock
+ std::lock_guard l(session_map_lock);
+ s = session_map.new_session(m->get_source(),
+ m->get_source_addrs(),
+ con.get());
+ }
+ ceph_assert(s);
+ con->set_priv(RefCountedPtr{s, false}); // attach the session to the connection
+ dout(10) << __func__ << " new session " << s << " " << *s
+ << " features 0x" << std::hex
+ << s->con_features << std::dec << dendl;
+ op->set_session(s);
+
+ logger->set(l_mon_num_sessions, session_map.get_size());
+ logger->inc(l_mon_session_add);
+
+ if (src_is_mon) {
+ // give it monitor caps; the peer type has been authenticated
+ dout(5) << __func__ << " setting monitor caps on this connection" << dendl;
+ if (!s->caps.is_allow_all()) // but no need to repeatedly copy
+ s->caps = mon_caps;
+ s->authenticated = true;
+ }
+ } else {
+ dout(20) << __func__ << " existing session " << s << " for " << s->name
+ << dendl;
+ }
+
+ ceph_assert(s);
+
+ // push the session's expiry mon_session_timeout seconds into the future
+ s->session_timeout = ceph_clock_now();
+ s->session_timeout += g_conf()->mon_session_timeout;
+
+ if (s->auth_handler) { // copy identity from the auth handler onto the session
+ s->entity_name = s->auth_handler->get_entity_name();
+ s->global_id = s->auth_handler->get_global_id();
+ s->global_id_status = s->auth_handler->get_global_id_status();
+ }
+ dout(20) << " entity_name " << s->entity_name
+ << " global_id " << s->global_id
+ << " (" << s->global_id_status
+ << ") caps " << s->caps.get_str() << dendl;
+
+ if (!session_stretch_allowed(s, op)) {
+ return;
+ }
+ // non-monitor, non-ping traffic is held back while we are synchronizing,
+ // or while the session is unauthenticated and exited_quorum is set
+ if ((is_synchronizing() ||
+ (!s->authenticated && !exited_quorum.is_zero())) &&
+ !src_is_mon &&
+ m->get_type() != CEPH_MSG_PING) {
+ waitlist_or_zap_client(op);
+ } else {
+ dispatch_op(op);
+ }
+ return;
+ }
+
+ void Monitor::dispatch_op(MonOpRequestRef op)
+ { /*
+ * Route an op to its handler according to the message type.
+ *
+ * The checks run in a fixed order, each stage requiring more of the sender
+ * than the one before:
+ *   1. auth, ping and tell-command messages: handled for any open session;
+ *   2. everything else requires s->authenticated (and a sane
+ *      global_id_status);
+ *   3. mon_getmap, get_config and subscribe;
+ *   4. PaxosService messages and mon commands, whose caps are checked by
+ *      the receiving service/handler;
+ *   5. messages that need mon 'r' (get_version);
+ *   6. messages that are only accepted from other monitors.
+ * Anything that matches nothing is logged and dropped.
+ */
+ op->mark_event("mon:dispatch_op");
+ MonSession *s = op->get_session();
+ ceph_assert(s);
+ if (s->closed) {
+ dout(10) << " session closed, dropping " << op->get_req() << dendl;
+ return;
+ }
+
+ /* we will consider the default type as being 'monitor' until proven wrong */
+ op->set_type_monitor();
+ /* deal with all messages that do not necessarily need caps */
+ switch (op->get_req()->get_type()) {
+ // auth
+ case MSG_MON_GLOBAL_ID:
+ case MSG_MON_USED_PENDING_KEYS:
+ case CEPH_MSG_AUTH:
+ op->set_type_service();
+ /* no need to check caps here */
+ paxos_service[PAXOS_AUTH]->dispatch(op);
+ return;
+
+ case CEPH_MSG_PING:
+ handle_ping(op);
+ return;
+ case MSG_COMMAND:
+ op->set_type_command();
+ handle_tell_command(op);
+ return;
+ }
+
+ if (!op->get_session()->authenticated) { // everything below needs an authenticated session
+ dout(5) << __func__ << " " << op->get_req()->get_source_inst()
+ << " is not authenticated, dropping " << *(op->get_req())
+ << dendl;
+ return;
+ }
+
+ // global_id_status == NONE: all sessions for auth_none and krb,
+ // mon <-> mon sessions (including proxied sessions) for cephx
+ ceph_assert(s->global_id_status == global_id_status_t::NONE ||
+ s->global_id_status == global_id_status_t::NEW_OK ||
+ s->global_id_status == global_id_status_t::NEW_NOT_EXPOSED ||
+ s->global_id_status == global_id_status_t::RECLAIM_OK ||
+ s->global_id_status == global_id_status_t::RECLAIM_INSECURE);
+
+ // let mon_getmap through for "ping" (which doesn't reconnect)
+ // and "tell" (which reconnects but doesn't attempt to preserve
+ // its global_id and stays in NEW_NOT_EXPOSED, retrying until
+ // ->send_attempts reaches 0)
+ if (cct->_conf->auth_expose_insecure_global_id_reclaim &&
+ s->global_id_status == global_id_status_t::NEW_NOT_EXPOSED &&
+ op->get_req()->get_type() != CEPH_MSG_MON_GET_MAP) {
+ dout(5) << __func__ << " " << op->get_req()->get_source_inst()
+ << " may omit old_ticket on reconnects, discarding "
+ << *op->get_req() << " and forcing reconnect" << dendl;
+ ceph_assert(s->con && !s->proxy_con);
+ s->con->mark_down(); // force the client to reconnect
+ {
+ std::lock_guard l(session_map_lock);
+ remove_session(s);
+ }
+ op->mark_zap();
+ return;
+ }
+
+ switch (op->get_req()->get_type()) {
+ case CEPH_MSG_MON_GET_MAP:
+ handle_mon_get_map(op);
+ return;
+
+ case MSG_GET_CONFIG:
+ configmon()->handle_get_config(op);
+ return;
+
+ case CEPH_MSG_MON_SUBSCRIBE:
+ /* FIXME: check what's being subscribed, filter accordingly */
+ handle_subscribe(op);
+ return;
+ }
+
+ /* well, maybe the op belongs to a service... */
+ op->set_type_service();
+ /* deal with all messages which caps should be checked somewhere else */
+ switch (op->get_req()->get_type()) {
+
+ // OSDs
+ case CEPH_MSG_MON_GET_OSDMAP:
+ case CEPH_MSG_POOLOP:
+ case MSG_OSD_BEACON:
+ case MSG_OSD_MARK_ME_DOWN:
+ case MSG_OSD_MARK_ME_DEAD:
+ case MSG_OSD_FULL:
+ case MSG_OSD_FAILURE:
+ case MSG_OSD_BOOT:
+ case MSG_OSD_ALIVE:
+ case MSG_OSD_PGTEMP:
+ case MSG_OSD_PG_CREATED:
+ case MSG_REMOVE_SNAPS:
+ case MSG_MON_GET_PURGED_SNAPS:
+ case MSG_OSD_PG_READY_TO_MERGE:
+ paxos_service[PAXOS_OSDMAP]->dispatch(op);
+ return;
+
+ // MDSs
+ case MSG_MDS_BEACON:
+ case MSG_MDS_OFFLOAD_TARGETS:
+ paxos_service[PAXOS_MDSMAP]->dispatch(op);
+ return;
+
+ // Mgrs
+ case MSG_MGR_BEACON:
+ paxos_service[PAXOS_MGR]->dispatch(op);
+ return;
+
+ // MgrStat
+ case MSG_MON_MGR_REPORT:
+ case CEPH_MSG_STATFS:
+ case MSG_GETPOOLSTATS:
+ paxos_service[PAXOS_MGRSTAT]->dispatch(op);
+ return;
+
+ // log
+ case MSG_LOG:
+ paxos_service[PAXOS_LOG]->dispatch(op);
+ return;
+
+ // handle_command() does its own caps checking
+ case MSG_MON_COMMAND:
+ op->set_type_command();
+ handle_command(op);
+ return;
+ }
+
+ /* nop, looks like it's not a service message; revert back to monitor */
+ op->set_type_monitor();
+
+ /* messages we, the Monitor class, need to deal with
+ * but may be sent by clients. */
+
+ if (!op->get_session()->is_capable("mon", MON_CAP_R)) {
+ dout(5) << __func__ << " " << op->get_req()->get_source_inst()
+ << " not enough caps for " << *(op->get_req()) << " -- dropping"
+ << dendl;
+ return;
+ }
+
+ switch (op->get_req()->get_type()) {
+ // misc
+ case CEPH_MSG_MON_GET_VERSION:
+ handle_get_version(op);
+ return;
+ }
+
+ if (!op->is_src_mon()) { // the remaining message types are mon <-> mon only
+ dout(1) << __func__ << " unexpected monitor message from"
+ << " non-monitor entity " << op->get_req()->get_source_inst()
+ << " " << *(op->get_req()) << " -- dropping" << dendl;
+ return;
+ }
+
+ /* messages that should only be sent by another monitor */
+ switch (op->get_req()->get_type()) {
+
+ case MSG_ROUTE:
+ handle_route(op);
+ return;
+
+ case MSG_MON_PROBE:
+ handle_probe(op);
+ return;
+
+ // Sync (i.e., the new slurp, but on steroids)
+ case MSG_MON_SYNC:
+ handle_sync(op);
+ return;
+ case MSG_MON_SCRUB:
+ handle_scrub(op);
+ return;
+
+ /* log acks are sent from a monitor we sent the MLog to, and are
+ never sent by clients to us. */
+ case MSG_LOGACK:
+ log_client.handle_log_ack((MLogAck*)op->get_req());
+ return;
+
+ // monmap
+ case MSG_MON_JOIN:
+ op->set_type_service();
+ paxos_service[PAXOS_MONMAP]->dispatch(op);
+ return;
+
+ // paxos
+ case MSG_MON_PAXOS:
+ {
+ op->set_type_paxos();
+ auto pm = op->get_req<MMonPaxos>();
+ if (!op->get_session()->is_capable("mon", MON_CAP_X)) {
+ //can't send these!
+ return;
+ }
+
+ if (state == STATE_SYNCHRONIZING) {
+ // we are synchronizing. These messages would do us no
+ // good, thus just drop them and ignore them.
+ dout(10) << __func__ << " ignore paxos msg from "
+ << pm->get_source_inst() << dendl;
+ return;
+ }
+
+ // sanitize
+ if (pm->epoch > get_epoch()) { // peer is at a newer election epoch than us
+ bootstrap();
+ return;
+ }
+ if (pm->epoch != get_epoch()) { // message from an older epoch: ignore
+ return;
+ }
+
+ paxos->dispatch(op);
+ }
+ return;
+
+ // elector messages
+ case MSG_MON_ELECTION:
+ op->set_type_election_or_ping();
+ //check privileges here for simplicity
+ if (!op->get_session()->is_capable("mon", MON_CAP_X)) {
+ dout(0) << "MMonElection received from entity without enough caps!"
+ << op->get_session()->caps << dendl;
+ return;; // (the second ';' is a harmless empty statement)
+ }
+ if (!is_probing() && !is_synchronizing()) {
+ elector.dispatch(op);
+ }
+ return;
+
+ case MSG_MON_PING:
+ op->set_type_election_or_ping();
+ elector.dispatch(op);
+ return;
+
+ case MSG_FORWARD:
+ handle_forward(op);
+ return;
+
+ case MSG_TIMECHECK:
+ dout(5) << __func__ << " ignoring " << op << dendl;
+ return;
+ case MSG_TIMECHECK2:
+ handle_timecheck(op);
+ return;
+
+ case MSG_MON_HEALTH:
+ dout(5) << __func__ << " dropping deprecated message: "
+ << *op->get_req() << dendl;
+ break; // break, not return: the "dropping unexpected" log below is emitted too
+ case MSG_MON_HEALTH_CHECKS:
+ op->set_type_service();
+ paxos_service[PAXOS_HEALTH]->dispatch(op);
+ return;
+ }
+ dout(1) << "dropping unexpected " << *(op->get_req()) << dendl;
+ return;
+ }
+
+ // Answer an MPing with another MPing whose payload is a JSON "pong" object
+ // containing the health status and the monitor status.
+ void Monitor::handle_ping(MonOpRequestRef op)
+ {
+   auto m = op->get_req<MPing>();
+   dout(10) << __func__ << " " << *m << dendl;
+   MPing *reply = new MPing;
+
+   // build the JSON document
+   boost::scoped_ptr<Formatter> f(new JSONFormatter(true));
+   f->open_object_section("pong");
+   healthmon()->get_health_status(false, f.get(), nullptr);
+   get_mon_status(f.get());
+   f->close_section();
+
+   // serialize it and encode it as the reply payload
+   stringstream json;
+   f->flush(json);
+   bufferlist payload;
+   encode(json.str(), payload);
+   reply->set_payload(payload);
+
+   dout(10) << __func__ << " reply payload len " << reply->get_payload().length() << dendl;
+   m->get_connection()->send_message(reply);
+ }
+
+ // Reset all timecheck state and, if every quorum member advertises the
+ // nautilus mon feature, start the first round.
+ void Monitor::timecheck_start()
+ {
+   dout(10) << __func__ << dendl;
+   timecheck_cleanup();
+
+   const bool quorum_has_nautilus =
+     get_quorum_mon_features().contains_all(
+       ceph::features::mon::FEATURE_NAUTILUS);
+   if (!quorum_has_nautilus) {
+     return;
+   }
+   timecheck_start_round();
+ }
+
+ // Stop timechecking: log, then discard all timecheck state.
+ void Monitor::timecheck_finish()
+ {
+   dout(10) << __func__ << dendl;
+
+   timecheck_cleanup();
+ }
+
+ // Leader only: begin a new timecheck round unless one is still in progress
+ // and younger than three timecheck intervals; in every case (re)arm the
+ // timer event for the next attempt.  An odd timecheck_round means a round is
+ // in progress.
+ void Monitor::timecheck_start_round()
+ {
+   dout(10) << __func__ << " curr " << timecheck_round << dendl;
+   ceph_assert(is_leader());
+
+   if (monmap->size() == 1) {
+     ceph_abort_msg("We are alone; this shouldn't have been scheduled!");
+     return;
+   }
+
+   bool begin_new_round = true;
+   if (timecheck_round % 2) {
+     dout(10) << __func__ << " there's a timecheck going on" << dendl;
+     utime_t curr_time = ceph_clock_now();
+     double max = g_conf()->mon_timecheck_interval*3;
+     if (curr_time - timecheck_round_start < max) {
+       dout(10) << __func__ << " keep current round going" << dendl;
+       begin_new_round = false;
+     } else {
+       dout(10) << __func__
+                << " finish current timecheck and start new" << dendl;
+       timecheck_cancel_round();
+     }
+   }
+
+   if (begin_new_round) {
+     ceph_assert(timecheck_round % 2 == 0);
+     timecheck_acks = 0;
+     ++timecheck_round;
+     timecheck_round_start = ceph_clock_now();
+     dout(10) << __func__ << " new " << timecheck_round << dendl;
+
+     timecheck();
+   }
+
+   dout(10) << __func__ << " setting up next event" << dendl;
+   timecheck_reset_event();
+ }
+
+ // Close the round in progress (timecheck_round goes from odd to even).
+ // On success every quorum member must have acked; the results are reported
+ // to the peers and checked for skews.  On failure the peers that never
+ // answered are logged and forgotten.
+ void Monitor::timecheck_finish_round(bool success)
+ {
+   dout(10) << __func__ << " curr " << timecheck_round << dendl;
+   ceph_assert(timecheck_round % 2);
+   ++timecheck_round;
+   timecheck_round_start = utime_t();
+
+   if (!success) {
+     // the peer list is appended to the same log entry via *_dout, which is
+     // only valid between dout() and dendl
+     dout(10) << __func__ << " " << timecheck_waiting.size()
+              << " peers still waiting:";
+     for (auto& pending : timecheck_waiting) {
+       *_dout << " mon." << pending.first;
+     }
+     *_dout << dendl;
+     timecheck_waiting.clear();
+
+     dout(10) << __func__ << " finished to " << timecheck_round << dendl;
+     return;
+   }
+
+   ceph_assert(timecheck_waiting.empty());
+   ceph_assert(timecheck_acks == quorum.size());
+   timecheck_report();
+   timecheck_check_skews();
+ }
+
+ // Abandon the round in progress: finish it as unsuccessful.
+ void Monitor::timecheck_cancel_round()
+ {
+   constexpr bool success = false;
+   timecheck_finish_round(success);
+ }
+
+ // Drop all timecheck state: cancel the pending timer event, zero the round
+ // bookkeeping and forget every recorded skew, latency and outstanding ping.
+ void Monitor::timecheck_cleanup()
+ {
+   if (timecheck_event) {
+     timer.cancel_event(timecheck_event);
+     timecheck_event = nullptr;
+   }
+
+   // round bookkeeping
+   timecheck_round = 0;
+   timecheck_acks = 0;
+   timecheck_round_start = utime_t();
+   timecheck_rounds_since_clean = 0;
+
+   // per-peer data
+   timecheck_waiting.clear();
+   timecheck_skews.clear();
+   timecheck_latencies.clear();
+ }
+
+ // (Re)schedule the next timecheck round.  The delay is
+ // mon_timecheck_skew_interval * timecheck_rounds_since_clean, falling back
+ // to mon_timecheck_interval when that product is not positive or exceeds
+ // the regular interval.
+ void Monitor::timecheck_reset_event()
+ {
+   // at most one event may be pending
+   if (timecheck_event) {
+     timer.cancel_event(timecheck_event);
+     timecheck_event = nullptr;
+   }
+
+   double delay =
+     cct->_conf->mon_timecheck_skew_interval * timecheck_rounds_since_clean;
+   if (delay <= 0 || delay > cct->_conf->mon_timecheck_interval) {
+     delay = cct->_conf->mon_timecheck_interval;
+   }
+
+   dout(10) << __func__ << " delay " << delay
+            << " rounds_since_clean " << timecheck_rounds_since_clean
+            << dendl;
+
+   auto *start_next_round = new C_MonContext{this, [this](int) {
+       timecheck_start_round();
+     }};
+   timecheck_event = timer.add_event_after(delay, start_next_round);
+ }
+
+ // Leader only, between rounds: look through the recorded skews.  While any
+ // peer is skewed, timecheck_rounds_since_clean is bumped and the next round
+ // is rescheduled accordingly; the first clean result after skews schedules
+ // one more confirmation round and then resets the counter.
+ void Monitor::timecheck_check_skews()
+ {
+   dout(10) << __func__ << dendl;
+   ceph_assert(is_leader());
+   ceph_assert((timecheck_round % 2) == 0);
+   if (monmap->size() == 1) {
+     ceph_abort_msg("We are alone; we shouldn't have gotten here!");
+     return;
+   }
+   ceph_assert(timecheck_latencies.size() == timecheck_skews.size());
+
+   // every skewed peer is logged, so scan the whole map
+   bool any_skewed = false;
+   for (auto& entry : timecheck_skews) {
+     double abs_skew;
+     if (!timecheck_has_skew(entry.second, &abs_skew)) {
+       continue;
+     }
+     dout(10) << __func__
+              << " " << entry.first << " skew " << abs_skew << dendl;
+     any_skewed = true;
+   }
+
+   if (any_skewed) {
+     ++timecheck_rounds_since_clean;
+     timecheck_reset_event();
+     return;
+   }
+
+   if (timecheck_rounds_since_clean > 0) {
+     dout(1) << __func__
+             << " no clock skews found after " << timecheck_rounds_since_clean
+             << " rounds" << dendl;
+     // make sure the skews are really gone and not just a transient success
+     // this will run just once if not in the presence of skews again.
+     timecheck_rounds_since_clean = 1;
+     timecheck_reset_event();
+     timecheck_rounds_since_clean = 0;
+   }
+ }
+
+ // Leader only, between rounds: send every other quorum member an OP_REPORT
+ // carrying the skews and latencies recorded for all monitors.
+ void Monitor::timecheck_report()
+ {
+   dout(10) << __func__ << dendl;
+   ceph_assert(is_leader());
+   ceph_assert((timecheck_round % 2) == 0);
+   if (monmap->size() == 1) {
+     ceph_abort_msg("We are alone; we shouldn't have gotten here!");
+     return;
+   }
+
+   ceph_assert(timecheck_latencies.size() == timecheck_skews.size());
+   bool first_report = true;  // the per-mon details are only logged once
+   for (const int peer : quorum) {
+     if (monmap->get_name(peer) == name) {
+       continue;  // that's us
+     }
+
+     MTimeCheck2 *m = new MTimeCheck2(MTimeCheck2::OP_REPORT);
+     m->epoch = get_epoch();
+     m->round = timecheck_round;
+
+     for (auto& entry : timecheck_skews) {
+       const double skew = entry.second;
+       const double latency = timecheck_latencies[entry.first];
+
+       m->skews[entry.first] = skew;
+       m->latencies[entry.first] = latency;
+
+       if (first_report) {
+         dout(25) << __func__ << " mon." << entry.first
+                  << " latency " << latency
+                  << " skew " << skew << dendl;
+       }
+     }
+     first_report = false;
+     dout(10) << __func__ << " send report to mon." << peer << dendl;
+     send_mon_message(m, peer);
+   }
+ }
+
+ // Leader only, during a round: record ourselves as the zero-skew reference
+ // and send an OP_PING to every other quorum member, remembering when each
+ // ping was sent.
+ void Monitor::timecheck()
+ {
+   dout(10) << __func__ << dendl;
+   ceph_assert(is_leader());
+   if (monmap->size() == 1) {
+     ceph_abort_msg("We are alone; we shouldn't have gotten here!");
+     return;
+   }
+   ceph_assert(timecheck_round % 2 != 0);
+
+   timecheck_acks = 1; // we ack ourselves
+
+   dout(10) << __func__ << " start timecheck epoch " << get_epoch()
+            << " round " << timecheck_round << dendl;
+
+   // we are at the eye of the storm; the point of reference
+   timecheck_skews[rank] = 0.0;
+   timecheck_latencies[rank] = 0.0;
+
+   for (const int peer : quorum) {
+     if (monmap->get_name(peer) == name) {
+       continue;
+     }
+
+     // stamp the send time before building the message
+     utime_t sent_at = ceph_clock_now();
+     timecheck_waiting[peer] = sent_at;
+
+     MTimeCheck2 *ping = new MTimeCheck2(MTimeCheck2::OP_PING);
+     ping->epoch = get_epoch();
+     ping->round = timecheck_round;
+     dout(10) << __func__ << " send " << *ping << " to mon." << peer << dendl;
+     send_mon_message(ping, peer);
+   }
+ }
+
+ // Translate a skew bound into a health status.  Returns HEALTH_WARN and
+ // writes a description to ss when timecheck_has_skew() reports a skew,
+ // HEALTH_OK otherwise.  latency is only sanity-checked (must be >= 0).
+ health_status_t Monitor::timecheck_status(ostringstream &ss,
+                                           const double skew_bound,
+                                           const double latency)
+ {
+   ceph_assert(latency >= 0);
+
+   double abs_skew;
+   if (!timecheck_has_skew(skew_bound, &abs_skew)) {
+     return HEALTH_OK;
+   }
+
+   ss << "clock skew " << abs_skew << "s"
+      << " > max " << g_conf()->mon_clock_drift_allowed << "s";
+   return HEALTH_WARN;
+ }
+
+ // Leader side of the timecheck protocol: process an OP_PONG from a peer,
+ // update that peer's smoothed latency and bounded skew, and finish the round
+ // once every quorum member has answered.
+ void Monitor::handle_timecheck_leader(MonOpRequestRef op)
+ {
+   auto m = op->get_req<MTimeCheck2>();
+   dout(10) << __func__ << " " << *m << dendl;
+   /* handles PONG's */
+   ceph_assert(m->op == MTimeCheck2::OP_PONG);
+
+   const int other = m->get_source().num();
+   if (m->epoch < get_epoch()) {
+     dout(1) << __func__ << " got old timecheck epoch " << m->epoch
+             << " from " << other
+             << " curr " << get_epoch()
+             << " -- severely lagged? discard" << dendl;
+     return;
+   }
+   ceph_assert(m->epoch == get_epoch());
+
+   if (m->round < timecheck_round) {
+     dout(1) << __func__ << " got old round " << m->round
+             << " from " << other
+             << " curr " << timecheck_round << " -- discard" << dendl;
+     return;
+   }
+
+   utime_t curr_time = ceph_clock_now();
+
+   // the peer must still have a ping outstanding; consume its send stamp
+   auto pending = timecheck_waiting.find(other);
+   ceph_assert(pending != timecheck_waiting.end());
+   utime_t timecheck_sent = pending->second;
+   timecheck_waiting.erase(pending);
+   if (curr_time < timecheck_sent) {
+     // our clock was readjusted -- drop everything until it all makes sense.
+     dout(1) << __func__ << " our clock was readjusted --"
+             << " bump round and drop current check"
+             << dendl;
+     timecheck_cancel_round();
+     return;
+   }
+
+   /* update peer latencies */
+   const double latency = (double)(curr_time - timecheck_sent);
+
+   // first sample is stored as is; later samples are blended 80/20 with the
+   // stored value
+   auto [lat_it, first_sample] = timecheck_latencies.try_emplace(other, latency);
+   if (!first_sample) {
+     lat_it->second = ((lat_it->second*0.8)+(latency*0.2));
+   }
+
+   /*
+    * update skews
+    *
+    * Subtracting two utime_t where the left side is the smaller one does
+    * nasty things, so the arithmetic is done on doubles.
+    *
+    * latency is always expected to be >= 0.
+    *
+    * delta is the difference between the peer's timestamp and ours; it may
+    * be negative or positive and will hardly ever be 0.
+    *
+    * The absolute skew is the absolute delta minus the latency.  The latency
+    * is taken as a whole instead of as an rtt, because queueing and dispatch
+    * times are involved and it is hard to tell how long the message really
+    * took to reach the other side and be handled.  Hence "bounded skew": the
+    * worst case scenario.
+    *
+    * With latency >= 0, the bounded skew is
+    *
+    *  1. positive if |delta| is higher than the latency and delta > 0;
+    *  2. negative if |delta| is higher than the latency and delta < 0;
+    *  3. zero if |delta| is lower than the latency.
+    *
+    * Case 3 is a judgement call: when |delta| is lower than the latency,
+    * the apparent skew is treated as a side-effect of the high latency.
+    * A severely skewed clock could be masked by an even higher latency,
+    * but with latencies that high there are probably worse issues to deal
+    * with than skewed clocks.
+    */
+   ceph_assert(latency >= 0);
+
+   const double delta = ((double) m->timestamp) - ((double) curr_time);
+   const double abs_delta = (delta > 0 ? delta : -delta);
+   double skew_bound = abs_delta - latency;
+   if (skew_bound < 0) {
+     skew_bound = 0;
+   } else if (delta < 0) {
+     skew_bound = -skew_bound;
+   }
+
+   ostringstream ss;
+   health_status_t status = timecheck_status(ss, skew_bound, latency);
+   if (status != HEALTH_OK) {
+     clog->health(status) << other << " " << ss.str();
+   }
+
+   dout(10) << __func__ << " from " << other << " ts " << m->timestamp
+            << " delta " << delta << " skew_bound " << skew_bound
+            << " latency " << latency << dendl;
+
+   timecheck_skews[other] = skew_bound;
+
+   ++timecheck_acks;
+   if (timecheck_acks == quorum.size()) {
+     dout(10) << __func__ << " got pongs from everybody ("
+              << timecheck_acks << " total)" << dendl;
+     ceph_assert(timecheck_skews.size() == timecheck_acks);
+     ceph_assert(timecheck_waiting.empty());
+     // everyone has acked, so bump the round to finish it.
+     timecheck_finish_round();
+   }
+ }
+
+ // Peon side of the timecheck protocol.
+ //
+ // An OP_REPORT (even round number) replaces our skew/latency tables with
+ // the ones carried by the message.  An OP_PING (odd round number) is
+ // answered with an OP_PONG carrying our current clock.  Messages from a
+ // different epoch or from an older round are discarded.
+ void Monitor::handle_timecheck_peon(MonOpRequestRef op)
+ {
+   auto m = op->get_req<MTimeCheck2>();
+   dout(10) << __func__ << " " << *m << dendl;
+
+   ceph_assert(is_peon());
+   ceph_assert(m->op == MTimeCheck2::OP_PING || m->op == MTimeCheck2::OP_REPORT);
+
+   if (m->epoch != get_epoch()) {
+     dout(1) << __func__ << " got wrong epoch "
+             << "(ours " << get_epoch()
+             << " theirs: " << m->epoch << ") -- discarding" << dendl;
+     return;
+   }
+
+   if (m->round < timecheck_round) {
+     dout(1) << __func__ << " got old round " << m->round
+             << " current " << timecheck_round
+             << " (epoch " << get_epoch() << ") -- discarding" << dendl;
+     return;
+   }
+
+   // follow the round number carried by the message
+   timecheck_round = m->round;
+
+   if (m->op == MTimeCheck2::OP_REPORT) {
+     ceph_assert((timecheck_round % 2) == 0);
+     timecheck_latencies.swap(m->latencies);
+     timecheck_skews.swap(m->skews);
+     return;
+   }
+
+   ceph_assert((timecheck_round % 2) != 0);
+   MTimeCheck2 *reply = new MTimeCheck2(MTimeCheck2::OP_PONG);
+   utime_t curr_time = ceph_clock_now();
+   reply->timestamp = curr_time;
+   reply->epoch = m->epoch;
+   reply->round = m->round;
+   // log the message actually being sent (the pong), not the ping we got
+   dout(10) << __func__ << " send " << *reply
+            << " to " << m->get_source_inst() << dendl;
+   m->get_connection()->send_message(reply);
+ }
+
+ // Dispatch an MTimeCheck2 according to our role: the leader only accepts
+ // OP_PONG, a peon only OP_PING/OP_REPORT; anything else is logged and
+ // dropped.
+ void Monitor::handle_timecheck(MonOpRequestRef op)
+ {
+   auto m = op->get_req<MTimeCheck2>();
+   dout(10) << __func__ << " " << *m << dendl;
+
+   if (is_leader()) {
+     if (m->op == MTimeCheck2::OP_PONG) {
+       handle_timecheck_leader(op);
+     } else {
+       dout(1) << __func__ << " drop unexpected msg (not pong)" << dendl;
+     }
+     return;
+   }
+
+   if (is_peon()) {
+     if (m->op == MTimeCheck2::OP_PING || m->op == MTimeCheck2::OP_REPORT) {
+       handle_timecheck_peon(op);
+     } else {
+       dout(1) << __func__ << " drop unexpected msg (not ping or report)" << dendl;
+     }
+     return;
+   }
+
+   dout(1) << __func__ << " drop unexpected msg" << dendl;
+ }
+
+ void Monitor::handle_subscribe(MonOpRequestRef op)
+ { /*
+ * Process an MMonSubscribe: for every (name, item) pair in m->what,
+ * register or update the subscription on the session and immediately ask
+ * the owning service to check it.  Items the session lacks mon 'r' for
+ * are skipped, except "monmap" and "config", which need no caps.
+ * An MMonSubscribeAck is sent only if at least one accepted item is not
+ * one-time and the peer lacks CEPH_FEATURE_MON_STATEFUL_SUB.
+ */
+ auto m = op->get_req<MMonSubscribe>();
+ dout(10) << "handle_subscribe " << *m << dendl;
+
+ bool reply = false;
+
+ MonSession *s = op->get_session();
+ ceph_assert(s);
+
+ if (m->hostname.size()) {
+ s->remote_host = m->hostname;
+ }
+
+ for (map<string,ceph_mon_subscribe_item>::iterator p = m->what.begin();
+ p != m->what.end();
+ ++p) {
+ if (p->first == "monmap" || p->first == "config") {
+ // these require no caps
+ } else if (!s->is_capable("mon", MON_CAP_R)) {
+ dout(5) << __func__ << " " << op->get_req()->get_source_inst()
+ << " not enough caps for " << *(op->get_req()) << " -- dropping"
+ << dendl;
+ continue; // skips this item only; the remaining items are still processed
+ }
+
+ // if there are any non-onetime subscriptions, we need to reply to start the resubscribe timer
+ if ((p->second.flags & CEPH_SUBSCRIBE_ONETIME) == 0)
+ reply = true;
+
+ // remove conflicting subscribes
+ // (if logmon recognizes this name, drop every other sub on the session
+ // whose name logmon also recognizes)
+ if (logmon()->sub_name_to_id(p->first) >= 0) {
+ for (map<string, Subscription*>::iterator it = s->sub_map.begin();
+ it != s->sub_map.end(); ) {
+ if (it->first != p->first && logmon()->sub_name_to_id(it->first) >= 0) {
+ std::lock_guard l(session_map_lock);
+ session_map.remove_sub((it++)->second); // iterator advanced before the call; presumably remove_sub() erases the entry from s->sub_map -- confirm
+ } else {
+ ++it;
+ }
+ }
+ }
+
+ { // register/update the subscription under session_map_lock
+ std::lock_guard l(session_map_lock);
+ session_map.add_update_sub(s, p->first, p->second.start,
+ p->second.flags & CEPH_SUBSCRIBE_ONETIME,
+ m->get_connection()->has_feature(CEPH_FEATURE_INCSUBOSDMAP));
+ }
+
+ // hand the subscription to the owning service; mds/osd subs are only
+ // checked when the session holds the matching service cap
+ if (p->first.compare(0, 6, "mdsmap") == 0 || p->first.compare(0, 5, "fsmap") == 0) {
+ dout(10) << __func__ << ": MDS sub '" << p->first << "'" << dendl;
+ if ((int)s->is_capable("mds", MON_CAP_R)) {
+ Subscription *sub = s->sub_map[p->first];
+ ceph_assert(sub != nullptr);
+ mdsmon()->check_sub(sub);
+ }
+ } else if (p->first == "osdmap") {
+ if ((int)s->is_capable("osd", MON_CAP_R)) {
+ if (s->osd_epoch > p->second.start) {
+ // client needs earlier osdmaps on purpose, so reset the sent epoch
+ s->osd_epoch = 0;
+ }
+ osdmon()->check_osdmap_sub(s->sub_map["osdmap"]);
+ }
+ } else if (p->first == "osd_pg_creates") {
+ if ((int)s->is_capable("osd", MON_CAP_W)) {
+ osdmon()->check_pg_creates_sub(s->sub_map["osd_pg_creates"]);
+ }
+ } else if (p->first == "monmap") {
+ monmon()->check_sub(s->sub_map[p->first]);
+ } else if (logmon()->sub_name_to_id(p->first) >= 0) {
+ logmon()->check_sub(s->sub_map[p->first]);
+ } else if (p->first == "mgrmap" || p->first == "mgrdigest") {
+ mgrmon()->check_sub(s->sub_map[p->first]);
+ } else if (p->first == "servicemap") {
+ mgrstatmon()->check_sub(s->sub_map[p->first]);
+ } else if (p->first == "config") {
+ configmon()->check_sub(s); // takes the session, not a single Subscription
+ } else if (p->first.find("kv:") == 0) {
+ kvmon()->check_sub(s->sub_map[p->first]);
+ }
+ }
+
+ if (reply) {
+ // we only need to reply if the client is old enough to think it
+ // has to send renewals.
+ ConnectionRef con = m->get_connection();
+ if (!con->has_feature(CEPH_FEATURE_MON_STATEFUL_SUB))
+ m->get_connection()->send_message(new MMonSubscribeAck(
+ monmap->get_fsid(), (int)g_conf()->mon_subscribe_interval));
+ }
+
+ }
+
+ // Answer an MMonGetVersion query with the first/last committed version of
+ // the requested map. The op is queued for retry when we are out of quorum or
+ // the backing service is not readable; an unknown map name is logged and no
+ // reply is sent.
+ void Monitor::handle_get_version(MonOpRequestRef op)
+ {
+   auto m = op->get_req<MMonGetVersion>();
+   dout(10) << "handle_get_version " << *m << dendl;
+
+   MonSession *s = op->get_session();
+   ceph_assert(s);
+
+   if (!is_leader() && !is_peon()) {
+     dout(10) << " waiting for quorum" << dendl;
+     waitfor_quorum.push_back(new C_RetryMessage(this, op));
+     return;
+   }
+
+   // map the requested name onto the service that owns it
+   PaxosService *svc = nullptr;
+   if (m->what == "mdsmap" || m->what == "fsmap") {
+     svc = mdsmon();
+   } else if (m->what == "osdmap") {
+     svc = osdmon();
+   } else if (m->what == "monmap") {
+     svc = monmon();
+   } else {
+     derr << "invalid map type " << m->what << dendl;
+   }
+
+   if (!svc) {
+     return;
+   }
+
+   if (!svc->is_readable()) {
+     svc->wait_for_readable(op, new C_RetryMessage(this, op));
+     return;
+   }
+
+   MMonGetVersionReply *reply = new MMonGetVersionReply();
+   reply->handle = m->handle;
+   reply->version = svc->get_last_committed();
+   reply->oldest_version = svc->get_first_committed();
+   reply->set_tid(m->get_tid());
+
+   m->get_connection()->send_message(reply);
+ }
+
+ // Messenger callback for a reset connection.
+ // Returns false when the event is ignored (mon peer, no session attached,
+ // or we are shut down) and true once the session has been dealt with.
+ bool Monitor::ms_handle_reset(Connection *con)
+ {
+   dout(10) << "ms_handle_reset " << con << " " << con->get_peer_addr() << dendl;
+
+   // ignore lossless monitor sessions
+   if (con->get_peer_type() == CEPH_ENTITY_TYPE_MON) {
+     return false;
+   }
+
+   auto priv = con->get_priv();
+   MonSession *session = static_cast<MonSession*>(priv.get());
+   if (session == nullptr) {
+     return false;
+   }
+
+   // break any con <-> session ref cycle
+   session->con->set_priv(nullptr);
+
+   if (is_shutdown()) {
+     return false;
+   }
+
+   std::lock_guard mon_locker(lock);
+
+   dout(10) << "reset/close on session " << session->name << " " << session->addrs << dendl;
+   const bool still_registered = !session->closed && session->item.is_on_list();
+   if (still_registered) {
+     std::lock_guard sessions_locker(session_map_lock);
+     remove_session(session);
+   }
+   return true;
+ }
+
+ // Messenger callback for a refused connection. Nothing is acted upon yet:
+ // the event is logged and false is returned.
+ bool Monitor::ms_handle_refused(Connection *con)
+ {
+   dout(10) << "ms_handle_refused " << con << " " << con->get_peer_addr() << dendl;
+   return false;
+ }
+
+// -----
+
+ // Encode the current monmap with the features of `con` and send it on that
+ // connection as an MMonMap.
+ void Monitor::send_latest_monmap(Connection *con)
+ {
+   bufferlist encoded_monmap;
+   monmap->encode(encoded_monmap, con->get_features());
+   MMonMap *msg = new MMonMap(encoded_monmap);
+   con->send_message(msg);
+ }
+
+ // Reply to an MMonGetMap by sending the latest monmap back on the
+ // connection the request arrived on.
+ void Monitor::handle_mon_get_map(MonOpRequestRef op)
+ {
+   auto request = op->get_req<MMonGetMap>();
+   dout(10) << "handle_mon_get_map" << dendl;
+   send_latest_monmap(request->get_connection().get());
+ }
+
+ // Load "last_metadata" from the store into mon_metadata and seed
+ // pending_metadata with the same content.
+ // Returns 0 on success, otherwise the non-zero result of store->get().
+ int Monitor::load_metadata()
+ {
+   bufferlist stored;
+   const int rc = store->get(MONITOR_STORE_PREFIX, "last_metadata", stored);
+   if (rc != 0) {
+     return rc;
+   }
+
+   auto cursor = stored.cbegin();
+   decode(mon_metadata, cursor);
+   pending_metadata = mon_metadata;
+   return 0;
+ }
+
+ // Dump every metadata key/value recorded for monitor `mon` into `f`.
+ // Returns 0 on success; writes a message to `err` and returns -EINVAL when
+ // no metadata is recorded for that monitor.
+ int Monitor::get_mon_metadata(int mon, Formatter *f, ostream& err)
+ {
+   ceph_assert(f);
+
+   auto found = mon_metadata.find(mon);
+   if (found == mon_metadata.end()) {
+     err << "mon." << mon << " not found";
+     return -EINVAL;
+   }
+
+   const Metadata& entries = found->second;
+   for (const auto& kv : entries) {
+     f->dump_string(kv.first.c_str(), kv.second);
+   }
+   return 0;
+ }
+
+ // Tally, across all monitors with recorded metadata, how often each value of
+ // `field` occurs. Monitors lacking the field are counted under "unknown".
+ // Counts are added to whatever `*out` already holds.
+ void Monitor::count_metadata(const string& field, map<string,int> *out)
+ {
+   for (auto& entry : mon_metadata) {
+     auto& md = entry.second;
+     auto hit = md.find(field);
+     if (hit != md.end()) {
+       (*out)[hit->second]++;
+     } else {
+       (*out)["unknown"]++;
+     }
+   }
+ }
+
+ // Formatter flavour of count_metadata(): emits an object section named after
+ // `field` containing one integer per distinct value.
+ void Monitor::count_metadata(const string& field, Formatter *f)
+ {
+   map<string,int> tally;
+   count_metadata(field, &tally);
+
+   f->open_object_section(field.c_str());
+   for (auto& kv : tally) {
+     f->dump_int(kv.first.c_str(), kv.second);
+   }
+   f->close_section();
+ }
+
+ // Collect version information from the monitors themselves and then from the
+ // osd, mgr and mds monitors (in that order) into `versions`.
+ void Monitor::get_all_versions(std::map<string, list<string> > &versions)
+ {
+   get_versions(versions);            // mon
+   osdmon()->get_versions(versions);  // osd
+   mgrmon()->get_versions(versions);  // mgr
+   mdsmon()->get_versions(versions);  // mds
+
+   dout(20) << __func__ << " all versions=" << versions << dendl;
+ }
+
+ // Append "mon.<name>" to versions[<ceph_version_short>] for every monitor
+ // whose metadata carries a "ceph_version_short" entry.
+ void Monitor::get_versions(std::map<string, list<string> > &versions)
+ {
+   for (auto& [rank, metadata] : mon_metadata) {
+     auto ver = metadata.find("ceph_version_short");
+     if (ver != metadata.end()) {
+       versions[ver->second].push_back(string("mon.") + monmap->get_name(rank));
+     }
+     // a missing version entry is not likely; such monitors are skipped
+   }
+ }
+
+ // Group monitor names by the "hostname" in their metadata and dump the
+ // result through dump_services() under the "mon" type. Always returns 0;
+ // `err` is not written to.
+ int Monitor::print_nodes(Formatter *f, ostream& err)
+ {
+   map<string, list<string> > mons_by_host;  // hostname => mon
+   for (auto& [mon_rank, md] : mon_metadata) {
+     auto host = md.find("hostname");
+     if (host != md.end()) {
+       mons_by_host[host->second].push_back(monmap->get_name(mon_rank));
+     }
+     // missing hostname: not likely though, and simply skipped
+   }
+
+   dump_services(f, mons_by_host, "mon");
+   return 0;
+ }
+
+// ----------------------------------------------
+// scrub
+
+ // Begin a new scrub (leader only).
+ // Returns -EBUSY if scrub_result is non-empty (a scrub is already running),
+ // otherwise cancels the pending scrub event, resets state, runs the first
+ // scrub() round and returns 0.
+ int Monitor::scrub_start()
+ {
+   dout(10) << __func__ << dendl;
+   ceph_assert(is_leader());
+
+   const bool in_progress = !scrub_result.empty();
+   if (in_progress) {
+     clog->info() << "scrub already in progress";
+     return -EBUSY;
+   }
+
+   scrub_event_cancel();
+   scrub_result.clear();
+   scrub_state.reset(new ScrubState);
+
+   scrub();
+   return 0;
+ }
+
+ // Run one scrub round on the leader: ask every other quorum member to scrub
+ // the next chunk, scrub the same chunk locally, then arm the result timeout.
+ // With a single-member quorum the whole store is scrubbed in one go and the
+ // scrub is finished immediately. Always returns 0.
+ int Monitor::scrub()
+ {
+   ceph_assert(is_leader());
+   ceph_assert(scrub_state);
+
+   scrub_cancel_timeout();
+   wait_for_paxos_write();
+   scrub_version = paxos->get_version();
+
+   // scrub all keys if we're the only monitor in the quorum
+   int32_t num_keys =
+     (quorum.size() == 1 ? -1 : cct->_conf->mon_scrub_max_keys);
+
+   for (int peer : quorum) {
+     if (peer == rank) {
+       continue;
+     }
+     MMonScrub *req = new MMonScrub(MMonScrub::OP_SCRUB, scrub_version,
+                                    num_keys);
+     req->key = scrub_state->last_key;
+     send_mon_message(req, peer);
+   }
+
+   // scrub my keys
+   const bool more_to_scrub = _scrub(&scrub_result[rank],
+                                     &scrub_state->last_key,
+                                     &num_keys);
+   scrub_state->finished = !more_to_scrub;
+
+   // only after we got our scrub results do we really care whether the
+   // other monitors are late on their results. Also, this way we avoid
+   // triggering the timeout if we end up getting stuck in _scrub() for
+   // longer than the duration of the timeout.
+   scrub_reset_timeout();
+
+   if (quorum.size() == 1) {
+     ceph_assert(scrub_state->finished == true);
+     scrub_finish();
+   }
+   return 0;
+ }
+ /*
+  * Handle an MMonScrub message.
+  *
+  * OP_SCRUB (acted on only when we are a peon): scrub the local store starting
+  * at m->key and send an OP_RESULT back on the same connection.
+  * OP_RESULT (acted on only when we are the leader): record the sender's
+  * result; once scrub_result holds one entry per quorum member, compare them
+  * and either finish the scrub or start the next round.
+  */
+ void Monitor::handle_scrub(MonOpRequestRef op)
+ {
+ auto m = op->get_req<MMonScrub>();
+ dout(10) << __func__ << " " << *m << dendl;
+ switch (m->op) {
+ case MMonScrub::OP_SCRUB:
+ {
+ if (!is_peon())
+ break;
+
+ wait_for_paxos_write();
+
+ if (m->version != paxos->get_version()) // request is for a different paxos version than ours: ignore it
+ break;
+
+ MMonScrub *reply = new MMonScrub(MMonScrub::OP_RESULT,
+ m->version,
+ m->num_keys);
+
+ reply->key = m->key; // _scrub() reads the start key from here and overwrites it with the last key scrubbed
+ _scrub(&reply->result, &reply->key, &reply->num_keys);
+ m->get_connection()->send_message(reply);
+ }
+ break;
+
+ case MMonScrub::OP_RESULT:
+ {
+ if (!is_leader())
+ break;
+ if (m->version != scrub_version) // result does not belong to the current scrub round
+ break;
+ // reset the timeout each time we get a result
+ scrub_reset_timeout();
+
+ int from = m->get_source().num();
+ ceph_assert(scrub_result.count(from) == 0); // asserts on a second result from the same source in one round
+ scrub_result[from] = m->result;
+
+ if (scrub_result.size() == quorum.size()) { // every quorum member (including us) has reported
+ scrub_check_results();
+ scrub_result.clear();
+ if (scrub_state->finished)
+ scrub_finish();
+ else
+ scrub();
+ }
+ }
+ break;
+ }
+ }
+ /*
+  * Scrub up to *num_keys keys from the store, starting at *start.
+  *
+  * For every key visited, r->prefix_keys[prefix] is incremented and the
+  * value's crc32c is chained into r->prefix_crc[prefix]. The "paxos" prefix
+  * is excluded.
+  *
+  * @param r        result accumulator (must not be null)
+  * @param start    in: key to start from; out: last key scrubbed (a
+  *                 default-constructed pair if no key was scrubbed)
+  * @param num_keys in: key limit, only enforced when > 0; out: number of keys
+  *                 actually scrubbed
+  * @return the value of it->has_next_chunk() after the loop, i.e. whether the
+  *         synchronizer still has more to iterate
+  */
+ bool Monitor::_scrub(ScrubResult *r,
+ pair<string,string> *start,
+ int *num_keys)
+ {
+ ceph_assert(r != NULL);
+ ceph_assert(start != NULL);
+ ceph_assert(num_keys != NULL);
+
+ set<string> prefixes = get_sync_targets_names();
+ prefixes.erase("paxos"); // exclude paxos, as this one may have extra states for proposals, etc.
+
+ dout(10) << __func__ << " start (" << *start << ")"
+ << " num_keys " << *num_keys << dendl;
+
+ MonitorDBStore::Synchronizer it = store->get_synchronizer(*start, prefixes);
+
+ int scrubbed_keys = 0;
+ pair<string,string> last_key;
+
+ while (it->has_next_chunk()) {
+
+ if (*num_keys > 0 && scrubbed_keys == *num_keys) // limit reached; a non-positive limit means "no limit"
+ break;
+
+ pair<string,string> k = it->get_next_key();
+ if (prefixes.count(k.first) == 0) // key outside the scrubbed prefixes: skip without counting it
+ continue;
+
+ // test hook: with the configured probability, pretend this key is missing
+ if (cct->_conf->mon_scrub_inject_missing_keys > 0.0 &&
+ (rand() % 10000 < cct->_conf->mon_scrub_inject_missing_keys*10000.0)) {
+ dout(10) << __func__ << " inject missing key, skipping (" << k << ")"
+ << dendl;
+ continue;
+ }
+
+ bufferlist bl;
+ int err = store->get(k.first, k.second, bl);
+ ceph_assert(err == 0);
+
+ uint32_t key_crc = bl.crc32c(0); // per-key crc, used only by the debug line below
+ dout(30) << __func__ << " " << k << " bl " << bl.length() << " bytes"
+ << " crc " << key_crc << dendl;
+ r->prefix_keys[k.first]++;
+ if (r->prefix_crc.count(k.first) == 0) {
+ r->prefix_crc[k.first] = 0;
+ }
+ r->prefix_crc[k.first] = bl.crc32c(r->prefix_crc[k.first]); // chain this value into the running per-prefix crc
+
+ // test hook: with the configured probability, corrupt the running crc
+ if (cct->_conf->mon_scrub_inject_crc_mismatch > 0.0 &&
+ (rand() % 10000 < cct->_conf->mon_scrub_inject_crc_mismatch*10000.0)) {
+ dout(10) << __func__ << " inject failure at (" << k << ")" << dendl;
+ r->prefix_crc[k.first] += 1;
+ }
+
+ ++scrubbed_keys;
+ last_key = k;
+ }
+
+ dout(20) << __func__ << " last_key (" << last_key << ")"
+ << " scrubbed_keys " << scrubbed_keys
+ << " has_next " << it->has_next_chunk() << dendl;
+
+ // report progress back through the in/out parameters
+ *start = last_key;
+ *num_keys = scrubbed_keys;
+
+ return it->has_next_chunk();
+ }
+
+ // Compare every collected scrub result against our own and report each
+ // mismatch to the cluster log; log a debug "scrub ok" line when all agree.
+ void Monitor::scrub_check_results()
+ {
+   dout(10) << __func__ << dendl;
+
+   // compare
+   int mismatches = 0;
+   ScrubResult& mine = scrub_result[rank];
+   for (auto& entry : scrub_result) {
+     const int peer = entry.first;
+     ScrubResult& theirs = entry.second;
+     if (peer == rank) {
+       continue;
+     }
+     if (theirs != mine) {
+       ++mismatches;
+       clog->error() << "scrub mismatch";
+       clog->error() << " mon." << rank << " " << mine;
+       clog->error() << " mon." << peer << " " << theirs;
+     }
+   }
+
+   if (mismatches == 0) {
+     clog->debug() << "scrub ok on " << quorum << ": " << mine;
+   }
+ }
+
+ // Timeout handler for a scrub round: throw away the current scrub state
+ // and start over.
+ inline void Monitor::scrub_timeout()
+ {
+   dout(1) << __func__ << " restarting scrub" << dendl;
+
+   scrub_reset();
+   scrub_start();
+ }
+
+ // Wrap up a completed scrub: clear the scrub state and schedule the next
+ // scrub event.
+ void Monitor::scrub_finish()
+ {
+   dout(10) << __func__ << dendl;
+
+   scrub_reset();
+   scrub_event_start();
+ }
+
+ // Drop all in-flight scrub state: cancel the timeout, zero the version,
+ // and discard collected results and the ScrubState.
+ void Monitor::scrub_reset()
+ {
+   dout(10) << __func__ << dendl;
+
+   scrub_cancel_timeout();
+
+   scrub_version = 0;
+   scrub_result.clear();
+   scrub_state.reset();
+ }
+
+ // React to a changed scrub interval by rescheduling the scrub event.
+ // Does nothing when we are not the leader or when a scrub is in progress.
+ inline void Monitor::scrub_update_interval(ceph::timespan interval)
+ {
+   // we don't care about changes if we are not the leader.
+   // changes will be visible if we become the leader.
+   if (!is_leader()) {
+     return;
+   }
+
+   dout(1) << __func__ << " new interval = " << interval << dendl;
+
+   // if scrub already in progress, all changes will already be visible during
+   // the next round. Nothing to do.
+   if (scrub_state) {
+     return;
+   }
+
+   // reschedule so the event picks up the new interval
+   scrub_event_cancel();
+   scrub_event_start();
+ }
+
+ // (Re)schedule the timer event that kicks off scrub_start() after
+ // mon_scrub_interval. A zero interval disables scheduling.
+ void Monitor::scrub_event_start()
+ {
+   dout(10) << __func__ << dendl;
+
+   // never keep two scrub events pending
+   if (scrub_event) {
+     scrub_event_cancel();
+   }
+
+   auto scrub_interval =
+     cct->_conf.get_val<std::chrono::seconds>("mon_scrub_interval");
+   const bool disabled = (scrub_interval == std::chrono::seconds::zero());
+   if (disabled) {
+     dout(1) << __func__ << " scrub event is disabled"
+             << " (mon_scrub_interval = " << scrub_interval
+             << ")" << dendl;
+     return;
+   }
+
+   auto *on_fire = new C_MonContext{this, [this](int) {
+     scrub_start();
+   }};
+   scrub_event = timer.add_event_after(scrub_interval, on_fire);
+ }
+
+ // Cancel the pending scrub event, if there is one.
+ void Monitor::scrub_event_cancel()
+ {
+   dout(10) << __func__ << dendl;
+
+   if (!scrub_event) {
+     return;
+   }
+   timer.cancel_event(scrub_event);
+   scrub_event = NULL;
+ }
+
+ // Cancel the pending scrub timeout event, if there is one.
+ inline void Monitor::scrub_cancel_timeout()
+ {
+   if (!scrub_timeout_event) {
+     return;
+   }
+   timer.cancel_event(scrub_timeout_event);
+   scrub_timeout_event = NULL;
+ }
+
+ // Re-arm the scrub timeout: cancel any pending timeout event and schedule
+ // scrub_timeout() to run after mon_scrub_timeout.
+ void Monitor::scrub_reset_timeout()
+ {
+   dout(15) << __func__ << " reset timeout event" << dendl;
+   scrub_cancel_timeout();
+
+   auto *on_timeout = new C_MonContext{this, [this](int) {
+     scrub_timeout();
+   }};
+   scrub_timeout_event = timer.add_event_after(
+     g_conf()->mon_scrub_timeout,
+     on_timeout);
+ }
+
+/************ TICK ***************/
+ // Schedule the next tick() to run after mon_tick_interval.
+ void Monitor::new_tick()
+ {
+   auto *on_tick = new C_MonContext{this, [this](int) {
+     tick();
+   }};
+   timer.add_event_after(g_conf()->mon_tick_interval, on_tick);
+ }
+ /*
+  * Periodic housekeeping; re-armed via new_tick() at the end of every run.
+  *
+  * In order: (leader only) log changed health checks that are already tracked
+  * in health_check_log_times, tick and trim every paxos service, trim client
+  * sessions, trim sync providers, complete maybe_wait_for_quorum waiters,
+  * (leader only) propose a cluster fingerprint if none is set, and hand the
+  * current health metrics to mgr_client.
+  */
+ void Monitor::tick()
+ {
+ // ok go.
+ dout(11) << "tick" << dendl;
+ const utime_t now = ceph_clock_now();
+
+ // Check if we need to emit any delayed health check updated messages
+ if (is_leader()) {
+ const auto min_period = g_conf().get_val<int64_t>(
+ "mon_health_log_update_period");
+ for (auto& svc : paxos_service) {
+ auto health = svc->get_health_checks();
+
+ for (const auto &i : health.checks) {
+ const std::string &code = i.first;
+ const std::string &summary = i.second.summary;
+ const health_status_t severity = i.second.severity;
+
+ auto status_iter = health_check_log_times.find(code);
+ if (status_iter == health_check_log_times.end()) { // check has no log-time entry yet: nothing to update here
+ continue;
+ }
+
+ auto &log_status = status_iter->second;
+ bool const changed = log_status.last_message != summary
+ || log_status.severity != severity;
+
+ if (changed && now - log_status.updated_at > min_period) { // rate-limit: at most one update per min_period
+ log_status.last_message = summary;
+ log_status.updated_at = now;
+ log_status.severity = severity;
+
+ ostringstream ss;
+ ss << "Health check update: " << summary << " (" << code << ")";
+ clog->health(severity) << ss.str();
+ }
+ }
+ }
+ }
+
+
+ for (auto& svc : paxos_service) {
+ svc->tick();
+ svc->maybe_trim();
+ }
+
+ // trim sessions
+ {
+ std::lock_guard l(session_map_lock);
+ auto p = session_map.sessions.begin();
+
+ bool out_for_too_long = (!exited_quorum.is_zero() &&
+ now > (exited_quorum + 2*g_conf()->mon_lease));
+
+ while (!p.end()) {
+ MonSession *s = *p;
+ ++p; // advance first: the session may be removed below
+
+ // don't trim monitors
+ if (s->name.is_mon())
+ continue;
+
+ if (s->session_timeout < now && s->con) {
+ // check keepalive, too
+ s->session_timeout = s->con->get_last_keepalive();
+ s->session_timeout += g_conf()->mon_session_timeout;
+ }
+ if (s->session_timeout < now) {
+ dout(10) << " trimming session " << s->con << " " << s->name
+ << " " << s->addrs
+ << " (timeout " << s->session_timeout
+ << " < now " << now << ")" << dendl;
+ } else if (out_for_too_long) {
+ // boot the client Session because we've taken too long getting back in
+ dout(10) << " trimming session " << s->con << " " << s->name
+ << " because we've been out of quorum too long" << dendl;
+ } else {
+ continue;
+ }
+
+ s->con->mark_down(); // NOTE(review): s->con is null-checked above but dereferenced unconditionally here -- confirm it cannot be null
+ remove_session(s);
+ logger->inc(l_mon_session_trim);
+ }
+ }
+ sync_trim_providers();
+
+ if (!maybe_wait_for_quorum.empty()) {
+ finish_contexts(g_ceph_context, maybe_wait_for_quorum);
+ }
+
+ if (is_leader() && paxos->is_active() && fingerprint.is_zero()) {
+ // this is only necessary on upgraded clusters.
+ MonitorDBStore::TransactionRef t = paxos->get_pending_transaction();
+ prepare_new_fingerprint(t);
+ paxos->trigger_propose();
+ }
+
+ mgr_client.update_daemon_health(get_health_metrics());
+ new_tick(); // schedule the next run
+ }
+
+ // Build the daemon health metrics reported to the mgr. Currently a single
+ // SLOW_OPS metric: the number of in-flight ops initiated more than
+ // mon_op_complaint_time ago, plus the age reported by the op tracker.
+ vector<DaemonHealthMetric> Monitor::get_health_metrics()
+ {
+   const utime_t now = ceph_clock_now();
+   auto too_old = now;
+   too_old -= g_conf().get_val<std::chrono::seconds>("mon_op_complaint_time").count();
+
+   int slow = 0;
+   TrackedOpRef oldest_op;
+   // visitor: counts ops older than the cutoff and remembers the oldest one
+   auto count_slow_ops = [&](TrackedOp& op) {
+     if (!(op.get_initiated() < too_old)) {
+       return false;
+     }
+     slow++;
+     if (!oldest_op || op.get_initiated() < oldest_op->get_initiated()) {
+       oldest_op = &op;
+     }
+     return true;
+   };
+
+   vector<DaemonHealthMetric> metrics;
+   utime_t oldest_secs;
+   if (!op_tracker.visit_ops_in_flight(&oldest_secs, count_slow_ops)) {
+     metrics.emplace_back(daemon_metric::SLOW_OPS, 0, 0);
+     return metrics;
+   }
+
+   if (slow) {
+     derr << __func__ << " reporting " << slow << " slow ops, oldest is "
+          << oldest_op->get_desc() << dendl;
+   }
+   metrics.emplace_back(daemon_metric::SLOW_OPS, slow, oldest_secs);
+   return metrics;
+ }
+
+ // Generate a random uuid and stage it in transaction `t` as the
+ // "cluster_fingerprint" key under MONITOR_NAME.
+ void Monitor::prepare_new_fingerprint(MonitorDBStore::TransactionRef t)
+ {
+   uuid_d new_fingerprint;
+   new_fingerprint.generate_random();
+   dout(10) << __func__ << " proposing cluster_fingerprint " << new_fingerprint << dendl;
+
+   bufferlist encoded;
+   encode(new_fingerprint, encoded);
+   t->put(MONITOR_NAME, "cluster_fingerprint", encoded);
+ }
+
+ // Compare the "cluster_uuid" recorded in the store with the monmap's fsid.
+ // Returns 0 when they match, -ENOENT when nothing is stored, -EINVAL when
+ // the stored value cannot be parsed, and -EEXIST when it differs.
+ int Monitor::check_fsid()
+ {
+   bufferlist stored;
+   const int r = store->get(MONITOR_NAME, "cluster_uuid", stored);
+   if (r == -ENOENT) {
+     return r;
+   }
+   ceph_assert(r == 0);
+
+   string text(stored.c_str(), stored.length());
+
+   // only keep the first line
+   const size_t newline = text.find('\n');
+   if (newline != string::npos) {
+     text.resize(newline);
+   }
+
+   dout(10) << "check_fsid cluster_uuid contains '" << text << "'" << dendl;
+
+   uuid_d ondisk;
+   const bool parsed = ondisk.parse(text.c_str());
+   if (!parsed) {
+     derr << "error: unable to parse uuid" << dendl;
+     return -EINVAL;
+   }
+
+   if (monmap->get_fsid() != ondisk) {
+     derr << "error: cluster_uuid file exists with value " << ondisk
+          << ", != our uuid " << monmap->get_fsid() << dendl;
+     return -EEXIST;
+   }
+
+   return 0;
+ }
+
+ // Persist the monmap's fsid as "cluster_uuid" in its own transaction.
+ // Returns the result of applying that transaction.
+ int Monitor::write_fsid()
+ {
+   auto txn = std::make_shared<MonitorDBStore::Transaction>();
+   write_fsid(txn);
+   return store->apply_transaction(txn);
+ }
+
+ // Stage the monmap's fsid, followed by a newline, as the "cluster_uuid"
+ // key under MONITOR_NAME in transaction `t`. Always returns 0.
+ int Monitor::write_fsid(MonitorDBStore::TransactionRef t)
+ {
+   ostringstream formatted;
+   formatted << monmap->get_fsid() << "\n";
+
+   bufferlist payload;
+   payload.append(formatted.str());
+
+   t->put(MONITOR_NAME, "cluster_uuid", payload);
+   return 0;
+ }
+
+/*
+ * this is the closest thing to a traditional 'mkfs' for ceph.
+ * initialize the monitor state machines to their initial values.
+ */
+ // Build and apply the single mkfs transaction: magic, features, monmap,
+ // optional osdmap, optional keyring and the cluster fsid.
+ //
+ // @param osdmapbl  optional encoded OSDMap; when non-empty it is validated
+ //                  by decoding before being stored
+ // @return 0 on success, a negative errno on failure (including a failure to
+ //         apply the transaction to the store)
+ int Monitor::mkfs(bufferlist& osdmapbl)
+ {
+   auto t(std::make_shared<MonitorDBStore::Transaction>());
+
+   // verify cluster fsid
+   int r = check_fsid();
+   if (r < 0 && r != -ENOENT)
+     return r;
+
+   bufferlist magicbl;
+   magicbl.append(CEPH_MON_ONDISK_MAGIC);
+   magicbl.append("\n");
+   t->put(MONITOR_NAME, "magic", magicbl);
+
+
+   features = get_initial_supported_features();
+   write_features(t);
+
+   // save monmap, osdmap, keyring.
+   bufferlist monmapbl;
+   monmap->encode(monmapbl, CEPH_FEATURES_ALL);
+   monmap->set_epoch(0); // must be 0 to avoid confusing first MonmapMonitor::update_from_paxos()
+   t->put("mkfs", "monmap", monmapbl);
+
+   if (osdmapbl.length()) {
+     // make sure it's a valid osdmap
+     try {
+       OSDMap om;
+       om.decode(osdmapbl);
+     }
+     catch (ceph::buffer::error& e) {
+       derr << "error decoding provided osdmap: " << e.what() << dendl;
+       return -EINVAL;
+     }
+     t->put("mkfs", "osdmap", osdmapbl);
+   }
+
+   if (is_keyring_required()) {
+     KeyRing keyring;
+     string keyring_filename;
+
+     r = ceph_resolve_file_search(g_conf()->keyring, keyring_filename);
+     if (r) {
+       // no keyring file found: fall back to the `key` option, if set
+       if (g_conf()->key != "") {
+         string keyring_plaintext = "[mon.]\n\tkey = " + g_conf()->key +
+           "\n\tcaps mon = \"allow *\"\n";
+         bufferlist bl;
+         bl.append(keyring_plaintext);
+         try {
+           auto i = bl.cbegin();
+           keyring.decode(i);
+         }
+         catch (const ceph::buffer::error& e) {
+           derr << "error decoding keyring " << keyring_plaintext
+                << ": " << e.what() << dendl;
+           return -EINVAL;
+         }
+       } else {
+         derr << "unable to find a keyring on " << g_conf()->keyring
+              << ": " << cpp_strerror(r) << dendl;
+         return r;
+       }
+     } else {
+       r = keyring.load(g_ceph_context, keyring_filename);
+       if (r < 0) {
+         derr << "unable to load initial keyring " << g_conf()->keyring << dendl;
+         return r;
+       }
+     }
+
+     // put mon. key in external keyring; seed with everything else.
+     extract_save_mon_key(keyring);
+
+     bufferlist keyringbl;
+     keyring.encode_plaintext(keyringbl);
+     t->put("mkfs", "keyring", keyringbl);
+   }
+   write_fsid(t);
+
+   // do not report success if the store rejected the transaction
+   r = store->apply_transaction(t);
+   if (r < 0) {
+     derr << "error applying mkfs transaction: " << cpp_strerror(r) << dendl;
+     return r;
+   }
+
+   return 0;
+ }
+
+ // Write `bl` to "<mon_data>/keyring" (mode 0600) and fsync it.
+ //
+ // @param bl  keyring contents to write
+ // @return 0 on success, a negative errno if the file cannot be opened,
+ //         written or synced
+ int Monitor::write_default_keyring(bufferlist& bl)
+ {
+   ostringstream os;
+   os << g_conf()->mon_data << "/keyring";
+   const string path = os.str();
+
+   // O_TRUNC: an existing, longer keyring must not leave stale bytes behind
+   // the newly written content.
+   int fd = ::open(path.c_str(), O_WRONLY|O_CREAT|O_TRUNC|O_CLOEXEC, 0600);
+   if (fd < 0) {
+     int err = -errno;
+     dout(0) << __func__ << " failed to open " << path
+             << ": " << cpp_strerror(err) << dendl;
+     return err;
+   }
+
+   int err = bl.write_fd(fd);
+   if (!err && ::fsync(fd) < 0) {
+     // surface sync failures instead of reporting a write that may not be durable
+     err = -errno;
+     dout(0) << __func__ << " failed to fsync " << path
+             << ": " << cpp_strerror(err) << dendl;
+   }
+   VOID_TEMP_FAILURE_RETRY(::close(fd));
+
+   return err;
+ }
+
+ // If `keyring` holds an entry for the mon entity, write that entry to the
+ // default keyring file and remove it from `keyring`. A failure to write the
+ // file is logged; the entry is removed from `keyring` either way.
+ void Monitor::extract_save_mon_key(KeyRing& keyring)
+ {
+   EntityName mon_name;
+   mon_name.set_type(CEPH_ENTITY_TYPE_MON);
+   EntityAuth mon_key;
+   if (!keyring.get_auth(mon_name, mon_key)) {
+     return;
+   }
+
+   dout(10) << "extract_save_mon_key moving mon. key to separate keyring" << dendl;
+   KeyRing pkey;
+   pkey.add(mon_name, mon_key);
+   bufferlist bl;
+   pkey.encode_plaintext(bl);
+
+   // don't swallow the result silently: the mon. key is about to be dropped
+   // from the seed keyring, so a failed write deserves a visible error
+   int r = write_default_keyring(bl);
+   if (r < 0) {
+     derr << __func__ << " failed to write mon. key to default keyring: "
+          << cpp_strerror(r) << dendl;
+   }
+   keyring.remove(mon_name);
+ }
+
+// AuthClient methods -- for mon <-> mon communication
+ // AuthClient hook: produce the initial auth request for an outgoing
+ // connection. Only mon and mgr peers are supported.
+ // On success stores the authorizer in auth_meta, fills in the method,
+ // preferred modes and payload, and returns 0; otherwise returns -EACCES.
+ int Monitor::get_auth_request(
+   Connection *con,
+   AuthConnectionMeta *auth_meta,
+   uint32_t *method,
+   vector<uint32_t> *preferred_modes,
+   bufferlist *out)
+ {
+   std::scoped_lock l(auth_lock);
+
+   const int peer_type = con->get_peer_type();
+   const bool supported_peer = (peer_type == CEPH_ENTITY_TYPE_MON ||
+                                peer_type == CEPH_ENTITY_TYPE_MGR);
+   if (!supported_peer) {
+     return -EACCES;
+   }
+
+   AuthAuthorizer *auth;
+   if (!get_authorizer(con->get_peer_type(), &auth)) {
+     return -EACCES;
+   }
+   auth_meta->authorizer.reset(auth);
+
+   auth_registry.get_supported_modes(con->get_peer_type(),
+                                     auth->protocol,
+                                     preferred_modes);
+   *method = auth->protocol;
+   *out = auth->bl;
+   return 0;
+ }
+
+ // AuthClient hook: the peer asked for more. Feed its payload to the
+ // authorizer as a challenge and return the updated authorizer payload in
+ // `reply`. Returns 0, or -EACCES when there is no authorizer.
+ int Monitor::handle_auth_reply_more(
+   Connection *con,
+   AuthConnectionMeta *auth_meta,
+   const bufferlist& bl,
+   bufferlist *reply)
+ {
+   std::scoped_lock l(auth_lock);
+
+   auto& authorizer = auth_meta->authorizer;
+   if (!authorizer) {
+     derr << __func__ << " no authorizer?" << dendl;
+     return -EACCES;
+   }
+
+   authorizer->add_challenge(cct, bl);
+   *reply = authorizer->bl;
+   return 0;
+ }
+
+ // AuthClient hook: the peer finished authentication. Verify its reply
+ // against our authorizer and, on success, copy the authorizer's session
+ // key into auth_meta.
+ //
+ // @param bl                 authorizer reply from the peer
+ // @param connection_secret  out: passed to the authorizer's verify_reply()
+ // @return 0 on success, -EACCES when there is no authorizer or the reply
+ //         fails verification
+ int Monitor::handle_auth_done(
+   Connection *con,
+   AuthConnectionMeta *auth_meta,
+   uint64_t global_id,
+   uint32_t con_mode,
+   const bufferlist& bl,
+   CryptoKey *session_key,
+   std::string *connection_secret)
+ {
+   std::scoped_lock l(auth_lock);
+
+   // same guard as handle_auth_reply_more(): never dereference a missing
+   // authorizer
+   if (!auth_meta->authorizer) {
+     derr << __func__ << " no authorizer?" << dendl;
+     return -EACCES;
+   }
+
+   // verify authorizer reply
+   auto p = bl.begin();
+   if (!auth_meta->authorizer->verify_reply(p, connection_secret)) {
+     dout(0) << __func__ << " failed verifying authorizer reply" << dendl;
+     return -EACCES;
+   }
+   auth_meta->session_key = auth_meta->authorizer->session_key;
+   return 0;
+ }
+
+ // AuthClient hook: the peer rejected our auth method. No fallback is
+ // attempted; the rejection is logged and -EACCES returned.
+ int Monitor::handle_auth_bad_method(
+   Connection *con,
+   AuthConnectionMeta *auth_meta,
+   uint32_t old_auth_method,
+   int result,
+   const std::vector<uint32_t>& allowed_methods,
+   const std::vector<uint32_t>& allowed_modes)
+ {
+   derr << __func__ << " hmm, they didn't like " << old_auth_method
+        << " result " << cpp_strerror(result) << dendl;
+
+   return -EACCES;
+ }
+ /*
+  * Build an authorizer for an outgoing connection to a mon or mgr.
+  *
+  * @param service_id  CEPH_ENTITY_TYPE_* of the peer; only MON and MGR are
+  *                    accepted
+  * @param authorizer  out: receives the newly built authorizer on success
+  * @return true on success; false if we are shut down, the service type is
+  *         not MON/MGR, or a secret/ticket could not be obtained
+  */
+ bool Monitor::get_authorizer(int service_id, AuthAuthorizer **authorizer)
+ {
+ dout(10) << "get_authorizer for " << ceph_entity_type_name(service_id)
+ << dendl;
+
+ if (is_shutdown())
+ return false;
+
+ // we only connect to other monitors and mgr; every else connects to us.
+ if (service_id != CEPH_ENTITY_TYPE_MON &&
+ service_id != CEPH_ENTITY_TYPE_MGR)
+ return false;
+
+ if (!auth_cluster_required.is_supported_auth(CEPH_AUTH_CEPHX)) {
+ // auth_none
+ dout(20) << __func__ << " building auth_none authorizer" << dendl;
+ AuthNoneClientHandler handler{g_ceph_context};
+ handler.set_global_id(0);
+ *authorizer = handler.build_authorizer(service_id);
+ return true;
+ }
+
+ // cephx path: build a service ticket for ourselves (a mon entity, global_id 0)
+ CephXServiceTicketInfo auth_ticket_info;
+ CephXSessionAuthInfo info;
+ int ret;
+
+ EntityName name;
+ name.set_type(CEPH_ENTITY_TYPE_MON);
+ auth_ticket_info.ticket.name = name;
+ auth_ticket_info.ticket.global_id = 0;
+
+ if (service_id == CEPH_ENTITY_TYPE_MON) {
+ // mon to mon authentication uses the private monitor shared key and not the
+ // rotating key
+ CryptoKey secret;
+ if (!keyring.get_secret(name, secret) && // try the local keyring first, then the key server
+ !key_server.get_secret(name, secret)) {
+ dout(0) << " couldn't get secret for mon service from keyring or keyserver"
+ << dendl;
+ stringstream ss, ds;
+ int err = key_server.list_secrets(ds);
+ if (err < 0)
+ ss << "no installed auth entries!";
+ else
+ ss << "installed auth entries:";
+ dout(0) << ss.str() << "\n" << ds.str() << dendl;
+ return false;
+ }
+
+ ret = key_server.build_session_auth_info(
+ service_id, auth_ticket_info.ticket, secret, (uint64_t)-1, info);
+ if (ret < 0) {
+ dout(0) << __func__ << " failed to build mon session_auth_info "
+ << cpp_strerror(ret) << dendl;
+ return false;
+ }
+ } else if (service_id == CEPH_ENTITY_TYPE_MGR) {
+ // mgr
+ ret = key_server.build_session_auth_info(
+ service_id, auth_ticket_info.ticket, info);
+ if (ret < 0) {
+ derr << __func__ << " failed to build mgr service session_auth_info "
+ << cpp_strerror(ret) << dendl;
+ return false;
+ }
+ } else {
+ ceph_abort(); // see check at top of fn
+ }
+
+ CephXTicketBlob blob;
+ if (!cephx_build_service_ticket_blob(cct, info, blob)) {
+ dout(0) << "get_authorizer failed to build service ticket" << dendl;
+ return false;
+ }
+ bufferlist ticket_data;
+ encode(blob, ticket_data);
+
+ // round-trip the blob through its encoding to populate the handler's ticket
+ auto iter = ticket_data.cbegin();
+ CephXTicketHandler handler(g_ceph_context, service_id);
+ decode(handler.ticket, iter);
+
+ handler.session_key = info.session_key;
+
+ *authorizer = handler.build_authorizer(0);
+
+ return true;
+ }
+
+ // AuthServer hook: process one step of an incoming authentication exchange.
+ //
+ // Two families of auth_mode are handled:
+ //  - AUTH_MODE_AUTHORIZER..AUTH_MODE_AUTHORIZER_MAX: the payload is an
+ //    authorizer, verified through the AuthAuthorizeHandler;
+ //  - AUTH_MODE_MON..AUTH_MODE_MON_MAX: a monitor-style exchange driven by an
+ //    AuthServiceHandler stored on a (partial) MonSession attached to `con`.
+ //
+ // @param con          connection being authenticated (not fully negotiated)
+ // @param auth_meta    per-connection auth state, updated in place
+ // @param more         false for the first request, true for follow-ups
+ // @param auth_method  auth protocol requested by the peer
+ // @param payload      request payload from the peer
+ // @param reply        out: payload to send back
+ // @return on the authorizer path: 1 when accepted, 0 when a challenge was
+ //         issued; on the mon path: the AuthServiceHandler's result; in either
+ //         case a negative errno on failure (-EACCES, -EOPNOTSUPP, -EBUSY)
+ int Monitor::handle_auth_request(
+   Connection *con,
+   AuthConnectionMeta *auth_meta,
+   bool more,
+   uint32_t auth_method,
+   const bufferlist &payload,
+   bufferlist *reply)
+ {
+   std::scoped_lock l(auth_lock);
+
+   // NOTE: be careful, the Connection hasn't fully negotiated yet, so
+   // e.g., peer_features, peer_addrs, and others are still unknown.
+
+   dout(10) << __func__ << " con " << con << (more ? " (more)":" (start)")
+            << " method " << auth_method
+            << " payload " << payload.length()
+            << dendl;
+   if (!payload.length()) {
+     if (!con->is_msgr2() &&
+         con->get_peer_type() != CEPH_ENTITY_TYPE_MON) {
+       // for v1 connections, we tolerate no authorizer (from
+       // non-monitors), because authentication happens via MAuth
+       // messages.
+       return 1;
+     }
+     return -EACCES;
+   }
+   if (!more) {
+     // the first byte of the initial payload selects the auth mode
+     auth_meta->auth_mode = payload[0];
+   }
+
+   if (auth_meta->auth_mode >= AUTH_MODE_AUTHORIZER &&
+       auth_meta->auth_mode <= AUTH_MODE_AUTHORIZER_MAX) {
+     AuthAuthorizeHandler *ah = get_auth_authorize_handler(con->get_peer_type(),
+                                                           auth_method);
+     if (!ah) {
+       lderr(cct) << __func__ << " no AuthAuthorizeHandler found for auth method "
+                  << auth_method << dendl;
+       return -EOPNOTSUPP;
+     }
+     bool was_challenge = (bool)auth_meta->authorizer_challenge;
+     bool isvalid = ah->verify_authorizer(
+       cct,
+       keyring,
+       payload,
+       auth_meta->get_connection_secret_length(),
+       reply,
+       &con->peer_name,
+       &con->peer_global_id,
+       &con->peer_caps_info,
+       &auth_meta->session_key,
+       &auth_meta->connection_secret,
+       &auth_meta->authorizer_challenge);
+     if (isvalid) {
+       ms_handle_fast_authentication(con);
+       return 1;
+     }
+     // first attempt that just produced a challenge: ask the peer to retry
+     if (!more && !was_challenge && auth_meta->authorizer_challenge) {
+       return 0;
+     }
+     dout(10) << __func__ << " bad authorizer on " << con << dendl;
+     return -EACCES;
+   } else if (auth_meta->auth_mode < AUTH_MODE_MON ||
+              auth_meta->auth_mode > AUTH_MODE_MON_MAX) {
+     derr << __func__ << " unrecognized auth mode " << auth_meta->auth_mode
+          << dendl;
+     return -EACCES;
+   }
+
+   // wait until we've formed an initial quorum on mkfs so that we have
+   // the initial keys (e.g., client.admin).
+   if (authmon()->get_last_committed() == 0) {
+     dout(10) << __func__ << " haven't formed initial quorum, EBUSY" << dendl;
+     return -EBUSY;
+   }
+
+   RefCountedPtr priv;
+   MonSession *s;
+   int32_t r = 0;
+   auto p = payload.begin();
+   if (!more) {
+     if (con->get_priv()) {
+       return -EACCES; // wtf
+     }
+
+     // handler?
+     unique_ptr<AuthServiceHandler> auth_handler{get_auth_service_handler(
+       auth_method, g_ceph_context, &key_server)};
+     if (!auth_handler) {
+       dout(1) << __func__ << " auth_method " << auth_method << " not supported"
+               << dendl;
+       return -EOPNOTSUPP;
+     }
+
+     uint8_t mode;
+     EntityName entity_name;
+
+     try {
+       decode(mode, p);
+       if (mode < AUTH_MODE_MON ||
+           mode > AUTH_MODE_MON_MAX) {
+         dout(1) << __func__ << " invalid mode " << (int)mode << dendl;
+         return -EACCES;
+       }
+       assert(mode >= AUTH_MODE_MON && mode <= AUTH_MODE_MON_MAX);
+       decode(entity_name, p);
+       decode(con->peer_global_id, p);
+     } catch (ceph::buffer::error& e) {
+       dout(1) << __func__ << " failed to decode, " << e.what() << dendl;
+       return -EACCES;
+     }
+
+     // supported method?
+     if (entity_name.get_type() == CEPH_ENTITY_TYPE_MON ||
+         entity_name.get_type() == CEPH_ENTITY_TYPE_OSD ||
+         entity_name.get_type() == CEPH_ENTITY_TYPE_MDS ||
+         entity_name.get_type() == CEPH_ENTITY_TYPE_MGR) {
+       if (!auth_cluster_required.is_supported_auth(auth_method)) {
+         dout(10) << __func__ << " entity " << entity_name << " method "
+                  << auth_method << " not among supported "
+                  << auth_cluster_required.get_supported_set() << dendl;
+         return -EOPNOTSUPP;
+       }
+     } else {
+       if (!auth_service_required.is_supported_auth(auth_method)) {
+         // report the set that was actually consulted (service, not cluster)
+         dout(10) << __func__ << " entity " << entity_name << " method "
+                  << auth_method << " not among supported "
+                  << auth_service_required.get_supported_set() << dendl;
+         return -EOPNOTSUPP;
+       }
+     }
+
+     // for msgr1 we would do some weirdness here to ensure signatures
+     // are supported by the client if we require it. for msgr2 that
+     // is not necessary.
+
+     bool is_new_global_id = false;
+     if (!con->peer_global_id) {
+       con->peer_global_id = authmon()->_assign_global_id();
+       if (!con->peer_global_id) {
+         dout(1) << __func__ << " failed to assign global_id" << dendl;
+         return -EBUSY;
+       }
+       is_new_global_id = true;
+     }
+
+     // set up partial session
+     s = new MonSession(con);
+     s->auth_handler = auth_handler.release();
+     con->set_priv(RefCountedPtr{s, false});
+
+     r = s->auth_handler->start_session(
+       entity_name,
+       con->peer_global_id,
+       is_new_global_id,
+       reply,
+       &con->peer_caps_info);
+   } else {
+     priv = con->get_priv();
+     if (!priv) {
+       // this can happen if the async ms_handle_reset event races with
+       // the unlocked call into handle_auth_request
+       return -EACCES;
+     }
+     s = static_cast<MonSession*>(priv.get());
+     r = s->auth_handler->handle_request(
+       p,
+       auth_meta->get_connection_secret_length(),
+       reply,
+       &con->peer_caps_info,
+       &auth_meta->session_key,
+       &auth_meta->connection_secret);
+   }
+   if (r > 0 &&
+       !s->authenticated) {
+     ms_handle_fast_authentication(con);
+   }
+
+   dout(30) << " r " << r << " reply:\n";
+   reply->hexdump(*_dout);
+   *_dout << dendl;
+   return r;
+ }
+
+ // Messenger callback for an accepted connection: if the connection carries
+ // a session that is not yet on the session list, identify it from the
+ // connection's peer type/id/addrs and register it. New sessions are refused
+ // (connection marked down) once we are in STATE_SHUTDOWN.
+ void Monitor::ms_handle_accept(Connection *con)
+ {
+   auto priv = con->get_priv();
+   MonSession *session = static_cast<MonSession*>(priv.get());
+   if (session == nullptr) {
+     // legacy protocol v1?
+     dout(10) << __func__ << " con " << con << " no session" << dendl;
+     return;
+   }
+
+   if (session->item.is_on_list()) {
+     dout(10) << __func__ << " con " << con << " session " << session
+              << " already on list" << dendl;
+     return;
+   }
+
+   std::lock_guard l(session_map_lock);
+   if (state == STATE_SHUTDOWN) {
+     dout(10) << __func__ << " ignoring new con " << con << " (shutdown)" << dendl;
+     con->mark_down();
+     return;
+   }
+
+   dout(10) << __func__ << " con " << con << " session " << session
+            << " registering session for "
+            << con->get_peer_addrs() << dendl;
+   session->_ident(entity_name_t(con->get_peer_type(), con->get_peer_id()),
+                   con->get_peer_addrs());
+   session_map.add_session(session);
+ }
+
+/**
+ * Attach (if needed) a MonSession to an authenticated connection and load
+ * the peer's capabilities into it.
+ *
+ * @param con connection whose peer caps info should be applied
+ * @return 1 if the session is now marked authenticated (always 1 for
+ *         mon <-> mon connections, which get no session); 0 if the peer
+ *         has neither allow_all nor any encoded caps; -EACCES if the
+ *         monitor is shutting down or the caps cannot be decoded/parsed
+ */
+int Monitor::ms_handle_fast_authentication(Connection *con)
+{
+  if (con->get_peer_type() == CEPH_ENTITY_TYPE_MON) {
+    // mon <-> mon connections need no Session, and setting one up
+    // creates an awkward ref cycle between Session and Connection.
+    return 1;
+  }
+
+  auto priv = con->get_priv();
+  auto *session = static_cast<MonSession*>(priv.get());
+  if (session == nullptr) {
+    // must be msgr2, otherwise dispatch would have set up the session.
+    if (state == STATE_SHUTDOWN) {
+      dout(10) << __func__ << " ignoring new con " << con << " (shutdown)" << dendl;
+      con->mark_down();
+      return -EACCES;
+    }
+    session = session_map.new_session(
+      entity_name_t(con->get_peer_type(), -1),  // we don't know yet
+      con->get_peer_addrs(),
+      con);
+    assert(session);
+    dout(10) << __func__ << " adding session " << session << " to con " << con
+             << dendl;
+    con->set_priv(session);
+    logger->set(l_mon_num_sessions, session_map.get_size());
+    logger->inc(l_mon_session_add);
+  }
+  dout(10) << __func__ << " session " << session << " con " << con
+           << " addr " << session->con->get_peer_addr()
+           << " " << *session << dendl;
+
+  AuthCapsInfo &caps_info = con->get_peer_caps_info();
+
+  // unrestricted peer: no cap string to parse
+  if (caps_info.allow_all) {
+    session->caps.set_allow_all();
+    session->authenticated = true;
+    return 1;
+  }
+
+  // no caps at all: leave the session's authenticated flag as it was
+  if (!caps_info.caps.length()) {
+    return 0;
+  }
+
+  auto p = caps_info.caps.cbegin();
+  string cap_str;
+  try {
+    decode(cap_str, p);
+  } catch (const ceph::buffer::error &err) {
+    derr << __func__ << " corrupt cap data for " << con->get_peer_entity_name()
+         << " in auth db" << dendl;
+    return -EACCES;
+  }
+
+  if (!session->caps.parse(cap_str, nullptr)) {
+    derr << __func__ << " unparseable caps '" << cap_str << "' for "
+         << con->get_peer_entity_name() << dendl;
+    return -EACCES;
+  }
+
+  session->authenticated = true;
+  return 1;
+}
+
+/**
+ * Record the CRUSH location this monitor should have.
+ *
+ * An empty string is a no-op. Otherwise the string is handed to
+ * CrushWrapper::parse_loc_map() to fill crush_loc, and need_set_crush_loc
+ * is raised.
+ *
+ * @param loc location specification; passed to the parser as a
+ *            single-element vector
+ */
+void Monitor::set_mon_crush_location(const string& loc)
+{
+  if (!loc.empty()) {
+    // NOTE(review): any result of parse_loc_map() is ignored here, as in
+    // the original code -- confirm whether a parse failure should be reported.
+    vector<string> loc_vec(1, loc);
+    CrushWrapper::parse_loc_map(loc_vec, &crush_loc);
+    need_set_crush_loc = true;
+  }
+}
+
+/**
+ * React to a new monmap.
+ *
+ *  - clear need_set_crush_loc once the monmap entry for this monitor
+ *    carries the same crush_loc we hold;
+ *  - hand the monmap's election strategy to the elector;
+ *  - if remove_rank_elector is set, report each entry of
+ *    monmap->removed_ranks (iterated in reverse) to the elector, skipping
+ *    our own rank and the case where our addresses are no longer in the map;
+ *  - try to engage stretch mode when the monmap enables it, and enter
+ *    degraded stretch mode when stretch mode is engaged and the monmap
+ *    lists stretch_marked_down_mons;
+ *  - refresh the elector's disallowed leaders.
+ *
+ * @param can_change_external_state passed to
+ *        set_elector_disallowed_leaders() as its allow_election flag
+ * @param remove_rank_elector whether to process monmap->removed_ranks
+ */
+void Monitor::notify_new_monmap(bool can_change_external_state, bool remove_rank_elector)
+{
+  if (need_set_crush_loc) {
+    const auto self = monmap->mon_info.find(name);
+    const bool loc_applied = self != monmap->mon_info.end() &&
+                             self->second.crush_loc == crush_loc;
+    if (loc_applied) {
+      need_set_crush_loc = false;
+    }
+  }
+
+  elector.notify_strategy_maybe_changed(monmap->strategy);
+
+  if (remove_rank_elector) {
+    dout(10) << __func__ << " we have " << monmap->ranks.size()<< " ranks" << dendl;
+    dout(10) << __func__ << " we have " << monmap->removed_ranks.size() << " removed ranks" << dendl;
+    for (auto it = monmap->removed_ranks.rbegin();
+         it != monmap->removed_ranks.rend(); ++it) {
+      const int removed = *it;
+      dout(10) << __func__ << " removing rank " << removed << dendl;
+      if (rank == removed) {
+        dout(5) << "We are removing our own rank, probably we"
+                << " are removed from monmap before we shutdown ... dropping." << dendl;
+      } else {
+        const int my_new_rank = monmap->get_rank(messenger->get_myaddrs());
+        if (my_new_rank == -1) {
+          dout(5) << "We no longer exists in the monmap! ... dropping." << dendl;
+        } else {
+          elector.notify_rank_removed(removed, my_new_rank);
+        }
+      }
+    }
+  }
+
+  if (monmap->stretch_mode_enabled) {
+    try_engage_stretch_mode();
+  }
+
+  if (is_stretch_mode() && !monmap->stretch_marked_down_mons.empty()) {
+    dout(20) << __func__ << " stretch_marked_down_mons: " << monmap->stretch_marked_down_mons << dendl;
+    set_degraded_stretch_mode();
+  }
+
+  set_elector_disallowed_leaders(can_change_external_state);
+}
+
+/**
+ * Rebuild the set of ranks that may not become leader and pass it to the
+ * elector.
+ *
+ * The set is the union of the ranks of monmap->disallowed_leaders, of
+ * monmap->stretch_marked_down_mons, and of the tiebreaker mon (when one is
+ * configured and present in the monmap).
+ *
+ * @param allow_election if true, call an election when the elector reports
+ *        that its disallowed set changed
+ */
+void Monitor::set_elector_disallowed_leaders(bool allow_election)
+{
+  set<int> dl;
+  // inherit dl from monmap
+  // (iterate by const reference: no per-element string copy, and the loop
+  // variable no longer shadows the Monitor::name member)
+  for (const auto& mon_name : monmap->disallowed_leaders) {
+    dl.insert(monmap->get_rank(mon_name));
+  }
+  // unconditionally add stretch_marked_down_mons to the new dl copy
+  for (const auto& mon_name : monmap->stretch_marked_down_mons) {
+    dl.insert(monmap->get_rank(mon_name));
+  }
+  // add the tiebreaker_mon in case it is not in monmap->disallowed_leaders
+  if (!monmap->tiebreaker_mon.empty() &&
+      monmap->contains(monmap->tiebreaker_mon)) {
+    dl.insert(monmap->get_rank(monmap->tiebreaker_mon));
+  }
+
+  const bool disallowed_changed = elector.set_disallowed_leaders(dl);
+  if (disallowed_changed && allow_election) {
+    elector.call_election();
+  }
+}
+
+/// Context whose finish() re-runs Monitor::try_engage_stretch_mode(); it is
+/// queued on the OSDMonitor while that service is not yet readable.
+struct CMonEnableStretchMode : public Context {
+  Monitor *m;
+  explicit CMonEnableStretchMode(Monitor *mon) : m(mon) {}
+  // the completion code is not inspected; we simply retry
+  void finish(int r) override {
+    m->try_engage_stretch_mode();
+  }
+};
+/**
+ * Engage stretch mode if both the osdmap and the monmap have it enabled.
+ *
+ * Does nothing when stretch mode is already engaged. If the OSDMonitor is
+ * not readable yet, a CMonEnableStretchMode context is queued so this
+ * function runs again later. On engagement, stretch_bucket_divider is set
+ * to the CRUSH type name of the osdmap's stretch_mode_bucket and sessions
+ * that stretch mode disallows are disconnected.
+ */
+void Monitor::try_engage_stretch_mode()
+{
+  dout(20) << __func__ << dendl;
+  if (stretch_mode_engaged) {
+    return;
+  }
+
+  if (!osdmon()->is_readable()) {
+    dout(20) << "osdmon is not readable" << dendl;
+    osdmon()->wait_for_readable_ctx(new CMonEnableStretchMode(this));
+    return;
+  }
+
+  const bool both_enabled = osdmon()->osdmap.stretch_mode_enabled &&
+                            monmap->stretch_mode_enabled;
+  if (!both_enabled) {
+    return;
+  }
+
+  dout(10) << "Engaging stretch mode!" << dendl;
+  stretch_mode_engaged = true;
+  const int32_t divider_type = osdmon()->osdmap.stretch_mode_bucket;
+  stretch_bucket_divider = osdmon()->osdmap.crush->get_type_name(divider_type);
+  disconnect_disallowed_stretch_sessions();
+}
+
+/**
+ * Recompute which stretch buckets have live or dead monitors.
+ *
+ * Only acts on the leader while stretch mode is engaged. Every monitor in
+ * the monmap is sorted by its crush_loc entry for stretch_bucket_divider:
+ * buckets with a mon in quorum go to up_mon_buckets; buckets whose mons are
+ * all out of quorum end up in dead_mon_buckets (bucket -> mon names). If
+ * the dead set changed and did not shrink, maybe_go_degraded_stretch_mode()
+ * is invoked.
+ */
+void Monitor::do_stretch_mode_election_work()
+{
+  dout(20) << __func__ << dendl;
+  if (!(is_stretch_mode() && is_leader())) {
+    return;
+  }
+  dout(20) << "checking for degraded stretch mode" << dendl;
+
+  // move the previous result aside and start from a clean slate
+  map<string, set<string>> previous_dead;
+  previous_dead.swap(dead_mon_buckets);
+  up_mon_buckets.clear();
+
+  // identify if we've lost a CRUSH bucket, request OSDMonitor check for death
+  map<string,set<string>> down_by_bucket;
+  for (unsigned r = 0; r < monmap->size(); ++r) {
+    const auto &info = monmap->mon_info[monmap->get_name(r)];
+    auto loc = info.crush_loc.find(stretch_bucket_divider);
+    ceph_assert(loc != info.crush_loc.end());
+    if (quorum.count(r)) {
+      up_mon_buckets.insert(loc->second);
+    } else {
+      down_by_bucket[loc->second].insert(info.name);
+    }
+  }
+  dout(20) << "prior dead_mon_buckets: " << previous_dead
+           << "; down_mon_buckets: " << down_by_bucket
+           << "; up_mon_buckets: " << up_mon_buckets << dendl;
+
+  // a bucket is dead only when none of its mons made it into quorum
+  for (const auto& [bucket, mons] : down_by_bucket) {
+    if (up_mon_buckets.count(bucket) == 0) {
+      dead_mon_buckets[bucket] = mons;
+    }
+  }
+  dout(20) << "new dead_mon_buckets " << dead_mon_buckets << dendl;
+
+  const bool changed = dead_mon_buckets != previous_dead;
+  if (changed && dead_mon_buckets.size() >= previous_dead.size()) {
+    maybe_go_degraded_stretch_mode();
+  }
+}
+
+/// Context whose finish() re-runs Monitor::maybe_go_degraded_stretch_mode();
+/// it is queued while a paxos service the transition needs is not yet
+/// readable/writeable.
+struct CMonGoDegraded : public Context {
+  Monitor *m;
+  explicit CMonGoDegraded(Monitor *mon) : m(mon) {}
+  // the completion code is not inspected; we simply retry
+  void finish(int r) override {
+    m->maybe_go_degraded_stretch_mode();
+  }
+};
+
+/// Context whose finish() re-runs Monitor::go_recovery_stretch_mode(); it is
+/// queued on the OSDMonitor while that service is not yet
+/// readable/writeable.
+struct CMonGoRecovery : public Context {
+  Monitor *m;
+  explicit CMonGoRecovery(Monitor *mon) : m(mon) {}
+  // the completion code is not inspected; we simply retry
+  void finish(int r) override {
+    m->go_recovery_stretch_mode();
+  }
+};
+/**
+ * Ask the OSDMonitor to start stretch-mode recovery.
+ *
+ * Proceeds only on the leader, while in degraded stretch mode and not
+ * already recovering. Asserts that no monitor bucket is dead. If the
+ * OSDMonitor is not readable or not writeable, a CMonGoRecovery context is
+ * queued so this function runs again later.
+ */
+void Monitor::go_recovery_stretch_mode()
+{
+  dout(20) << __func__ << dendl;
+
+  dout(20) << "is_leader(): " << is_leader() << dendl;
+  if (!is_leader()) {
+    return;
+  }
+
+  dout(20) << "is_degraded_stretch_mode(): " << is_degraded_stretch_mode() << dendl;
+  if (!is_degraded_stretch_mode()) {
+    return;
+  }
+
+  dout(20) << "is_recovering_stretch_mode(): " << is_recovering_stretch_mode() << dendl;
+  if (is_recovering_stretch_mode()) {
+    return;
+  }
+
+  dout(20) << "dead_mon_buckets.size(): " << dead_mon_buckets.size() << dendl;
+  dout(20) << "dead_mon_buckets: " << dead_mon_buckets << dendl;
+  if (!dead_mon_buckets.empty()) {
+    ceph_assert( 0 == "how did we try and do stretch recovery while we have dead monitor buckets?");
+    // we can't recover if we are missing monitors in a zone!
+    return;
+  }
+
+  auto *osds = osdmon();
+  if (!osds->is_readable()) {
+    dout(20) << "osdmon is not readable" << dendl;
+    osds->wait_for_readable_ctx(new CMonGoRecovery(this));
+    return;
+  }
+  if (!osds->is_writeable()) {
+    dout(20) << "osdmon is not writeable" << dendl;
+    osds->wait_for_writeable_ctx(new CMonGoRecovery(this));
+    return;
+  }
+
+  osds->trigger_recovery_stretch_mode();
+}
+
+/**
+ * Mark the in-memory stretch state as "degraded and recovering", then
+ * forward the same transition to the OSDMonitor.
+ */
+void Monitor::set_recovery_stretch_mode()
+{
+  // recovery is a sub-state of degraded mode, so both flags are raised
+  degraded_stretch_mode = recovering_stretch_mode = true;
+  osdmon()->set_recovery_stretch_mode();
+}
+
+/**
+ * Enter degraded stretch mode if a whole (non-tiebreaker) site is down.
+ *
+ * Proceeds only on the leader, when not already degraded and when
+ * dead_mon_buckets is non-empty. The tiebreaker's bucket is removed from a
+ * copy of dead_mon_buckets and the OSDMonitor is asked whether the
+ * remaining buckets are dead as CRUSH zones too. If so, and once both the
+ * OSDMonitor and MonmapMonitor are writeable, the degraded transition is
+ * triggered. Whenever a required service is not ready, a CMonGoDegraded
+ * context is queued so this function runs again later.
+ */
+void Monitor::maybe_go_degraded_stretch_mode()
+{
+  dout(20) << __func__ << dendl;
+  if (is_degraded_stretch_mode()) return;
+  if (!is_leader()) return;
+  if (dead_mon_buckets.empty()) return;
+  if (!osdmon()->is_readable()) {
+    osdmon()->wait_for_readable_ctx(new CMonGoDegraded(this));
+    return;
+  }
+  ceph_assert(monmap->contains(monmap->tiebreaker_mon));
+  // filter out the tiebreaker zone and check if remaining sites are down by OSDs too
+  const auto &mi = monmap->mon_info[monmap->tiebreaker_mon];
+  auto ci = mi.crush_loc.find(stretch_bucket_divider);
+  // the tiebreaker must have a location at the divider level; assert before
+  // dereferencing, as the other stretch-mode helpers in this file do
+  ceph_assert(ci != mi.crush_loc.end());
+  map<string, set<string>> filtered_dead_buckets = dead_mon_buckets;
+  filtered_dead_buckets.erase(ci->second);
+
+  set<int> matched_down_buckets;
+  set<string> matched_down_mons;
+  bool dead = osdmon()->check_for_dead_crush_zones(filtered_dead_buckets,
+                                                   &matched_down_buckets,
+                                                   &matched_down_mons);
+  if (dead) {
+    if (!osdmon()->is_writeable()) {
+      dout(20) << "osdmon is not writeable" << dendl;
+      osdmon()->wait_for_writeable_ctx(new CMonGoDegraded(this));
+      return;
+    }
+    if (!monmon()->is_writeable()) {
+      dout(20) << "monmon is not writeable" << dendl;
+      monmon()->wait_for_writeable_ctx(new CMonGoDegraded(this));
+      return;
+    }
+    trigger_degraded_stretch_mode(matched_down_mons, matched_down_buckets);
+  }
+}
+
+/**
+ * Set up the map transitions into degraded stretch mode.
+ *
+ * Requires both the OSDMonitor and the MonmapMonitor to be writeable. The
+ * surviving zone is derived by removing the tiebreaker's bucket from
+ * up_mon_buckets; exactly one zone must remain.
+ *
+ * @param dead_mons    names of the monitors considered dead, passed to the
+ *                     MonmapMonitor
+ * @param dead_buckets ids of the dead CRUSH buckets, passed to the
+ *                     OSDMonitor together with the live zone
+ */
+void Monitor::trigger_degraded_stretch_mode(const set<string>& dead_mons,
+                                            const set<int>& dead_buckets)
+{
+  dout(20) << __func__ << dendl;
+  ceph_assert(osdmon()->is_writeable());
+  ceph_assert(monmon()->is_writeable());
+
+  // figure out which OSD zone(s) remains alive by removing
+  // tiebreaker mon from up_mon_buckets
+  set<string> live_zones = up_mon_buckets;
+  ceph_assert(monmap->contains(monmap->tiebreaker_mon));
+  const auto &mi = monmap->mon_info[monmap->tiebreaker_mon];
+  auto ci = mi.crush_loc.find(stretch_bucket_divider);
+  // assert before dereferencing: a tiebreaker without a location at the
+  // divider level would otherwise dereference end()
+  ceph_assert(ci != mi.crush_loc.end());
+  live_zones.erase(ci->second);
+  ceph_assert(live_zones.size() == 1); // only support 2 zones right now
+
+  osdmon()->trigger_degraded_stretch_mode(dead_buckets, live_zones);
+  monmon()->trigger_degraded_stretch_mode(dead_mons);
+  set_degraded_stretch_mode();
+}
+
+/**
+ * Mark the in-memory stretch state as degraded (and not recovering), then
+ * forward the same transition to the OSDMonitor.
+ */
+void Monitor::set_degraded_stretch_mode()
+{
+ dout(20) << __func__ << dendl;
+ degraded_stretch_mode = true;
+ // entering degraded mode cancels any recovery that was in progress
+ recovering_stretch_mode = false;
+ osdmon()->set_degraded_stretch_mode();
+}
+
+/// Context whose finish() re-runs Monitor::trigger_healthy_stretch_mode();
+/// it is queued while the OSDMonitor or MonmapMonitor is not yet writeable.
+struct CMonGoHealthy : public Context {
+  Monitor *m;
+  explicit CMonGoHealthy(Monitor *mon) : m(mon) {}
+  // the completion code is not inspected; we simply retry
+  void finish(int r) override {
+    m->trigger_healthy_stretch_mode();
+  }
+};
+
+
+/**
+ * Set up the map transitions back to healthy stretch mode.
+ *
+ * Proceeds only on the leader while in degraded stretch mode. If the
+ * OSDMonitor or the MonmapMonitor is not writeable, a CMonGoHealthy context
+ * is queued so this function runs again later. Asserts that the osdmap is
+ * in recovering stretch mode before triggering the transition on both
+ * services.
+ */
+void Monitor::trigger_healthy_stretch_mode()
+{
+  dout(20) << __func__ << dendl;
+  if (!is_degraded_stretch_mode()) {
+    return;
+  }
+  if (!is_leader()) {
+    return;
+  }
+
+  auto *osds = osdmon();
+  if (!osds->is_writeable()) {
+    dout(20) << "osdmon is not writeable" << dendl;
+    osds->wait_for_writeable_ctx(new CMonGoHealthy(this));
+    return;
+  }
+
+  auto *mons = monmon();
+  if (!mons->is_writeable()) {
+    dout(20) << "monmon is not writeable" << dendl;
+    mons->wait_for_writeable_ctx(new CMonGoHealthy(this));
+    return;
+  }
+
+  ceph_assert(osds->osdmap.recovering_stretch_mode);
+  osds->trigger_healthy_stretch_mode();
+  mons->trigger_healthy_stretch_mode();
+}
+
+/**
+ * Clear the in-memory degraded and recovering stretch flags, then forward
+ * the same transition to the OSDMonitor.
+ */
+void Monitor::set_healthy_stretch_mode()
+{
+  // healthy means neither degraded nor recovering
+  degraded_stretch_mode = recovering_stretch_mode = false;
+  osdmon()->set_healthy_stretch_mode();
+}
+
+/**
+ * Decide whether a session may stay connected to this monitor while
+ * stretch mode is engaged.
+ *
+ * A session is always allowed when stretch mode is not engaged, when it is
+ * proxied, when it was validated earlier, or when it has no connection.
+ * An OSD session is allowed only if the OSD's CRUSH parent of type
+ * stretch_bucket_divider is the bucket named in this monitor's own
+ * crush_loc for that type. A rejected session has its connection marked
+ * down and is removed from the session map.
+ *
+ * @param s  session to check; marked validated on success
+ * @param op request being handled, may be empty; zapped on rejection
+ * @return true if the session is allowed, false if it was discarded
+ */
+bool Monitor::session_stretch_allowed(MonSession *s, MonOpRequestRef& op)
+{
+  if (!is_stretch_mode()) return true;
+  if (s->proxy_con) return true;
+  if (s->validated_stretch_connection) return true;
+  if (!s->con) return true;
+  if (s->con->peer_is_osd()) {
+    // separators added so the function name, text and pointer do not run
+    // together in the log line
+    dout(20) << __func__ << " checking OSD session " << s << dendl;
+    // okay, check the crush location
+    int barrier_id = [&] {
+      auto type_id = osdmon()->osdmap.crush->get_validated_type_id(
+        stretch_bucket_divider);
+      ceph_assert(type_id.has_value());
+      return *type_id;
+    }();
+    int osd_bucket_id = osdmon()->osdmap.crush->get_parent_of_type(s->con->peer_id,
+                                                                   barrier_id);
+    const auto &mi = monmap->mon_info.find(name);
+    ceph_assert(mi != monmap->mon_info.end());
+    auto ci = mi->second.crush_loc.find(stretch_bucket_divider);
+    ceph_assert(ci != mi->second.crush_loc.end());
+    int mon_bucket_id = osdmon()->osdmap.crush->get_item_id(ci->second);
+
+    if (osd_bucket_id != mon_bucket_id) {
+      dout(5) << "discarding session " << *s
+              << " and sending OSD to matched zone" << dendl;
+      s->con->mark_down();
+      std::lock_guard l(session_map_lock);
+      remove_session(s);
+      if (op) {
+        op->mark_zap();
+      }
+      return false;
+    }
+  }
+
+  // remember the verdict so later requests on this session skip the check
+  s->validated_stretch_connection = true;
+  return true;
+}
+
+/**
+ * Run session_stretch_allowed() over every session in session_map, with
+ * an empty op reference, so that disallowed sessions get dropped.
+ */
+void Monitor::disconnect_disallowed_stretch_sessions()
+{
+  dout(20) << __func__ << dendl;
+  MonOpRequestRef no_op;
+  for (auto next = session_map.sessions.begin();
+       next != session_map.sessions.end(); ) {
+    // step past the current entry first: session_stretch_allowed() may
+    // remove that session
+    auto current = next;
+    ++next;
+    session_stretch_allowed(*current, no_op);
+  }
+}
diff --git a/src/mon/Monitor.h b/src/mon/Monitor.h
new file mode 100644
index 000000000..13afacafd
--- /dev/null
+++ b/src/mon/Monitor.h
@@ -0,0 +1,1163 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+/*
+ * This is the top level monitor. It runs on each machine in the Monitor
+ * Cluster. The election of a leader for the paxos algorithm only happens
+ * once per machine via the elector. There is a separate paxos instance (state)
+ * kept for each of the system components: Object Store Device (OSD) Monitor,
+ * Placement Group (PG) Monitor, Metadata Server (MDS) Monitor, and Client Monitor.
+ */
+
+#ifndef CEPH_MONITOR_H
+#define CEPH_MONITOR_H
+
+#include <errno.h>
+#include <cmath>
+#include <string>
+#include <array>
+
+#include "include/types.h"
+#include "include/health.h"
+#include "msg/Messenger.h"
+
+#include "common/Timer.h"
+
+#include "health_check.h"
+#include "MonMap.h"
+#include "Elector.h"
+#include "Paxos.h"
+#include "Session.h"
+#include "MonCommand.h"
+
+
+#include "common/config_obs.h"
+#include "common/LogClient.h"
+#include "auth/AuthClient.h"
+#include "auth/AuthServer.h"
+#include "auth/cephx/CephxKeyServer.h"
+#include "auth/AuthMethodList.h"
+#include "auth/KeyRing.h"
+#include "include/common_fwd.h"
+#include "messages/MMonCommand.h"
+#include "mon/MonitorDBStore.h"
+#include "mgr/MgrClient.h"
+
+#include "mon/MonOpRequest.h"
+#include "common/WorkQueue.h"
+
+using namespace TOPNSPC::common;
+
+#define CEPH_MON_PROTOCOL 13 /* cluster internal */
+
+
+// Counter indices for cluster-wide statistics (mons, OSDs, pools, PGs,
+// objects, bytes). l_cluster_first and l_cluster_last bracket the range.
+// NOTE(review): presumably these back the cluster_logger PerfCounters
+// declared in class Monitor -- confirm in Monitor.cc.
+enum {
+ l_cluster_first = 555000,
+ l_cluster_num_mon,
+ l_cluster_num_mon_quorum,
+ l_cluster_num_osd,
+ l_cluster_num_osd_up,
+ l_cluster_num_osd_in,
+ l_cluster_osd_epoch,
+ l_cluster_osd_bytes,
+ l_cluster_osd_bytes_used,
+ l_cluster_osd_bytes_avail,
+ l_cluster_num_pool,
+ l_cluster_num_pg,
+ l_cluster_num_pg_active_clean,
+ l_cluster_num_pg_active,
+ l_cluster_num_pg_peering,
+ l_cluster_num_object,
+ l_cluster_num_object_degraded,
+ l_cluster_num_object_misplaced,
+ l_cluster_num_object_unfound,
+ l_cluster_num_bytes,
+ l_cluster_last,
+};
+
+// Counter indices for per-monitor statistics (sessions and elections),
+// used with the Monitor's `logger` PerfCounters, e.g.
+// logger->set(l_mon_num_sessions, ...) and logger->inc(l_mon_session_add).
+// l_mon_first and l_mon_last bracket the range.
+enum {
+ l_mon_first = 456000,
+ l_mon_num_sessions,
+ l_mon_session_add,
+ l_mon_session_rm,
+ l_mon_session_trim,
+ l_mon_num_elections,
+ l_mon_election_call,
+ l_mon_election_win,
+ l_mon_election_lose,
+ l_mon_last,
+};
+
+class PaxosService;
+
+class AdminSocketHook;
+
+#define COMPAT_SET_LOC "feature_set"
+
+class Monitor : public Dispatcher,
+ public AuthClient,
+ public AuthServer,
+ public md_config_obs_t {
+public:
+ int orig_argc = 0;
+ const char **orig_argv = nullptr;
+
+ // me
+ std::string name;
+ int rank;
+ Messenger *messenger;
+ ConnectionRef con_self;
+ ceph::mutex lock = ceph::make_mutex("Monitor::lock");
+ SafeTimer timer;
+ Finisher finisher;
+ ThreadPool cpu_tp; ///< threadpool for CPU intensive work
+
+ ceph::mutex auth_lock = ceph::make_mutex("Monitor::auth_lock");
+
+ /// true if we have ever joined a quorum. if false, we are either a
+ /// new cluster, a newly joining monitor, or a just-upgraded
+ /// monitor.
+ bool has_ever_joined;
+
+ PerfCounters *logger, *cluster_logger;
+ bool cluster_logger_registered;
+
+ void register_cluster_logger();
+ void unregister_cluster_logger();
+
+ MonMap *monmap;
+ uuid_d fingerprint;
+
+ std::set<entity_addrvec_t> extra_probe_peers;
+
+ LogClient log_client;
+ LogChannelRef clog;
+ LogChannelRef audit_clog;
+ KeyRing keyring;
+ KeyServer key_server;
+
+ AuthMethodList auth_cluster_required;
+ AuthMethodList auth_service_required;
+
+ CompatSet features;
+
+ std::vector<MonCommand> leader_mon_commands; // quorum leader's commands
+ std::vector<MonCommand> local_mon_commands; // commands i support
+ ceph::buffer::list local_mon_commands_bl; // encoded version of above
+
+ std::vector<MonCommand> prenautilus_local_mon_commands;
+ ceph::buffer::list prenautilus_local_mon_commands_bl;
+
+ Messenger *mgr_messenger;
+ MgrClient mgr_client;
+ uint64_t mgr_proxy_bytes = 0; // in-flight proxied mgr command message bytes
+ std::string gss_ktfile_client{};
+
+private:
+ void new_tick();
+
+ // -- local storage --
+public:
+ MonitorDBStore *store;
+ static const std::string MONITOR_NAME;
+ static const std::string MONITOR_STORE_PREFIX;
+
+ // -- monitor state --
+private:
+ enum {
+ STATE_INIT = 1,
+ STATE_PROBING,
+ STATE_SYNCHRONIZING,
+ STATE_ELECTING,
+ STATE_LEADER,
+ STATE_PEON,
+ STATE_SHUTDOWN
+ };
+ int state = STATE_INIT;
+
+public:
+ /**
+  * Map a monitor state value to its human-readable name.
+  *
+  * @param s one of the STATE_* values
+  * @return a static string naming the state; "???" for any value that is
+  *         not a known state
+  */
+ static const char *get_state_name(int s) {
+   switch (s) {
+   // STATE_INIT is the initial value of `state`; name it instead of
+   // letting it fall through to "???"
+   case STATE_INIT: return "init";
+   case STATE_PROBING: return "probing";
+   case STATE_SYNCHRONIZING: return "synchronizing";
+   case STATE_ELECTING: return "electing";
+   case STATE_LEADER: return "leader";
+   case STATE_PEON: return "peon";
+   case STATE_SHUTDOWN: return "shutdown";
+   default: return "???";
+   }
+ }
+ /// Name of the monitor's current state (see get_state_name(int)).
+ const char *get_state_name() const {
+ return get_state_name(state);
+ }
+
+ // One predicate per STATE_* value; each compares `state` against it.
+ bool is_init() const { return state == STATE_INIT; }
+ bool is_shutdown() const { return state == STATE_SHUTDOWN; }
+ bool is_probing() const { return state == STATE_PROBING; }
+ bool is_synchronizing() const { return state == STATE_SYNCHRONIZING; }
+ bool is_electing() const { return state == STATE_ELECTING; }
+ bool is_leader() const { return state == STATE_LEADER; }
+ bool is_peon() const { return state == STATE_PEON; }
+
+ const utime_t &get_leader_since() const;
+
+ void prepare_new_fingerprint(MonitorDBStore::TransactionRef t);
+
+ std::vector<DaemonHealthMetric> get_health_metrics();
+
+ /// Whole seconds elapsed on ceph::mono_clock since quorum_since.
+ int quorum_age() const {
+   const auto elapsed = ceph::mono_clock::now() - quorum_since;
+   return std::chrono::duration_cast<std::chrono::seconds>(elapsed).count();
+ }
+
+ /**
+  * True if fewer monitors are in quorum than the monmap contains and the
+  * current real time is past the monmap's creation time.
+  */
+ bool is_mon_down() const {
+   const int expected = monmap->size();
+   const int in_quorum = get_quorum().size();
+   const auto now = ceph::real_clock::now();
+   if (in_quorum >= expected) {
+     return false;
+   }
+   return now > monmap->created.to_real_time();
+ }
+
+ // -- elector --
+private:
+ std::unique_ptr<Paxos> paxos;
+ Elector elector;
+ friend class Elector;
+
+ /// features we require of peers (based on on-disk compatset)
+ uint64_t required_features;
+
+ int leader; // current leader (to best of knowledge)
+ std::set<int> quorum; // current active set of monitors (if !starting)
+ ceph::mono_clock::time_point quorum_since; // when quorum formed
+ utime_t leader_since; // when this monitor became the leader, if it is the leader
+ utime_t exited_quorum; // time detected as not in quorum; 0 if in
+
+ // map of counts of connected clients, by type and features, for
+ // each quorum mon
+ std::map<int,FeatureMap> quorum_feature_map;
+
+ /**
+ * Intersection of quorum member's connection feature bits.
+ */
+ uint64_t quorum_con_features;
+ /**
+ * Intersection of quorum members mon-specific feature bits
+ */
+ mon_feature_t quorum_mon_features;
+
+ ceph_release_t quorum_min_mon_release{ceph_release_t::unknown};
+
+ std::set<std::string> outside_quorum;
+
+ bool stretch_mode_engaged{false};
+ bool degraded_stretch_mode{false};
+ bool recovering_stretch_mode{false};
+ std::string stretch_bucket_divider;
+ std::map<std::string, std::set<std::string>> dead_mon_buckets; // bucket->mon names, locations with no live mons
+ std::set<std::string> up_mon_buckets; // locations with a live mon
+ void do_stretch_mode_election_work();
+
+ bool session_stretch_allowed(MonSession *s, MonOpRequestRef& op);
+ void disconnect_disallowed_stretch_sessions();
+ void set_elector_disallowed_leaders(bool allow_election);
+
+ std::map<std::string,std::string> crush_loc;
+ bool need_set_crush_loc{false};
+public:
+ // Read-only views of the in-memory stretch state; const-qualified like the
+ // other state predicates of this class (is_leader(), is_peon(), ...).
+ /// True once stretch mode has been engaged on this monitor.
+ bool is_stretch_mode() const { return stretch_mode_engaged; }
+ /// True while the in-memory state is degraded stretch mode.
+ bool is_degraded_stretch_mode() const { return degraded_stretch_mode; }
+ /// True while the in-memory state is recovering stretch mode.
+ bool is_recovering_stretch_mode() const { return recovering_stretch_mode; }
+
+ /**
+ * This set of functions maintains the in-memory stretch state
+ * and sets up transitions of the map states by calling in to
+ * MonmapMonitor and OSDMonitor.
+ *
+ * The [maybe_]go_* functions are called on the leader to
+ * decide if transitions should happen; the trigger_* functions
+ * set up the map transitions; and the set_* functions actually
+ * change the memory state -- but these are only called
+ * via OSDMonitor::update_from_paxos, to guarantee consistent
+ * updates across the entire cluster.
+ */
+ void try_engage_stretch_mode();
+ void maybe_go_degraded_stretch_mode();
+ void trigger_degraded_stretch_mode(const std::set<std::string>& dead_mons,
+ const std::set<int>& dead_buckets);
+ void set_degraded_stretch_mode();
+ void go_recovery_stretch_mode();
+ void set_recovery_stretch_mode();
+ void trigger_healthy_stretch_mode();
+ void set_healthy_stretch_mode();
+ void enable_stretch_mode();
+ void set_mon_crush_location(const std::string& loc);
+
+
+private:
+
+ /**
+ * @defgroup Monitor_h_scrub
+ * @{
+ */
+ version_t scrub_version; ///< paxos version we are scrubbing
+ std::map<int,ScrubResult> scrub_result; ///< results so far
+
+ /**
+ * trigger a cross-mon scrub
+ *
+ * Verify all mons are storing identical content
+ */
+ int scrub_start();
+ int scrub();
+ void handle_scrub(MonOpRequestRef op);
+ bool _scrub(ScrubResult *r,
+ std::pair<std::string,std::string> *start,
+ int *num_keys);
+ void scrub_check_results();
+ void scrub_timeout();
+ void scrub_finish();
+ void scrub_reset();
+ void scrub_update_interval(ceph::timespan interval);
+
+ Context *scrub_event; ///< periodic event to trigger scrub (leader)
+ Context *scrub_timeout_event; ///< scrub round timeout (leader)
+ void scrub_event_start();
+ void scrub_event_cancel();
+ void scrub_reset_timeout();
+ void scrub_cancel_timeout();
+
+ /// Progress marker for a scrub: the last key visited and whether the
+ /// scrub has completed. Starts out unfinished with an empty key.
+ struct ScrubState {
+   std::pair<std::string,std::string> last_key; ///< last scrubbed key
+   bool finished = false;
+
+   ScrubState() = default;
+   virtual ~ScrubState() = default;
+ };
+ std::shared_ptr<ScrubState> scrub_state; ///< keeps track of current scrub
+
+ /**
+ * @defgroup Monitor_h_sync Synchronization
+ * @{
+ */
+ /**
+ * @} // provider state
+ */
+ /// State kept for one peer that is syncing from this monitor.
+ struct SyncProvider {
+   entity_addrvec_t addrs;
+   uint64_t cookie = 0;          ///< unique cookie for this sync attempt
+   utime_t timeout;              ///< when we give up and expire this attempt
+   version_t last_committed = 0; ///< last paxos version on peer
+   std::pair<std::string,std::string> last_key; ///< last key sent to (or on) peer
+   bool full = false;            ///< full scan?
+   MonitorDBStore::Synchronizer synchronizer;   ///< iterator
+
+   /// Push the expiry to `grace` past the current time (cct is not used).
+   void reset_timeout(CephContext *cct, int grace) {
+     timeout = ceph_clock_now();
+     timeout += grace;
+   }
+ };
+
+ std::map<std::uint64_t, SyncProvider> sync_providers; ///< cookie -> SyncProvider for those syncing from us
+ uint64_t sync_provider_count; ///< counter for issued cookies to keep them unique
+
+ /**
+ * @} // requester state
+ */
+ entity_addrvec_t sync_provider; ///< who we are syncing from
+ uint64_t sync_cookie; ///< 0 if we are starting, non-zero otherwise
+ bool sync_full; ///< true if we are a full sync, false for recent catch-up
+ version_t sync_start_version; ///< last_committed at sync start
+ Context *sync_timeout_event; ///< timeout event
+
+ /**
+ * floor for sync source
+ *
+ * When we sync we forget about our old last_committed value which
+ * can be dangerous. For example, if we have a cluster of:
+ *
+ * mon.a: lc 100
+ * mon.b: lc 80
+ * mon.c: lc 100 (us)
+ *
+ * If something forces us to sync (say, corruption, or manual
+ * intervention, or bug), we forget last_committed, and might abort.
+ * If mon.a happens to be down when we come back, we will see:
+ *
+ * mon.b: lc 80
+ * mon.c: lc 0 (us)
+ *
+ * and sync from mon.b, at which point a+b will both have lc 80 and
+ * come online with a majority holding out of date commits.
+ *
+ * Avoid this by preserving our old last_committed value prior to
+ * sync and never going backwards.
+ */
+ version_t sync_last_committed_floor;
+
+ /**
+ * Obtain the synchronization target prefixes in set form.
+ *
+ * We consider a target prefix all those that are relevant when
+ * synchronizing two stores. That is, all those that hold paxos service's
+ * versions, as well as paxos versions, or any control keys such as the
+ * first or last committed version.
+ *
+ * Given the current design, this function should return the name of all and
+ * any available paxos service, plus the paxos name.
+ *
+ * @returns a set of strings referring to the prefixes being synchronized
+ */
+ std::set<std::string> get_sync_targets_names();
+
+ /**
+ * Reset the monitor's sync-related data structures for syncing *from* a peer
+ */
+ void sync_reset_requester();
+
+ /**
+ * Reset sync state related to allowing others to sync from us
+ */
+ void sync_reset_provider();
+
+ /**
+ * Called when a sync attempt times out (requester-side)
+ */
+ void sync_timeout();
+
+ /**
+ * Get the latest monmap for backup purposes during sync
+ */
+ void sync_obtain_latest_monmap(ceph::buffer::list &bl);
+
+ /**
+ * Start sync process
+ *
+ * Start pulling committed state from another monitor.
+ *
+ * @param addrs where to pull committed state from
+ * @param full whether to do a full sync or just catch up on recent paxos
+ */
+ void sync_start(entity_addrvec_t &addrs, bool full);
+
+public:
+ /**
+ * force a sync on next mon restart
+ */
+ void sync_force(ceph::Formatter *f);
+
+private:
+ /**
+ * store critical state for safekeeping during sync
+ *
+ * We store a few things on the side that we don't want to get clobbered by sync. This
+ * includes the latest monmap and a lower bound on last_committed.
+ */
+ void sync_stash_critical_state(MonitorDBStore::TransactionRef tx);
+
+ /**
+ * reset the sync timeout
+ *
+ * This is used on the client to restart if things aren't progressing
+ */
+ void sync_reset_timeout();
+
+ /**
+ * trim stale sync provider state
+ *
+ * If someone is syncing from us and hasn't talked to us recently, expire their state.
+ */
+ void sync_trim_providers();
+
+ /**
+ * Complete a sync
+ *
+ * Finish up a sync after we've gotten all of the chunks.
+ *
+ * @param last_committed final last_committed value from provider
+ */
+ void sync_finish(version_t last_committed);
+
+ /**
+ * request the next chunk from the provider
+ */
+ void sync_get_next_chunk();
+
+ /**
+ * handle sync message
+ *
+ * @param op Sync message with operation type MMonSync::OP_START_CHUNKS
+ */
+ void handle_sync(MonOpRequestRef op);
+
+ void _sync_reply_no_cookie(MonOpRequestRef op);
+
+ void handle_sync_get_cookie(MonOpRequestRef op);
+ void handle_sync_get_chunk(MonOpRequestRef op);
+ void handle_sync_finish(MonOpRequestRef op);
+
+ void handle_sync_cookie(MonOpRequestRef op);
+ void handle_sync_forward(MonOpRequestRef op);
+ void handle_sync_chunk(MonOpRequestRef op);
+ void handle_sync_no_cookie(MonOpRequestRef op);
+
+ /**
+ * @} // Synchronization
+ */
+
+ std::list<Context*> waitfor_quorum;
+ std::list<Context*> maybe_wait_for_quorum;
+
+ /**
+ * @defgroup Monitor_h_TimeCheck Monitor Clock Drift Early Warning System
+ * @{
+ *
+ * We use time checks to keep track of any clock drifting going on in the
+ * cluster. This is accomplished by periodically pinging each monitor in the
+ * quorum and registering its response time on a map, assessing how much its
+ * clock has drifted. We also take this opportunity to assess the latency
+ * on response.
+ *
+ * This mechanism works as follows:
+ *
+ * - Leader sends out a 'PING' message to each other monitor in the quorum.
+ * The message is timestamped with the leader's current time. The leader's
+ * current time is recorded in a map, associated with each peon's
+ * instance.
+ * - The peon replies to the leader with a timestamped 'PONG' message.
+ * - The leader calculates a delta between the peon's timestamp and its
+ * current time and stashes it.
+ * - The leader also calculates the time it took to receive the 'PONG'
+ * since the 'PING' was sent, and stashes an approximate latency estimate.
+ * - Once all the quorum members have pong'ed, the leader will share the
+ * clock skew and latency maps with all the monitors in the quorum.
+ */
+ std::map<int, utime_t> timecheck_waiting;
+ std::map<int, double> timecheck_skews;
+ std::map<int, double> timecheck_latencies;
+ // odd value means we are mid-round; even value means the round has
+ // finished.
+ version_t timecheck_round;
+ unsigned int timecheck_acks;
+ utime_t timecheck_round_start;
+ friend class HealthMonitor;
+ /* When we hit a skew we will start a new round based off of
+ * 'mon_timecheck_skew_interval'. Each new round will be backed off
+ * until we hit 'mon_timecheck_interval' -- which is the typical
+ * interval when not in the presence of a skew.
+ *
+ * This variable tracks the number of rounds with skews since last clean
+ * so that we can report to the user and properly adjust the backoff.
+ */
+ uint64_t timecheck_rounds_since_clean;
+ /**
+ * Time Check event.
+ */
+ Context *timecheck_event;
+
+ void timecheck_start();
+ void timecheck_finish();
+ void timecheck_start_round();
+ void timecheck_finish_round(bool success = true);
+ void timecheck_cancel_round();
+ void timecheck_cleanup();
+ void timecheck_reset_event();
+ void timecheck_check_skews();
+ void timecheck_report();
+ void timecheck();
+ health_status_t timecheck_status(std::ostringstream &ss,
+ const double skew_bound,
+ const double latency);
+ void handle_timecheck_leader(MonOpRequestRef op);
+ void handle_timecheck_peon(MonOpRequestRef op);
+ void handle_timecheck(MonOpRequestRef op);
+
+ /**
+ * Returns 'true' if this is considered to be a skew; 'false' otherwise.
+ */
+ /**
+  * Test a skew value against the mon_clock_drift_allowed option.
+  *
+  * @param skew_bound skew to test; only its magnitude matters
+  * @param abs        if non-null, receives the absolute value of skew_bound
+  * @return true if the magnitude exceeds mon_clock_drift_allowed
+  */
+ bool timecheck_has_skew(const double skew_bound, double *abs) const {
+   const double magnitude = std::fabs(skew_bound);
+   if (abs != nullptr) {
+     *abs = magnitude;
+   }
+   return magnitude > g_conf()->mon_clock_drift_allowed;
+ }
+
+ /**
+ * @}
+ */
+ /**
+ * Handle ping messages from others.
+ */
+ void handle_ping(MonOpRequestRef op);
+
+ Context *probe_timeout_event = nullptr; // for probing
+
+ void reset_probe_timeout();
+ void cancel_probe_timeout();
+ void probe_timeout(int r);
+
+ void _apply_compatset_features(CompatSet &new_features);
+
+public:
+ epoch_t get_epoch();
+ /// Rank of the current leader, to the best of this monitor's knowledge.
+ int get_leader() const { return leader; }
+ /// Monmap name of the current leader, or an empty string when the quorum
+ /// set is empty.
+ std::string get_leader_name() {
+ return quorum.empty() ? std::string() : monmap->get_name(leader);
+ }
+ /// Ranks of the monitors currently in quorum.
+ const std::set<int>& get_quorum() const { return quorum; }
+ /// Monmap names of the monitors in quorum, in the iteration order of the
+ /// quorum set.
+ std::list<std::string> get_quorum_names() {
+   std::list<std::string> names;
+   for (const int member : quorum) {
+     names.push_back(monmap->get_name(member));
+   }
+   return names;
+ }
+ /// Intersection of the quorum members' connection feature bits.
+ uint64_t get_quorum_con_features() const {
+ return quorum_con_features;
+ }
+ /// Intersection of the quorum members' mon-specific feature bits.
+ mon_feature_t get_quorum_mon_features() const {
+ return quorum_mon_features;
+ }
+ /// Features we require of peers (based on the on-disk compatset).
+ uint64_t get_required_features() const {
+ return required_features;
+ }
+ /// Required mon features as reported by the current monmap.
+ mon_feature_t get_required_mon_features() const {
+ return monmap->get_required_features();
+ }
+ void apply_quorum_to_compatset_features();
+ void apply_monmap_to_compatset_features();
+ void calc_quorum_requirements();
+
+ void get_combined_feature_map(FeatureMap *fm);
+
+private:
+ void _reset(); ///< called from bootstrap, start_, or join_election
+ void wait_for_paxos_write();
+ void _finish_svc_election(); ///< called by {win,lose}_election
+ void respawn();
+public:
+ // Election entry points; declarations only.
+ void bootstrap();
+ void join_election();
+ void start_election();
+ void win_standalone_election();
+ // end election (called by Elector)
+ // Takes the epoch, the quorum set 'q', feature information, the minimum
+ // monitor release and a per-rank metadata map.
+ void win_election(epoch_t epoch, const std::set<int>& q,
+ uint64_t features,
+ const mon_feature_t& mon_features,
+ ceph_release_t min_mon_release,
+ const std::map<int,Metadata>& metadata);
+ // Like win_election() but also takes the leader rank 'l' and no metadata.
+ // NOTE(review): 'q' is a non-const reference here while win_election()
+ // takes it const -- confirm whether lose_election() modifies it.
+ void lose_election(epoch_t epoch, std::set<int>& q, int l,
+ uint64_t features,
+ const mon_feature_t& mon_features,
+ ceph_release_t min_mon_release);
+ // end election (called by Elector)
+ void finish_election();
+
+ void update_logger();
+
+ /**
+ * Array holding the Services serviced by this Monitor.
+ *
+ * Fixed size (PAXOS_NUM entries), each slot owning its PaxosService through
+ * a std::unique_ptr. The accessors below index it with a PAXOS_* constant
+ * and cast the stored pointer to the concrete monitor type.
+ *
+ * NOTE(review): the casts are C-style and the target types are named with
+ * an elaborated 'class X' specifier, which suggests they may be incomplete
+ * in this header; do not convert them to static_cast without checking.
+ * The casts are unchecked: each accessor relies on the slot really holding
+ * the named type.
+ */
+ std::array<std::unique_ptr<PaxosService>, PAXOS_NUM> paxos_service;
+
+ class MDSMonitor *mdsmon() {
+ return (class MDSMonitor *)paxos_service[PAXOS_MDSMAP].get();
+ }
+
+ class MonmapMonitor *monmon() {
+ return (class MonmapMonitor *)paxos_service[PAXOS_MONMAP].get();
+ }
+
+ class OSDMonitor *osdmon() {
+ return (class OSDMonitor *)paxos_service[PAXOS_OSDMAP].get();
+ }
+
+ class AuthMonitor *authmon() {
+ return (class AuthMonitor *)paxos_service[PAXOS_AUTH].get();
+ }
+
+ class LogMonitor *logmon() {
+ return (class LogMonitor*) paxos_service[PAXOS_LOG].get();
+ }
+
+ class MgrMonitor *mgrmon() {
+ return (class MgrMonitor*) paxos_service[PAXOS_MGR].get();
+ }
+
+ class MgrStatMonitor *mgrstatmon() {
+ return (class MgrStatMonitor*) paxos_service[PAXOS_MGRSTAT].get();
+ }
+
+ class HealthMonitor *healthmon() {
+ return (class HealthMonitor*) paxos_service[PAXOS_HEALTH].get();
+ }
+
+ class ConfigMonitor *configmon() {
+ return (class ConfigMonitor*) paxos_service[PAXOS_CONFIG].get();
+ }
+
+ class KVMonitor *kvmon() {
+ return (class KVMonitor*) paxos_service[PAXOS_KV].get();
+ }
+
+ // Classes granted access to Monitor's non-public members.
+ friend class Paxos;
+ friend class OSDMonitor;
+ friend class MDSMonitor;
+ friend class MonmapMonitor;
+ friend class LogMonitor;
+ friend class KVMonitor;
+
+ // -- sessions --
+ // session_map_lock is the mutex with_session_map() holds while handing
+ // session_map to its callback.
+ MonSessionMap session_map;
+ ceph::mutex session_map_lock = ceph::make_mutex("Monitor::session_map_lock");
+ // Raw pointer with no default initializer at this declaration.
+ AdminSocketHook *admin_hook;
+
+ /**
+  * Invoke @p func with the session map while session_map_lock is held.
+  *
+  * The lock is taken before the call and released when this function
+  * returns. Any value returned by @p func is discarded. The Args parameter
+  * pack is not referenced by the body.
+  */
+ template<typename Func, typename...Args>
+ void with_session_map(Func&& func) {
+   std::lock_guard guard(session_map_lock);
+   std::forward<Func>(func)(session_map);
+ }
+ void send_latest_monmap(Connection *con);
+
+ // messages
+ void handle_get_version(MonOpRequestRef op);
+ void handle_subscribe(MonOpRequestRef op);
+ void handle_mon_get_map(MonOpRequestRef op);
+
+ // Static helper: takes a command map and a string->string map, both by
+ // non-const reference.
+ // NOTE(review): presumably fills param_str_map from cmdmap -- confirm.
+ static void _generate_command_map(cmdmap_t& cmdmap,
+ std::map<std::string,std::string> &param_str_map);
+ // Static helper: returns a pointer to a MonCommand given a prefix and a
+ // command list.
+ // NOTE(review): presumably null when no command matches -- confirm.
+ static const MonCommand *_get_moncommand(
+ const std::string &cmd_prefix,
+ const std::vector<MonCommand>& cmds);
+ // Permission check for a command issued on session 's'.
+ bool _allowed_command(MonSession *s, const std::string& module,
+ const std::string& prefix,
+ const cmdmap_t& cmdmap,
+ const std::map<std::string,std::string>& param_str_map,
+ const MonCommand *this_cmd);
+ // Status reporters writing through a Formatter (and, for the latter two,
+ // an ostream).
+ void get_mon_status(ceph::Formatter *f);
+ void _quorum_status(ceph::Formatter *f, std::ostream& ss);
+ bool _add_bootstrap_peer_hint(std::string_view cmd, const cmdmap_t& cmdmap,
+ std::ostream& ss);
+ void handle_tell_command(MonOpRequestRef op);
+ void handle_command(MonOpRequestRef op);
+ void handle_route(MonOpRequestRef op);
+
+ // Both return int and report problems through 'err'.
+ int get_mon_metadata(int mon, ceph::Formatter *f, std::ostream& err);
+ int print_nodes(ceph::Formatter *f, std::ostream& err);
+
+ // track metadata reported by win_election()
+ // Both maps are keyed by int.
+ // NOTE(review): the key is presumably the monitor rank -- confirm.
+ std::map<int, Metadata> mon_metadata;
+ std::map<int, Metadata> pending_metadata;
+
+ /**
+  * Cached health state: an overall status value plus a summary string.
+  */
+ struct health_cache_t {
+   health_status_t overall;
+   std::string summary;
+
+   /// Empties 'summary'; 'overall' is deliberately not modified.
+   void reset() {
+     // There is no NONE-like value in health_status_t, and picking some
+     // other value (HEALTH_ERR, say) would be wrong, so 'overall' keeps
+     // whatever it currently holds.
+     summary.clear();
+   }
+ } health_status_cache;
+
+ // Event pointers for the health tick and the health interval; both start
+ // out null.
+ Context *health_tick_event = nullptr;
+ Context *health_interval_event = nullptr;
+
+ // Start/stop helpers for the two events above; declarations only.
+ void health_tick_start();
+ void health_tick_stop();
+ // Returns a real-clock time point.
+ // NOTE(review): presumably the next health_interval deadline -- confirm.
+ ceph::real_clock::time_point health_interval_calc_next_update();
+ void health_interval_start();
+ void health_interval_stop();
+ void health_events_cleanup();
+
+ // 'changed' is the set of names that changed.
+ // NOTE(review): presumably config option names, as in handle_conf_change().
+ void health_to_clog_update_conf(const std::set<std::string> &changed);
+
+ void do_health_to_clog_interval();
+ // 'force' defaults to false.
+ void do_health_to_clog(bool force = false);
+
+ // Takes the updated and previous health check maps plus a store
+ // transaction reference.
+ void log_health(
+ const health_check_map_t& updated,
+ const health_check_map_t& previous,
+ MonitorDBStore::TransactionRef t);
+
+ void update_pending_metadata();
+
+protected:
+
+ /**
+  * Record kept per health check: a severity, the last message, and a
+  * timestamp. All three are set by the constructor.
+  */
+ class HealthCheckLogStatus {
+ public:
+   health_status_t severity;
+   std::string last_message;
+   utime_t updated_at = 0;
+
+   HealthCheckLogStatus(health_status_t sev,
+                        const std::string &message,
+                        utime_t when)
+     : severity(sev), last_message(message), updated_at(when)
+   {}
+ };
+ /// HealthCheckLogStatus records keyed by string.
+ std::map<std::string, HealthCheckLogStatus> health_check_log_times;
+
+public:
+
+ // Writes the cluster status to 'ss' and/or through formatter 'f'.
+ // NOTE(review): which of the two is used when, and the role of 'session',
+ // is not visible in this header.
+ void get_cluster_status(std::stringstream &ss, ceph::Formatter *f,
+ MonSession *session);
+
+ // Two overloads: the second additionally carries a data payload 'rdata'.
+ void reply_command(MonOpRequestRef op, int rc, const std::string &rs, version_t version);
+ void reply_command(MonOpRequestRef op, int rc, const std::string &rs, ceph::buffer::list& rdata, version_t version);
+
+ void reply_tell_command(MonOpRequestRef op, int rc, const std::string &rs);
+
+
+
+ void handle_probe(MonOpRequestRef op);
+ /**
+ * Handle a Probe Operation, replying with our name, quorum and known versions.
+ *
+ * We use the MMonProbe message class for anything and everything related with
+ * Monitor probing. One of the operations relates directly with the probing
+ * itself, in which we receive a probe request and to which we reply with
+ * our name, our quorum and the known versions for each Paxos service. Thus the
+ * redundant function name. This reply will obviously be sent to the one
+ * probing/requesting these infos.
+ *
+ * @todo Add @pre and @post
+ *
+ * @param op The request carrying a Probe message, with an operation of
+ * type Probe.
+ */
+ void handle_probe_probe(MonOpRequestRef op);
+ void handle_probe_reply(MonOpRequestRef op);
+
+ // request routing
+ /**
+  * Bookkeeping for one routed request.
+  *
+  * A default-constructed instance has tid 0, a null session and zero
+  * connection features. The destructor calls put() on 'session' when it is
+  * non-null, i.e. the object gives up one session reference on destruction.
+  *
+  * NOTE(review): the copy operations are implicitly generated; a copied
+  * instance would call put() a second time on the same session. Instances
+  * appear to be handled by pointer only (see routed_requests) -- confirm
+  * before copying one.
+  */
+ struct RoutedRequest {
+   uint64_t tid;
+   ceph::buffer::list request_bl;
+   MonSession *session;
+   ConnectionRef con;
+   uint64_t con_features;
+   MonOpRequestRef op;
+
+   RoutedRequest() : tid(0), session(nullptr), con_features(0) {}
+   ~RoutedRequest() {
+     if (session)
+       session->put();
+   }
+ };
+ /// Transaction id counter for routed requests; no initializer here.
+ uint64_t routed_request_tid;
+ /// Routed requests keyed by a uint64_t id, stored as raw pointers.
+ /// NOTE(review): the key is presumably RoutedRequest::tid -- confirm.
+ std::map<uint64_t, RoutedRequest*> routed_requests;
+
+ // Request forwarding / reply helpers; declarations only.
+ void forward_request_leader(MonOpRequestRef op);
+ void handle_forward(MonOpRequestRef op);
+ void send_reply(MonOpRequestRef op, Message *reply);
+ void no_reply(MonOpRequestRef op);
+ void resend_routed_requests();
+ void remove_session(MonSession *s);
+ void remove_all_sessions();
+ void waitlist_or_zap_client(MonOpRequestRef op);
+
+ // Sends 'm' to the monitor identified by 'rank'.
+ void send_mon_message(Message *m, int rank);
+ /** can_change_external_state if we can do things like
+ * call elections as a result of the new map.
+ *
+ * Defaults: can_change_external_state=false, remove_rank_elector=true.
+ */
+ void notify_new_monmap(bool can_change_external_state=false, bool remove_rank_elector=true);
+
+public:
+ /**
+  * Completion context that answers a monitor command.
+  *
+  * It stores the reply (return code, status string, optional data payload
+  * and version) at construction time and sends it from _finish().
+  *
+  * _finish(r) behaviour:
+  *  - r >= 0        : write an audit log entry, then reply_command() with
+  *                    the stored values
+  *  - r == -ECANCELED: do nothing
+  *  - r == -EAGAIN  : hand the op back to Monitor::dispatch_op()
+  *  - anything else : abort
+  */
+ struct C_Command : public C_MonOp {
+   Monitor &mon;
+   int rc;
+   std::string rs;
+   ceph::buffer::list rdata;
+   version_t version;
+
+   // 's' and 'rd' are taken by value, so they are moved into the members
+   // rather than copied a second time.
+   C_Command(Monitor &_mm, MonOpRequestRef _op, int r, std::string s, version_t v) :
+     C_MonOp(_op), mon(_mm), rc(r), rs(std::move(s)), version(v){}
+   C_Command(Monitor &_mm, MonOpRequestRef _op, int r, std::string s, ceph::buffer::list rd, version_t v) :
+     C_MonOp(_op), mon(_mm), rc(r), rs(std::move(s)), rdata(std::move(rd)), version(v){}
+
+   void _finish(int r) override {
+     auto m = op->get_req<MMonCommand>();
+     if (r >= 0) {
+       std::ostringstream ss;
+       if (!op->get_req()->get_connection()) {
+         ss << "connection dropped for command ";
+       } else {
+         MonSession *s = op->get_session();
+
+         // if client drops we may not have a session to draw information from.
+         if (s) {
+           ss << "from='" << s->name << " " << s->addrs << "' "
+              << "entity='" << s->entity_name << "' ";
+         } else {
+           ss << "session dropped for command ";
+         }
+       }
+       cmdmap_t cmdmap;
+       std::ostringstream ds;
+       std::string prefix;
+       cmdmap_from_json(m->cmd, &cmdmap, ds);
+       cmd_getval(cmdmap, "prefix", prefix);
+       // The command text is left out of the audit entry for these two
+       // prefixes; the entry itself is still written.
+       // NOTE(review): presumably because their arguments can be secrets.
+       if (prefix != "config set" && prefix != "config-key set")
+         ss << "cmd='" << m->cmd << "': finished";
+
+       mon.audit_clog->info() << ss.str();
+       mon.reply_command(op, rc, rs, rdata, version);
+     }
+     else if (r == -ECANCELED)
+       return;
+     else if (r == -EAGAIN)
+       mon.dispatch_op(op);
+     else
+       ceph_abort_msg("bad C_Command return value");
+   }
+ };
+
+ private:
+ /**
+  * Completion context that re-dispatches its op.
+  *
+  * _finish(r): a non-negative r or -EAGAIN hands the op back to
+  * Monitor::dispatch_op(); -ECANCELED is ignored; any other value aborts.
+  */
+ class C_RetryMessage : public C_MonOp {
+   Monitor *mon;
+ public:
+   C_RetryMessage(Monitor *m, MonOpRequestRef op) :
+     C_MonOp(op), mon(m) { }
+
+   void _finish(int r) override {
+     if (r == -ECANCELED) {
+       return;
+     }
+     if (r >= 0 || r == -EAGAIN) {
+       mon->dispatch_op(op);
+       return;
+     }
+     ceph_abort_msg("bad C_RetryMessage return value");
+   }
+ };
+
+ // ms_dispatch carries a lot of logic that forwarded messages need as well,
+ // so the work lives in _ms_dispatch(), which does not take the lock
+ // itself.
+ void _ms_dispatch(Message *m);
+ /// Locking wrapper: holds 'lock' around _ms_dispatch() and always
+ /// returns true.
+ bool ms_dispatch(Message *m) override {
+   std::lock_guard guard{lock};
+   _ms_dispatch(m);
+   return true;
+ }
+ void dispatch_op(MonOpRequestRef op);
+ //mon_caps is used for un-connected messages from monitors
+ MonCap mon_caps;
+ // Returns bool; the authorizer is handed back through the out-pointer.
+ bool get_authorizer(int dest_type, AuthAuthorizer **authorizer);
+public: // for AuthMonitor msgr1:
+ int ms_handle_fast_authentication(Connection *con) override;
+private:
+ // Connection event overrides. ms_handle_remote_reset() has an empty body
+ // here, so it does nothing.
+ void ms_handle_accept(Connection *con) override;
+ bool ms_handle_reset(Connection *con) override;
+ void ms_handle_remote_reset(Connection *con) override {}
+ bool ms_handle_refused(Connection *con) override;
+
+ // AuthClient
+ // The four overrides below all return int and take the connection and its
+ // AuthConnectionMeta; outputs go through the pointer parameters.
+ int get_auth_request(
+ Connection *con,
+ AuthConnectionMeta *auth_meta,
+ uint32_t *method,
+ std::vector<uint32_t> *preferred_modes,
+ ceph::buffer::list *out) override;
+ int handle_auth_reply_more(
+ Connection *con,
+ AuthConnectionMeta *auth_meta,
+ const ceph::buffer::list& bl,
+ ceph::buffer::list *reply) override;
+ int handle_auth_done(
+ Connection *con,
+ AuthConnectionMeta *auth_meta,
+ uint64_t global_id,
+ uint32_t con_mode,
+ const ceph::buffer::list& bl,
+ CryptoKey *session_key,
+ std::string *connection_secret) override;
+ int handle_auth_bad_method(
+ Connection *con,
+ AuthConnectionMeta *auth_meta,
+ uint32_t old_auth_method,
+ int result,
+ const std::vector<uint32_t>& allowed_methods,
+ const std::vector<uint32_t>& allowed_modes) override;
+ // /AuthClient
+ // AuthServer
+ // 'more' is a bool flag supplied by the caller; the reply payload goes out
+ // through 'reply'.
+ // NOTE(review): 'more' presumably marks a continuation of an exchange
+ // already in progress -- confirm against the AuthServer interface.
+ int handle_auth_request(
+ Connection *con,
+ AuthConnectionMeta *auth_meta,
+ bool more,
+ uint32_t auth_method,
+ const ceph::buffer::list& bl,
+ ceph::buffer::list *reply) override;
+ // /AuthServer
+
+ // Keyring helpers; declarations only.
+ int write_default_keyring(ceph::buffer::list& bl);
+ void extract_save_mon_key(KeyRing& keyring);
+
+ // Metadata helpers. collect_metadata() fills the Metadata object it is
+ // given; count_metadata() has one overload writing through a Formatter and
+ // one filling a string->int map.
+ void collect_metadata(Metadata *m);
+ int load_metadata();
+ void count_metadata(const std::string& field, ceph::Formatter *f);
+ void count_metadata(const std::string& field, std::map<std::string,int> *out);
+ // get_all_versions() gathers version information from daemons for health check
+ void get_all_versions(std::map<std::string, std::list<std::string>> &versions);
+ void get_versions(std::map<std::string, std::list<std::string>> &versions);
+
+ // features
+ // The three static getters return a CompatSet by value.
+ static CompatSet get_initial_supported_features();
+ static CompatSet get_supported_features();
+ static CompatSet get_legacy_features();
+ /// read the ondisk features into the CompatSet pointed to by read_features
+ static void read_features_off_disk(MonitorDBStore *store, CompatSet *read_features);
+ void read_features();
+ // Takes a store transaction reference.
+ // NOTE(review): presumably the features are staged into 't' rather than
+ // written immediately -- confirm in Monitor.cc.
+ void write_features(MonitorDBStore::TransactionRef t);
+
+ OpTracker op_tracker;
+
+ public:
+ // Constructor: takes the context, a name, the backing store, two
+ // messengers and the monmap.
+ // NOTE(review): ownership of the raw pointers is not visible here.
+ Monitor(CephContext *cct_, std::string nm, MonitorDBStore *s,
+ Messenger *m, Messenger *mgr_m, MonMap *map);
+ ~Monitor() override;
+
+ // Static: can be called with just a store, without a Monitor instance.
+ static int check_features(MonitorDBStore *store);
+
+ // config observer
+ const char** get_tracked_conf_keys() const override;
+ void handle_conf_change(const ConfigProxy& conf,
+ const std::set<std::string> &changed) override;
+
+ // Lifecycle entry points; declarations only.
+ void update_log_clients();
+ int sanitize_options();
+ int preinit();
+ int init();
+ void init_paxos();
+ // The bool is written through the pointer supplied by the caller.
+ void refresh_from_paxos(bool *need_bootstrap);
+ void shutdown();
+ void tick();
+
+ void handle_signal(int sig);
+
+ int mkfs(ceph::buffer::list& osdmapbl);
+
+ /**
+ * check cluster_fsid file
+ *
+ * @return EEXIST if file exists and doesn't match, 0 on match, or negative error code
+ */
+ int check_fsid();
+
+ /**
+ * write cluster_fsid file
+ *
+ * The second overload takes a store transaction reference.
+ *
+ * @return 0 on success, or negative error code
+ */
+ int write_fsid();
+ int write_fsid(MonitorDBStore::TransactionRef t);
+
+ // Runs an admin command; regular output goes to 'out' and errors to 'err'.
+ int do_admin_command(std::string_view command, const cmdmap_t& cmdmap,
+ ceph::Formatter *f,
+ std::ostream& err,
+ std::ostream& out);
+
+private:
+ // don't allow copying: both operations are deleted, so an attempted copy
+ // is rejected at compile time instead of failing at link time.
+ Monitor(const Monitor& rhs) = delete;
+ Monitor& operator=(const Monitor &rhs) = delete;
+
+public:
+ /// Static helper: describes 'commands' through formatter 'f' and the
+ /// buffer 'rdata'.
+ static void format_command_descriptions(const std::vector<MonCommand> &commands,
+                                         ceph::Formatter *f,
+                                         uint64_t features,
+                                         ceph::buffer::list *rdata);
+
+ /// Local command table to use for feature set @p f: the regular table when
+ /// @p f contains FEATURE_NAUTILUS, the pre-nautilus one otherwise.
+ const std::vector<MonCommand> &get_local_commands(mon_feature_t f) {
+   if (!f.contains_all(ceph::features::mon::FEATURE_NAUTILUS)) {
+     return prenautilus_local_mon_commands;
+   }
+   return local_mon_commands;
+ }
+ /// Same selection as get_local_commands(), but returns the matching
+ /// buffer list.
+ const ceph::buffer::list& get_local_commands_bl(mon_feature_t f) {
+   if (!f.contains_all(ceph::features::mon::FEATURE_NAUTILUS)) {
+     return prenautilus_local_mon_commands_bl;
+   }
+   return local_mon_commands_bl;
+ }
+ /// Replace the stored leader command list with a copy of @p cmds.
+ void set_leader_commands(const std::vector<MonCommand>& cmds) {
+   leader_mon_commands = cmds;
+ }
+
+ bool is_keyring_required();
+
+public:
+ /// Time point captured in 'starttime'.
+ ceph::coarse_mono_time get_starttime() const {
+   return starttime;
+ }
+ /// Time elapsed since 'starttime' on the coarse monotonic clock, converted
+ /// to milliseconds.
+ std::chrono::milliseconds get_uptime() const {
+   const auto elapsed = ceph::coarse_mono_clock::now() - starttime;
+   return std::chrono::duration_cast<std::chrono::milliseconds>(elapsed);
+ }
+
+private:
+ // Initialised from the coarse monotonic clock when the member is
+ // constructed; const, so it never changes afterwards. The clock is named
+ // with its ceph:: qualifier, as in get_uptime().
+ ceph::coarse_mono_time const starttime = ceph::coarse_mono_clock::now();
+};
+
+// Monitor "incompat" feature definitions. Each macro expands to a
+// CompatSet::Feature built from a number and a description string; the
+// numbers run consecutively from 1 to 16.
+// NOTE(review): presumably the number is the feature id recorded on disk, so
+// existing values must not be renumbered -- confirm against CompatSet.
+#define CEPH_MON_FEATURE_INCOMPAT_BASE CompatSet::Feature (1, "initial feature set (~v.18)")
+#define CEPH_MON_FEATURE_INCOMPAT_GV CompatSet::Feature (2, "global version sequencing (v0.52)")
+#define CEPH_MON_FEATURE_INCOMPAT_SINGLE_PAXOS CompatSet::Feature (3, "single paxos with k/v store (v0.\?)")
+#define CEPH_MON_FEATURE_INCOMPAT_OSD_ERASURE_CODES CompatSet::Feature(4, "support erasure code pools")
+#define CEPH_MON_FEATURE_INCOMPAT_OSDMAP_ENC CompatSet::Feature(5, "new-style osdmap encoding")
+#define CEPH_MON_FEATURE_INCOMPAT_ERASURE_CODE_PLUGINS_V2 CompatSet::Feature(6, "support isa/lrc erasure code")
+#define CEPH_MON_FEATURE_INCOMPAT_ERASURE_CODE_PLUGINS_V3 CompatSet::Feature(7, "support shec erasure code")
+#define CEPH_MON_FEATURE_INCOMPAT_KRAKEN CompatSet::Feature(8, "support monmap features")
+#define CEPH_MON_FEATURE_INCOMPAT_LUMINOUS CompatSet::Feature(9, "luminous ondisk layout")
+#define CEPH_MON_FEATURE_INCOMPAT_MIMIC CompatSet::Feature(10, "mimic ondisk layout")
+#define CEPH_MON_FEATURE_INCOMPAT_NAUTILUS CompatSet::Feature(11, "nautilus ondisk layout")
+#define CEPH_MON_FEATURE_INCOMPAT_OCTOPUS CompatSet::Feature(12, "octopus ondisk layout")
+#define CEPH_MON_FEATURE_INCOMPAT_PACIFIC CompatSet::Feature(13, "pacific ondisk layout")
+#define CEPH_MON_FEATURE_INCOMPAT_QUINCY CompatSet::Feature(14, "quincy ondisk layout")
+#define CEPH_MON_FEATURE_INCOMPAT_REEF CompatSet::Feature(15, "reef ondisk layout")
+#define CEPH_MON_FEATURE_INCOMPAT_SQUID CompatSet::Feature(16, "squid ondisk layout")
+// make sure you add your feature to Monitor::get_supported_features
+
+
+/* LambdaContext wrapper that skips the callback once the Monitor reports
+ * is_shutdown().
+ *
+ * Construct it with braces,
+ *
+ *   new C_MonContext{...}
+ *
+ * and not with parentheses,
+ *
+ *   new C_MonContext(...)
+ *
+ * to work around gcc bug [1].
+ *
+ * [1] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85883
+ */
+template<typename T>
+class C_MonContext : public LambdaContext<T> {
+  const Monitor* mon;
+public:
+  C_MonContext(const Monitor* m, T&& f) :
+    LambdaContext<T>(std::forward<T>(f)),
+    mon(m)
+  {}
+  // Forward to the wrapped callable only while the monitor is not shut down.
+  void finish(int r) override {
+    if (!mon->is_shutdown()) {
+      LambdaContext<T>::finish(r);
+    }
+  }
+};
+
+#endif
diff --git a/src/mon/MonitorDBStore.h b/src/mon/MonitorDBStore.h
new file mode 100644
index 000000000..b7dfc50b0
--- /dev/null
+++ b/src/mon/MonitorDBStore.h
@@ -0,0 +1,823 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+* Ceph - scalable distributed file system
+*
+* Copyright (C) 2012 Inktank, Inc.
+*
+* This is free software; you can redistribute it and/or
+* modify it under the terms of the GNU Lesser General Public
+* License version 2.1, as published by the Free Software
+* Foundation. See file COPYING.
+*/
+#ifndef CEPH_MONITOR_DB_STORE_H
+#define CEPH_MONITOR_DB_STORE_H
+
+#include "include/types.h"
+#include "include/buffer.h"
+#include <set>
+#include <map>
+#include <string>
+#include <boost/scoped_ptr.hpp>
+#include <sstream>
+#include <fstream>
+#include "kv/KeyValueDB.h"
+
+#include "include/ceph_assert.h"
+#include "common/Formatter.h"
+#include "common/Finisher.h"
+#include "common/errno.h"
+#include "common/debug.h"
+#include "common/safe_io.h"
+#include "common/blkdev.h"
+#include "common/PriorityCache.h"
+
+#define dout_context g_ceph_context
+
+class MonitorDBStore
+{
+ // Base path of the store; _open() strips trailing '/' characters and
+ // appends "/store.db" to locate the database.
+ std::string path;
+ // Key/value backend, created in _open() through KeyValueDB::create().
+ boost::scoped_ptr<KeyValueDB> db;
+ // When true, apply_transaction() also dumps each transaction, either in
+ // encoded binary form to dump_fd_binary or as JSON through dump_fmt into
+ // dump_fd_json, depending on mon_debug_dump_json.
+ bool do_dump;
+ // File descriptor for the binary dump; _open() stores a negative errno
+ // here when opening the dump file fails.
+ int dump_fd_binary;
+ std::ofstream dump_fd_json;
+ ceph::JSONFormatter dump_fmt;
+
+
+ // Finisher on which queue_transaction() queues its work and which flush()
+ // waits to drain.
+ Finisher io_work;
+
+ bool is_open;
+
+ public:
+
+ /**
+  * Name of the device backing the store path, as filled in by
+  * get_device_by_path().
+  *
+  * The helper's return value is not checked. 'devname' is zero-initialised,
+  * so if the helper leaves it untouched the result is an empty string.
+  */
+ std::string get_devname() {
+   char devname[4096] = {0};
+   char partition[4096];
+   get_device_by_path(path.c_str(), partition, devname, sizeof(devname));
+   return devname;
+ }
+
+ // Returns a copy of the store's base path.
+ std::string get_path() {
+ return path;
+ }
+
+ // Forwards to the key/value backend's get_priority_cache().
+ std::shared_ptr<PriorityCache::PriCache> get_priority_cache() const {
+ return db->get_priority_cache();
+ }
+
+ /**
+  * One operation recorded in a Transaction.
+  *
+  * 'type' holds one of the Transaction::OP_* codes (0 for a
+  * default-constructed Op). 'prefix' and 'key' identify the target, 'bl'
+  * carries the value for puts, and 'endkey' is the end of the range for
+  * range operations.
+  */
+ struct Op {
+   uint8_t type;
+   std::string prefix;
+   std::string key, endkey;
+   ceph::buffer::list bl;
+
+   Op()
+     : type(0) { }
+   /// Operation on a single key, without a value.
+   Op(int t, const std::string& p, const std::string& k)
+     : type(t), prefix(p), key(k) { }
+   /// Operation on a single key with value 'b'.
+   Op(int t, const std::string& p, const std::string& k, const ceph::buffer::list& b)
+     : type(t), prefix(p), key(k), bl(b) { }
+   /// Operation on the key range from 'start' to 'end'.
+   Op(int t, const std::string& p, const std::string& start, const std::string& end)
+     : type(t), prefix(p), key(start), endkey(end) { }
+
+   /// Field order: type, prefix, key, bl, endkey.
+   void encode(ceph::buffer::list& encode_bl) const {
+     ENCODE_START(2, 1, encode_bl);
+     encode(type, encode_bl);
+     encode(prefix, encode_bl);
+     encode(key, encode_bl);
+     encode(bl, encode_bl);
+     encode(endkey, encode_bl);
+     ENCODE_FINISH(encode_bl);
+   }
+
+   /// Mirror of encode(); 'endkey' is only read when struct_v is at least 2.
+   void decode(ceph::buffer::list::const_iterator& decode_bl) {
+     DECODE_START(2, decode_bl);
+     decode(type, decode_bl);
+     decode(prefix, decode_bl);
+     decode(key, decode_bl);
+     decode(bl, decode_bl);
+     if (struct_v >= 2)
+       decode(endkey, decode_bl);
+     DECODE_FINISH(decode_bl);
+   }
+
+   /// Dumps type, prefix and key; 'endkey' only when it is non-empty. The
+   /// value 'bl' is not dumped.
+   void dump(ceph::Formatter *f) const {
+     f->dump_int("type", type);
+     f->dump_string("prefix", prefix);
+     f->dump_string("key", key);
+     if (!endkey.empty()) {
+       f->dump_string("endkey", endkey);
+     }
+   }
+
+   /// Rough size estimate: a fixed 6 + 1, plus 4 for each of the four
+   /// variable-length fields, plus the fields' own lengths.
+   /// NOTE(review): presumably 6 is the versioned-encoding header, 1 the
+   /// type byte and 4 each length prefix -- confirm against the encoder.
+   int approx_size() const {
+     return 6 + 1 + 4 * 4 +
+       prefix.size() + key.size() + endkey.size() + bl.length();
+   }
+
+   static void generate_test_instances(std::list<Op*>& ls) {
+     // Only a default Op here; the Transaction instances provide the rest
+     // of the coverage.
+     ls.push_back(new Op);
+   }
+ };
+
+ struct Transaction;
+ // Shared-ownership handle used wherever transactions are passed around.
+ typedef std::shared_ptr<Transaction> TransactionRef;
+ // An ordered list of Ops together with running 'keys' and 'bytes'
+ // counters maintained by put()/erase()/erase_range().
+ struct Transaction {
+ std::list<Op> ops;
+ uint64_t bytes, keys;
+
+ // An empty transaction starts at 6 + 4 + 8*2 bytes and zero keys.
+ // NOTE(review): presumably the encoding header (6), the ops list count
+ // (4) and the two uint64_t counters encoded after it (8*2) -- confirm.
+ Transaction() : bytes(6 + 4 + 8*2), keys(0) {}
+
+ // Operation codes stored in Op::type.
+ enum {
+ OP_PUT = 1,
+ OP_ERASE = 2,
+ OP_COMPACT = 3,
+ OP_ERASE_RANGE = 4,
+ };
+
+ /// Record a put of @p bl under (prefix, key) and account for it in the
+ /// key and byte counters.
+ void put(const std::string& prefix, const std::string& key, const ceph::buffer::list& bl) {
+   ops.push_back(Op(OP_PUT, prefix, key, bl));
+   const Op& added = ops.back();
+   ++keys;
+   bytes += added.approx_size();
+ }
+
+ /// Same as above, using the decimal text of @p ver as the key.
+ void put(const std::string& prefix, version_t ver, const ceph::buffer::list& bl) {
+   std::ostringstream key_text;
+   key_text << ver;
+   put(prefix, key_text.str(), bl);
+ }
+
+ /// Record a put whose value is the encoded form of @p ver.
+ void put(const std::string& prefix, const std::string& key, version_t ver) {
+   using ceph::encode;
+   ceph::buffer::list encoded;
+   encode(ver, encoded);
+   put(prefix, key, encoded);
+ }
+
+ /// Record the removal of (prefix, key) and account for it in the counters.
+ void erase(const std::string& prefix, const std::string& key) {
+   ops.push_back(Op(OP_ERASE, prefix, key));
+   const Op& added = ops.back();
+   ++keys;
+   bytes += added.approx_size();
+ }
+
+ /// Same as above, using the decimal text of @p ver as the key.
+ void erase(const std::string& prefix, version_t ver) {
+   std::ostringstream key_text;
+   key_text << ver;
+   erase(prefix, key_text.str());
+ }
+
+ /// Record the removal of the key range from @p begin to @p end under
+ /// @p prefix. Counts as a single key in the counters.
+ void erase_range(const std::string& prefix, const std::string& begin,
+                  const std::string& end) {
+   ops.push_back(Op(OP_ERASE_RANGE, prefix, begin, end));
+   const Op& added = ops.back();
+   ++keys;
+   bytes += added.approx_size();
+ }
+
+ /// Record a compaction of everything under @p prefix, i.e. an OP_COMPACT
+ /// with an empty key and an empty end key. The key and byte counters are
+ /// not touched.
+ void compact_prefix(const std::string& prefix) {
+   ops.push_back(Op(OP_COMPACT, prefix, std::string()));
+ }
+
+ /// Record a compaction of the range from @p start to @p end under
+ /// @p prefix. The key and byte counters are not touched.
+ void compact_range(const std::string& prefix, const std::string& start,
+                    const std::string& end) {
+   Op compaction(OP_COMPACT, prefix, start, end);
+   ops.push_back(compaction);
+ }
+
+ // Encodes the op list followed by the byte and key counters (struct
+ // version 2, compat 1).
+ void encode(ceph::buffer::list& bl) const {
+ ENCODE_START(2, 1, bl);
+ encode(ops, bl);
+ encode(bytes, bl);
+ encode(keys, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ // Mirror of encode(). The counters are only read when struct_v is at
+ // least 2; for older encodings 'bytes' and 'keys' keep whatever values
+ // they had before the call.
+ void decode(ceph::buffer::list::const_iterator& bl) {
+ DECODE_START(2, bl);
+ decode(ops, bl);
+ if (struct_v >= 2) {
+ decode(bytes, bl);
+ decode(keys, bl);
+ }
+ DECODE_FINISH(bl);
+ }
+
+ /// Appends two heap-allocated instances to @p ls: an empty transaction and
+ /// one containing a put, an erase, an erase_range, a compact_prefix and a
+ /// compact_range.
+ static void generate_test_instances(std::list<Transaction*>& ls) {
+   ls.push_back(new Transaction);
+
+   ls.push_back(new Transaction);
+   Transaction *populated = ls.back();
+   ceph::buffer::list value;
+   value.append("value");
+   populated->put("prefix", "key", value);
+   populated->erase("prefix2", "key2");
+   populated->erase_range("prefix3", "key3", "key4");
+   populated->compact_prefix("prefix3");
+   populated->compact_range("prefix4", "from", "to");
+ }
+
+ /**
+  * Move all ops of @p other to the end of this transaction and add its
+  * counters to ours.
+  *
+  * The ops are spliced, so @p other ends up with an empty op list; its own
+  * 'keys' and 'bytes' are left as they were. The whole of other->bytes is
+  * added, including the base amount every Transaction starts with.
+  */
+ void append(TransactionRef other) {
+   keys += other->keys;
+   bytes += other->bytes;
+   ops.splice(ops.end(), other->ops);
+ }
+
+ /// Decode a Transaction from @p bl and append() it to this one.
+ void append_from_encoded(ceph::buffer::list& bl) {
+   TransactionRef decoded = std::make_shared<Transaction>();
+   auto cursor = bl.cbegin();
+   decoded->decode(cursor);
+   append(decoded);
+ }
+
+ /// True when the transaction holds no ops. Const, like size(), so it can
+ /// be called on a const Transaction.
+ bool empty() const {
+   return ops.empty();
+ }
+
+ // Number of ops recorded (compaction ops included).
+ size_t size() const {
+ return ops.size();
+ }
+ // Running key counter; incremented by put/erase/erase_range.
+ uint64_t get_keys() const {
+ return keys;
+ }
+ // Running approximate byte counter.
+ uint64_t get_bytes() const {
+ return bytes;
+ }
+
+ /**
+  * Dump the transaction through @p f: an "ops" array with one object per
+  * op, followed by "num_keys" and "num_bytes".
+  *
+  * @param f        formatter to write to
+  * @param dump_val when true, PUT ops also carry a hexdump of their value
+  *                 under "bl"
+  */
+ void dump(ceph::Formatter *f, bool dump_val=false) const {
+   f->open_object_section("transaction");
+   f->open_array_section("ops");
+   int op_num = 0;
+   for (const Op& op : ops) {
+     f->open_object_section("op");
+     f->dump_int("op_num", op_num++);
+     switch (op.type) {
+     case OP_PUT:
+       f->dump_string("type", "PUT");
+       f->dump_string("prefix", op.prefix);
+       f->dump_string("key", op.key);
+       f->dump_unsigned("length", op.bl.length());
+       if (dump_val) {
+         std::ostringstream hex;
+         op.bl.hexdump(hex);
+         f->dump_string("bl", hex.str());
+       }
+       break;
+     case OP_ERASE:
+       f->dump_string("type", "ERASE");
+       f->dump_string("prefix", op.prefix);
+       f->dump_string("key", op.key);
+       break;
+     case OP_ERASE_RANGE:
+     case OP_COMPACT:
+       // Both range-style ops share the same layout; only the label differs.
+       f->dump_string("type",
+                      op.type == OP_ERASE_RANGE ? "ERASE_RANGE" : "COMPACT");
+       f->dump_string("prefix", op.prefix);
+       f->dump_string("start", op.key);
+       f->dump_string("end", op.endkey);
+       break;
+     default:
+       f->dump_string("type", "unknown");
+       f->dump_unsigned("op_code", op.type);
+       break;
+     }
+     f->close_section();
+   }
+   f->close_section();
+   f->dump_unsigned("num_keys", keys);
+   f->dump_unsigned("num_bytes", bytes);
+   f->close_section();
+ }
+ };
+
+ /**
+  * Apply transaction @p t to the key/value backend synchronously.
+  *
+  * Steps:
+  *  1. if dumping is enabled, write the transaction to the debug dump
+  *     (binary or JSON, depending on mon_debug_dump_json);
+  *  2. translate every op into the backend transaction; compaction ops are
+  *     collected aside;
+  *  3. submit the backend transaction synchronously;
+  *  4. on success, issue the collected compactions asynchronously.
+  *
+  * An unknown op type or a failed submit aborts the process.
+  *
+  * @return the (non-negative) result of submit_transaction_sync()
+  */
+ int apply_transaction(MonitorDBStore::TransactionRef t) {
+   KeyValueDB::Transaction dbt = db->get_transaction();
+
+   if (do_dump) {
+     if (!g_conf()->mon_debug_dump_json) {
+       // _open() stores a negative errno in dump_fd_binary when the dump
+       // file could not be opened; do not encode and write to an invalid
+       // descriptor in that case.
+       if (dump_fd_binary >= 0) {
+         ceph::buffer::list bl;
+         t->encode(bl);
+         bl.write_fd(dump_fd_binary);
+       }
+     } else {
+       t->dump(&dump_fmt, true);
+       dump_fmt.flush(dump_fd_json);
+       dump_fd_json.flush();
+     }
+   }
+
+   // (prefix, (start, end)) of every requested compaction, in op order.
+   std::list<std::pair<std::string, std::pair<std::string,std::string>>> compact;
+   for (const Op& op : t->ops) {
+     switch (op.type) {
+     case Transaction::OP_PUT:
+       dbt->set(op.prefix, op.key, op.bl);
+       break;
+     case Transaction::OP_ERASE:
+       dbt->rmkey(op.prefix, op.key);
+       break;
+     case Transaction::OP_ERASE_RANGE:
+       dbt->rm_range_keys(op.prefix, op.key, op.endkey);
+       break;
+     case Transaction::OP_COMPACT:
+       compact.push_back(make_pair(op.prefix, make_pair(op.key, op.endkey)));
+       break;
+     default:
+       // op.type is a uint8_t: print it as a number, not as a raw character.
+       derr << __func__ << " unknown op type " << static_cast<int>(op.type) << dendl;
+       ceph_abort();
+       break;
+     }
+   }
+   int r = db->submit_transaction_sync(dbt);
+   if (r >= 0) {
+     for (const auto& request : compact) {
+       const std::string& prefix = request.first;
+       const std::string& start = request.second.first;
+       const std::string& end = request.second.second;
+       // No start and no end means "compact the whole prefix".
+       if (start.empty() && end.empty())
+         db->compact_prefix_async(prefix);
+       else
+         db->compact_range_async(prefix, start, end);
+     }
+   } else {
+     ceph_abort_msg("failed to write to db");
+   }
+   return r;
+ }
+
+ /**
+  * Context that applies one transaction to the store and then completes
+  * 'oncommit' with the result of apply_transaction().
+  */
+ struct C_DoTransaction : public Context {
+   MonitorDBStore *store;
+   MonitorDBStore::TransactionRef t;
+   Context *oncommit;
+
+   C_DoTransaction(MonitorDBStore *s, MonitorDBStore::TransactionRef t,
+                   Context *f)
+     : store(s), t(t), oncommit(f)
+   {}
+
+   void finish(int r) override {
+     /* Writes are serialized by the store: the io_work Finisher handles
+      * one transaction after another, so while one transaction is being
+      * made durable no other one is processed.
+      *
+      * That makes it safe to inject a random delay here, before applying
+      * the transaction, without breaking the model.
+      */
+     const double delay_prob = g_conf()->mon_inject_transaction_delay_probability;
+     const bool inject_delay =
+       delay_prob && (rand() % 10000 < delay_prob * 10000.0);
+     if (inject_delay) {
+       const double delay_max = g_conf()->mon_inject_transaction_delay_max;
+       utime_t delay;
+       delay.set_from_double(delay_max * (double)(rand() % 10000) / 10000.0);
+       lsubdout(g_ceph_context, mon, 1)
+         << "apply_transaction will be delayed for " << delay
+         << " seconds" << dendl;
+       delay.sleep();
+     }
+     const int ret = store->apply_transaction(t);
+     oncommit->complete(ret);
+   }
+ };
+
+ /**
+  * Queue transaction @p t to be committed asynchronously.
+  *
+  * The work is wrapped in a C_DoTransaction and queued on io_work;
+  * @p oncommit is completed once the transaction has been applied
+  * (without any locks held).
+  */
+ void queue_transaction(MonitorDBStore::TransactionRef t,
+                        Context *oncommit) {
+   Context *work = new C_DoTransaction(this, t, oncommit);
+   io_work.queue(work);
+ }
+
+ /**
+  * Block until io_work has no queued work left, i.e. all io activity has
+  * been flushed.
+  */
+ void flush() {
+   io_work.wait_for_empty();
+ }
+
+ /**
+  * Abstract base for iterators that hand out the store in chunks.
+  *
+  * Subclasses provide _is_valid(), get_chunk_tx() and get_next_key().
+  * Construction and destruction are protected.
+  */
+ class StoreIteratorImpl {
+ protected:
+   bool done;
+   std::pair<std::string,std::string> last_key;
+   ceph::buffer::list crc_bl;
+
+   StoreIteratorImpl() : done(false) { }
+   virtual ~StoreIteratorImpl() { }
+
+   virtual bool _is_valid() = 0;
+
+ public:
+   /// crc32c over crc_bl when mon_sync_debug is set, 0 otherwise.
+   __u32 crc() {
+     if (!g_conf()->mon_sync_debug) {
+       return 0;
+     }
+     return crc_bl.crc32c(0);
+   }
+   /// Copy of the key pair currently stored in last_key.
+   std::pair<std::string,std::string> get_last_key() {
+     return last_key;
+   }
+   /// True while the iterator is not marked done and is still valid.
+   virtual bool has_next_chunk() {
+     if (done) {
+       return false;
+     }
+     return _is_valid();
+   }
+   virtual void get_chunk_tx(TransactionRef tx, uint64_t max_bytes,
+                             uint64_t max_keys) = 0;
+   virtual std::pair<std::string,std::string> get_next_key() = 0;
+ };
+ typedef std::shared_ptr<StoreIteratorImpl> Synchronizer;
+
+ /**
+  * StoreIteratorImpl over the whole key space, restricted to a set of
+  * prefixes (copied at construction).
+  */
+ class WholeStoreIteratorImpl : public StoreIteratorImpl {
+   KeyValueDB::WholeSpaceIterator iter;
+   std::set<std::string> sync_prefixes;
+
+ public:
+   WholeStoreIteratorImpl(KeyValueDB::WholeSpaceIterator iter,
+                          std::set<std::string> &prefixes)
+     : StoreIteratorImpl(),
+       iter(iter),
+       sync_prefixes(prefixes)
+   { }
+
+   ~WholeStoreIteratorImpl() override { }
+
+   /**
+    * Obtain a chunk of the store
+    *
+    * Walks the iterator forward, adding a put for every entry whose prefix
+    * is in sync_prefixes to @p tx. The first matching entry is always
+    * added to an empty @p tx; after that an entry is only added while the
+    * byte and key limits are respected. When an entry does not fit, its
+    * (prefix, key) is stored in last_key and the function returns without
+    * advancing past it. When the iterator runs out, 'done' is set.
+    *
+    * @param tx        Transaction that receives the puts recreating the chunk
+    * @param max_bytes Byte limit, compared against the transaction's
+    *                  approximate byte count plus the entry's sizes
+    * @param max_keys  Key limit, compared against the transaction's key count
+    */
+   void get_chunk_tx(TransactionRef tx, uint64_t max_bytes,
+                     uint64_t max_keys) override {
+     using ceph::encode;
+     ceph_assert(done == false);
+     ceph_assert(iter->valid() == true);
+
+     while (iter->valid()) {
+       std::string prefix(iter->raw_key().first);
+       std::string key(iter->raw_key().second);
+       if (sync_prefixes.count(prefix)) {
+         ceph::buffer::list value = iter->value();
+         const bool fits =
+           tx->empty() ||
+           (tx->get_bytes() + value.length() + key.size() +
+            prefix.size() < max_bytes &&
+            tx->get_keys() < max_keys);
+         if (!fits) {
+           last_key.first = prefix;
+           last_key.second = key;
+           return;
+         }
+         // NOTE: putting every key in a separate transaction is
+         // questionable as far as efficiency goes
+         auto single(std::make_shared<Transaction>());
+         single->put(prefix, key, value);
+         tx->append(single);
+         if (g_conf()->mon_sync_debug) {
+           encode(prefix, crc_bl);
+           encode(key, crc_bl);
+           encode(value, crc_bl);
+         }
+       }
+       iter->next();
+     }
+     ceph_assert(iter->valid() == false);
+     done = true;
+   }
+
+   /**
+    * Return the next raw key whose prefix is in sync_prefixes and advance
+    * the iterator past it. Entries with other prefixes are skipped. An
+    * empty pair is returned when the iterator runs out first.
+    */
+   std::pair<std::string,std::string> get_next_key() override {
+     ceph_assert(iter->valid());
+
+     while (iter->valid()) {
+       std::pair<std::string,std::string> candidate = iter->raw_key();
+       iter->next();
+       if (sync_prefixes.count(candidate.first) > 0) {
+         return candidate;
+       }
+     }
+     return std::pair<std::string,std::string>();
+   }
+
+   bool _is_valid() override {
+     return iter->valid();
+   }
+ };
+
+ /**
+  * Create a Synchronizer over the whole store, restricted to @p prefixes.
+  *
+  * When both halves of @p key are non-empty the iterator is positioned
+  * with upper_bound(key.first, key.second); otherwise it starts at the
+  * first entry.
+  */
+ Synchronizer get_synchronizer(std::pair<std::string,std::string> &key,
+                               std::set<std::string> &prefixes) {
+   KeyValueDB::WholeSpaceIterator iter = db->get_wholespace_iterator();
+
+   const bool start_from_beginning = key.first.empty() || key.second.empty();
+   if (start_from_beginning) {
+     iter->seek_to_first();
+   } else {
+     iter->upper_bound(key.first, key.second);
+   }
+
+   return std::shared_ptr<StoreIteratorImpl>(
+     new WholeStoreIteratorImpl(iter, prefixes));
+ }
+
+ /// Iterator over the keys of @p prefix (which must not be empty),
+ /// positioned at the first entry.
+ KeyValueDB::Iterator get_iterator(const std::string &prefix) {
+   ceph_assert(!prefix.empty());
+   KeyValueDB::Iterator it = db->get_iterator(prefix);
+   it->seek_to_first();
+   return it;
+ }
+
+ /// Iterator over the whole key space, positioned at the first entry.
+ KeyValueDB::WholeSpaceIterator get_iterator() {
+   KeyValueDB::WholeSpaceIterator it = db->get_wholespace_iterator();
+   it->seek_to_first();
+   return it;
+ }
+
+ /// Read (prefix, key) into @p bl, which must be empty on entry. Returns
+ /// whatever the backend's get() returns.
+ int get(const std::string& prefix, const std::string& key, ceph::buffer::list& bl) {
+   ceph_assert(bl.length() == 0);
+   const int r = db->get(prefix, key, &bl);
+   return r;
+ }
+
+ /// Same as above, using the decimal text of @p ver as the key.
+ int get(const std::string& prefix, const version_t ver, ceph::buffer::list& bl) {
+   std::ostringstream key_text;
+   key_text << ver;
+   return get(prefix, key_text.str(), bl);
+ }
+
+ /**
+  * Read (prefix, key) and decode it as a version_t.
+  *
+  * A missing key (-ENOENT) yields 0. Any other negative result is logged
+  * and aborts, since an error cannot be expressed in the version_t return
+  * value. The stored value must not be empty.
+  */
+ version_t get(const std::string& prefix, const std::string& key) {
+   using ceph::decode;
+   ceph::buffer::list bl;
+   const int err = get(prefix, key, bl);
+   if (err == -ENOENT) {
+     // a key that does not exist is treated as holding 0
+     return 0;
+   }
+   if (err < 0) {
+     // no other negative value is expected, and a negative value cannot be
+     // returned through a version_t
+     generic_dout(0) << "MonitorDBStore::get() error obtaining"
+                     << " (" << prefix << ":" << key << "): "
+                     << cpp_strerror(err) << dendl;
+     ceph_abort_msg("error obtaining key");
+   }
+
+   ceph_assert(bl.length());
+   version_t ver;
+   auto cursor = bl.cbegin();
+   decode(ver, cursor);
+   return ver;
+ }
+
+ /// True when (prefix, key) is present: the iterator is positioned with
+ /// lower_bound(key) and must land on exactly that key. A negative result
+ /// from lower_bound() counts as "not present".
+ bool exists(const std::string& prefix, const std::string& key) {
+   KeyValueDB::Iterator it = db->get_iterator(prefix);
+   if (it->lower_bound(key) < 0) {
+     return false;
+   }
+   if (!it->valid()) {
+     return false;
+   }
+   return it->key() == key;
+ }
+
+ /// Same as above, using the decimal text of @p ver as the key.
+ bool exists(const std::string& prefix, version_t ver) {
+   std::ostringstream key_text;
+   key_text << ver;
+   return exists(prefix, key_text.str());
+ }
+
+ // Join `prefix` and `value` with a single '_' separator.
+ std::string combine_strings(const std::string& prefix, const std::string& value) {
+   return prefix + '_' + value;
+ }
+
+ // Same as the string overload, with the version formatted via operator<<.
+ std::string combine_strings(const std::string& prefix, const version_t ver) {
+   std::ostringstream ver_text;
+   ver_text << ver;
+   const std::string value = ver_text.str();
+   return combine_strings(prefix, value);
+ }
+
+ // Remove a single key in its own synchronously-submitted transaction.
+ // Both the prefix and the key must be non-empty.
+ int clear_key(const std::string& prefix, const std::string& key) {
+   ceph_assert(!prefix.empty());
+   ceph_assert(!key.empty());
+   KeyValueDB::Transaction txn = db->get_transaction();
+   txn->rmkey(prefix, key);
+   const int rc = db->submit_transaction_sync(txn);
+   return rc;
+ }
+
+ // Remove every key under each of the given prefixes in one synchronous
+ // transaction; a failed submit is fatal.
+ void clear(std::set<std::string>& prefixes) {
+   KeyValueDB::Transaction txn = db->get_transaction();
+   for (const auto& prefix : prefixes) {
+     txn->rmkeys_by_prefix(prefix);
+   }
+
+   const int r = db->submit_transaction_sync(txn);
+   ceph_assert(r >= 0);
+ }
+
+ // Instantiate the `kv_type` backend at <path>/store.db (trailing slashes
+ // on the store path are ignored), set up transaction dumping when
+ // mon_debug_dump_transactions is enabled, then run the backend's init().
+ // Aborts if the backend cannot be created.
+ void _open(const std::string& kv_type) {
+   // keep everything up to the last character that is not a '/'
+   const std::string::size_type last = path.find_last_not_of('/');
+   const std::string base =
+     (last == std::string::npos) ? std::string() : path.substr(0, last + 1);
+   const std::string full_path = base + "/store.db";
+
+   KeyValueDB *db_ptr = KeyValueDB::create(g_ceph_context, kv_type, full_path);
+   if (!db_ptr) {
+     derr << __func__ << " error initializing "
+          << kv_type << " db back storage in "
+          << full_path << dendl;
+     ceph_abort_msg("MonitorDBStore: error initializing keyvaluedb back storage");
+   }
+   db.reset(db_ptr);
+
+   if (g_conf()->mon_debug_dump_transactions) {
+     if (g_conf()->mon_debug_dump_json) {
+       // JSON dump: formatter output is streamed to a file
+       dump_fmt.reset();
+       dump_fmt.open_array_section("dump");
+       dump_fd_json.open(g_conf()->mon_debug_dump_location.c_str());
+     } else {
+       // binary dump: append to a raw file descriptor
+       dump_fd_binary = ::open(
+         g_conf()->mon_debug_dump_location.c_str(),
+         O_CREAT|O_APPEND|O_WRONLY|O_CLOEXEC, 0644);
+       if (dump_fd_binary < 0) {
+         // remember the failure as a negative errno
+         dump_fd_binary = -errno;
+         derr << "Could not open log file, got "
+              << cpp_strerror(dump_fd_binary) << dendl;
+       }
+     }
+     do_dump = true;
+   }
+
+   if (kv_type == "rocksdb") {
+     db->init(g_conf()->mon_rocksdb_options);
+   } else {
+     db->init();
+   }
+ }
+
+ // Open an existing store. The backend type comes from the "kv_backend"
+ // meta key; when that is missing or empty, "rocksdb" is assumed and
+ // recorded. On success the I/O work queue is started and the store is
+ // marked open. Returns 0 or a negative error code.
+ int open(std::ostream &out) {
+   std::string kv_type;
+   const int meta_r = read_meta("kv_backend", &kv_type);
+   if (meta_r < 0 || kv_type.empty()) {
+     // assume old monitors that did not mark the type were RocksDB.
+     kv_type = "rocksdb";
+     const int write_r = write_meta("kv_backend", kv_type);
+     if (write_r < 0) {
+       return write_r;
+     }
+   }
+
+   _open(kv_type);
+   const int open_r = db->open(out);
+   if (open_r < 0) {
+     return open_r;
+   }
+
+   // There are only a handful of monitors, so publishing very detailed
+   // stats is cheap: raise the priority of all of the KV store's perf
+   // counters. This has to wait until after open, since the backend may
+   // not have built its PerfCounters before that.
+   if (auto counters = db->get_perf_counters()) {
+     counters->set_prio_adjust(
+       PerfCountersBuilder::PRIO_USEFUL - PerfCountersBuilder::PRIO_DEBUGONLY);
+   }
+
+   io_work.start();
+   is_open = true;
+   return 0;
+ }
+
+ // Create the store if needed and open it. When the "kv_backend" meta key
+ // cannot be read, the configured mon_keyvaluedb type is used and recorded
+ // before the backend is opened. Returns 0 or a negative error code.
+ int create_and_open(std::ostream &out) {
+   // record the type before open
+   std::string kv_type;
+   if (read_meta("kv_backend", &kv_type) < 0) {
+     kv_type = g_conf()->mon_keyvaluedb;
+     const int write_r = write_meta("kv_backend", kv_type);
+     if (write_r < 0) {
+       return write_r;
+     }
+   }
+
+   _open(kv_type);
+   const int create_r = db->create_and_open(out);
+   if (create_r < 0) {
+     return create_r;
+   }
+
+   io_work.start();
+   is_open = true;
+   return 0;
+ }
+
+ // Shut the store down: the I/O work queue must already be empty; it is
+ // stopped, the store is marked closed and the backend handle is released.
+ void close() {
+ // there should be no work queued!
+ ceph_assert(io_work.is_empty());
+ io_work.stop();
+ is_open = false;
+ // drop our KeyValueDB handle
+ db.reset(NULL);
+ }
+
+ // Forward a compaction request to the backing KeyValueDB.
+ void compact() {
+ db->compact();
+ }
+
+ // Forward to the backing KeyValueDB's compact_async().
+ void compact_async() {
+ db->compact_async();
+ }
+
+ // Forward a compaction request for the given prefix to the backing
+ // KeyValueDB.
+ void compact_prefix(const std::string& prefix) {
+ db->compact_prefix(prefix);
+ }
+
+ // Return the backing KeyValueDB's size estimate; `extras` is passed
+ // through to the backend, which may fill it in.
+ uint64_t get_estimated_size(std::map<std::string, uint64_t> &extras) {
+ return db->get_estimated_size(extras);
+ }
+
+ /**
+  * write_meta - persist one out-of-band configuration key
+  *
+  * Stores a small key/value pair describing the store itself (for
+  * instance a uuid or a magic number) so that it is available while the
+  * store is unopened/unmounted. The value, followed by a newline, is
+  * written to a plaintext file in the store path.
+  *
+  * @param key key name (e.g., "fsid")
+  * @param value value (e.g., a uuid rendered as a string)
+  * @returns 0 for success, or an error code
+  */
+ int write_meta(const std::string& key,
+                const std::string& value) const {
+   const std::string line = value + "\n";
+   const int r = safe_write_file(path.c_str(), key.c_str(),
+                                 line.c_str(), line.length(),
+                                 0600);
+   // any non-negative result is reported as plain success
+   return (r < 0) ? r : 0;
+ }
+
+ /**
+  * read_meta - read a simple configuration key out-of-band
+  *
+  * Read a simple key value from an unopened/unmounted store.
+  *
+  * Trailing whitespace is stripped off.
+  *
+  * @param key key name
+  * @param value pointer to value string; only assigned when data was read
+  *              (a read result <= 0 returns early and leaves it untouched)
+  * @returns 0 for success, or an error code
+  */
+ int read_meta(const std::string& key,
+               std::string *value) const {
+   char buf[4096];
+   int r = safe_read_file(path.c_str(), key.c_str(),
+                          buf, sizeof(buf));
+   if (r <= 0)
+     return r;
+   // drop trailing whitespace (including the newline write_meta appends).
+   // isspace() is only defined for values representable as unsigned char,
+   // so convert first: plain char may be signed and bytes >= 0x80 would
+   // otherwise be passed as negative ints.
+   while (r && isspace(static_cast<unsigned char>(buf[r-1]))) {
+     --r;
+   }
+   *value = std::string(buf, r);
+   return 0;
+ }
+
+ // Construct a closed store rooted at `path`. No backend is created here:
+ // the db handle starts out null, transaction dumping is off and the
+ // binary dump descriptor is -1 until _open() sets them up.
+ explicit MonitorDBStore(const std::string& path)
+ : path(path),
+ db(0),
+ do_dump(false),
+ dump_fd_binary(-1),
+ dump_fmt(true),
+ io_work(g_ceph_context, "monstore", "fn_monstore"),
+ is_open(false) {
+ }
+ // The store must have been close()d before it is destroyed. Also tears
+ // down whichever transaction dump target _open() set up, if any.
+ ~MonitorDBStore() {
+   ceph_assert(!is_open);
+   if (do_dump) {
+     if (!g_conf()->mon_debug_dump_json) {
+       // _open() stores a negative errno in dump_fd_binary when ::open()
+       // failed (and still enables do_dump); only close a real descriptor.
+       if (dump_fd_binary >= 0) {
+         ::close(dump_fd_binary);
+       }
+     } else {
+       // finish the "dump" array section and push everything out to the file
+       dump_fmt.close_section();
+       dump_fmt.flush(dump_fd_json);
+       dump_fd_json.flush();
+       dump_fd_json.close();
+     }
+   }
+ }
+
+};
+
+WRITE_CLASS_ENCODER(MonitorDBStore::Op)
+WRITE_CLASS_ENCODER(MonitorDBStore::Transaction)
+
+#endif /* CEPH_MONITOR_DB_STORE_H */
diff --git a/src/mon/MonmapMonitor.cc b/src/mon/MonmapMonitor.cc
new file mode 100644
index 000000000..1226c8a82
--- /dev/null
+++ b/src/mon/MonmapMonitor.cc
@@ -0,0 +1,1462 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2009 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "MonmapMonitor.h"
+#include "Monitor.h"
+#include "OSDMonitor.h"
+#include "messages/MMonCommand.h"
+#include "messages/MMonJoin.h"
+
+#include "common/ceph_argparse.h"
+#include "common/errno.h"
+#include <sstream>
+#include "common/config.h"
+#include "common/cmdparse.h"
+
+#include "include/ceph_assert.h"
+#include "include/stringify.h"
+
+#define dout_subsys ceph_subsys_mon
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, mon)
+using namespace TOPNSPC::common;
+
+using std::cout;
+using std::dec;
+using std::hex;
+using std::list;
+using std::map;
+using std::make_pair;
+using std::ostream;
+using std::ostringstream;
+using std::pair;
+using std::set;
+using std::setfill;
+using std::string;
+using std::stringstream;
+using std::to_string;
+using std::vector;
+using std::unique_ptr;
+
+using ceph::bufferlist;
+using ceph::decode;
+using ceph::encode;
+using ceph::Formatter;
+using ceph::JSONFormatter;
+using ceph::make_message;
+using ceph::mono_clock;
+using ceph::mono_time;
+using ceph::timespan_str;
+// Prefix for debug output from this file: monitor name, rank, state name
+// and the epoch of the monitor's current monmap.
+static ostream& _prefix(std::ostream *_dout, Monitor &mon) {
+  std::ostream& out = *_dout;
+  out << "mon." << mon.name << "@" << mon.rank
+      << "(" << mon.get_state_name()
+      << ").monmap v" << mon.monmap->epoch << " ";
+  return out;
+}
+
+// Seed the pending map from the monitor's current monmap as epoch 1.
+// Unless mon_debug_no_initial_persistent_features is set, the default
+// persistent features and the current release are recorded as well.
+void MonmapMonitor::create_initial()
+{
+  dout(10) << __func__ << " using current monmap" << dendl;
+  pending_map = *mon.monmap;
+  pending_map.epoch = 1;
+
+  if (!g_conf()->mon_debug_no_initial_persistent_features) {
+    // initialize with default persistent features for new clusters
+    pending_map.persistent_features = ceph::features::mon::get_persistent();
+    pending_map.min_mon_release = ceph_release();
+    return;
+  }
+
+  derr << __func__ << " mon_debug_no_initial_persistent_features=true"
+       << dendl;
+}
+
+// Reload mon.monmap from the store when the last committed monmap version
+// is newer than the epoch of the map currently held in memory.
+//
+// @param need_bootstrap optional out-flag; set to true when a newer map is
+//        about to be loaded. Never written otherwise.
+void MonmapMonitor::update_from_paxos(bool *need_bootstrap)
+{
+ version_t version = get_last_committed();
+ // nothing to do if the in-memory map is already at (or past) this version
+ if (version <= mon.monmap->get_epoch())
+ return;
+
+ dout(10) << __func__ << " version " << version
+ << ", my v " << mon.monmap->epoch << dendl;
+
+ if (need_bootstrap && version != mon.monmap->get_epoch()) {
+ dout(10) << " signaling that we need a bootstrap" << dendl;
+ *need_bootstrap = true;
+ }
+
+ // read and decode
+ monmap_bl.clear();
+ int ret = get_version(version, monmap_bl);
+ ceph_assert(ret == 0);
+ ceph_assert(monmap_bl.length());
+
+ dout(10) << __func__ << " got " << version << dendl;
+ mon.monmap->decode(monmap_bl);
+
+ // a committed map now exists, so drop the ("mkfs", "monmap") key if it
+ // is still in the store
+ if (mon.store->exists("mkfs", "monmap")) {
+ auto t(std::make_shared<MonitorDBStore::Transaction>());
+ t->erase("mkfs", "monmap");
+ mon.store->apply_transaction(t);
+ }
+
+ check_subs();
+
+ // make sure we've recorded min_mon_release
+ // (rewritten whenever the meta key is unreadable, empty, or does not
+ // match this binary's ceph_release())
+ string val;
+ if (mon.store->read_meta("min_mon_release", &val) < 0 ||
+ val.size() == 0 ||
+ atoi(val.c_str()) != (int)ceph_release()) {
+ dout(10) << __func__ << " updating min_mon_release meta" << dendl;
+ mon.store->write_meta("min_mon_release",
+ stringify(ceph_release()));
+ }
+
+ mon.notify_new_monmap(true);
+}
+
+// Start a new pending map as a copy of the current monmap with the epoch
+// bumped by one, a fresh last_changed stamp and no removed ranks.
+void MonmapMonitor::create_pending()
+{
+ pending_map = *mon.monmap;
+ pending_map.epoch++;
+ pending_map.last_changed = ceph_clock_now();
+ pending_map.removed_ranks.clear();
+}
+
+// Encode the pending map into transaction `t`: the map is stored under its
+// epoch, last_committed is advanced to that epoch, and the map's health
+// checks are encoded alongside it.
+void MonmapMonitor::encode_pending(MonitorDBStore::TransactionRef t)
+{
+ dout(10) << __func__ << " epoch " << pending_map.epoch << dendl;
+
+ // the pending epoch must directly follow the current one
+ ceph_assert(mon.monmap->epoch + 1 == pending_map.epoch ||
+ pending_map.epoch == 1); // special case mkfs!
+ bufferlist bl;
+ // encode using the feature set reported for the quorum's connections
+ pending_map.encode(bl, mon.get_quorum_con_features());
+
+ put_version(t, pending_map.epoch, bl);
+ put_last_committed(t, pending_map.epoch);
+
+ // generate a cluster fingerprint, too?
+ if (pending_map.epoch == 1) {
+ mon.prepare_new_fingerprint(t);
+ }
+
+ //health
+ health_check_map_t next;
+ pending_map.check_health(&next);
+ encode_health(next, t);
+}
+
+// Completion context that retries MonmapMonitor::apply_mon_features() with
+// the captured feature set and release once it is fired.
+class C_ApplyFeatures : public Context {
+  MonmapMonitor *svc;
+  mon_feature_t features;
+  ceph_release_t min_mon_release;
+public:
+  C_ApplyFeatures(MonmapMonitor *s, const mon_feature_t& f, ceph_release_t mmr)
+    : svc(s),
+      features(f),
+      min_mon_release(mmr)
+  { }
+
+  void finish(int r) override {
+    if (r == -EAGAIN || r == -ECANCELED) {
+      // we are no longer part of the quorum that established these
+      // features in the first place, so drop them.
+      return;
+    }
+    if (r < 0) {
+      ceph_abort_msg("bad C_ApplyFeatures return value");
+    }
+    svc->apply_mon_features(features, min_mon_release);
+  }
+};
+
+// Fold the given quorum feature set and minimum release into the pending
+// map and propose it if that changes anything.
+//
+// If the service is not writeable the call is re-queued through a
+// C_ApplyFeatures context; if the quorum does not include every monitor in
+// the monmap nothing is done.
+void MonmapMonitor::apply_mon_features(const mon_feature_t& features,
+ ceph_release_t min_mon_release)
+{
+ if (!is_writeable()) {
+ dout(5) << __func__ << " wait for service to be writeable" << dendl;
+ wait_for_writeable_ctx(new C_ApplyFeatures(this, features, min_mon_release));
+ return;
+ }
+
+ // do nothing here unless we have a full quorum
+ if (mon.get_quorum().size() < mon.monmap->size()) {
+ return;
+ }
+
+ ceph_assert(is_writeable());
+ // the quorum's features must already cover everything the pending map
+ // has marked persistent
+ ceph_assert(features.contains_all(pending_map.persistent_features));
+ // we should never hit this because `features` should be the result
+ // of the quorum's supported features. But if it happens, die.
+ ceph_assert(ceph::features::mon::get_supported().contains_all(features));
+
+ // XOR: the bits on which the pending map's persistent features and the
+ // persistent subset of `features` differ
+ mon_feature_t new_features =
+ (pending_map.persistent_features ^
+ (features & ceph::features::mon::get_persistent()));
+
+ if (new_features.empty() &&
+ pending_map.min_mon_release == min_mon_release) {
+ dout(10) << __func__ << " min_mon_release (" << (int)min_mon_release
+ << ") and features (" << features << ") match" << dendl;
+ return;
+ }
+
+ if (!new_features.empty()) {
+ dout(1) << __func__ << " applying new features "
+ << new_features << ", had " << pending_map.persistent_features
+ << ", will have "
+ << (new_features | pending_map.persistent_features)
+ << dendl;
+ pending_map.persistent_features |= new_features;
+ }
+ // min_mon_release is only ever raised here, never lowered
+ if (min_mon_release > pending_map.min_mon_release) {
+ dout(1) << __func__ << " increasing min_mon_release to "
+ << to_integer<int>(min_mon_release) << " (" << min_mon_release
+ << ")" << dendl;
+ pending_map.min_mon_release = min_mon_release;
+ }
+
+ propose_pending();
+}
+
+// Called when the service becomes active: persist the "joined" marker the
+// first time around, log the monmap on the leader, then apply the quorum's
+// features and refresh pending metadata.
+void MonmapMonitor::on_active()
+{
+ if (get_last_committed() >= 1 && !mon.has_ever_joined) {
+ // make note of the fact that i was, once, part of the quorum.
+ dout(10) << "noting that i was, once, part of an active quorum." << dendl;
+
+ /* This is some form of nasty in-breeding we have between the MonmapMonitor
+ and the Monitor itself. We should find a way to get rid of it given our
+ new architecture. Until then, stick with it since we are a
+ single-threaded process and, truth be told, no one else relies on this
+ thing besides us.
+ */
+ auto t(std::make_shared<MonitorDBStore::Transaction>());
+ // store "joined" = 1 under the monitor's own prefix
+ t->put(Monitor::MONITOR_NAME, "joined", 1);
+ mon.store->apply_transaction(t);
+ mon.has_ever_joined = true;
+ }
+
+ if (mon.is_leader()) {
+ mon.clog->debug() << "monmap " << *mon.monmap;
+ }
+
+ apply_mon_features(mon.get_quorum_mon_features(),
+ mon.quorum_min_mon_release);
+
+ mon.update_pending_metadata();
+}
+
+// Read-side dispatch: MSG_MON_JOIN goes to preprocess_join(), and
+// MSG_MON_COMMAND to preprocess_command(). A malformed command argument
+// (bad_cmd_get) is answered with -EINVAL and counts as handled. Any other
+// message type aborts.
+bool MonmapMonitor::preprocess_query(MonOpRequestRef op)
+{
+  auto m = op->get_req<PaxosServiceMessage>();
+  const auto msg_type = m->get_type();
+
+  if (msg_type == MSG_MON_JOIN) {
+    return preprocess_join(op);
+  }
+  if (msg_type != MSG_MON_COMMAND) {
+    ceph_abort();
+    return true;
+  }
+
+  // READs
+  try {
+    return preprocess_command(op);
+  }
+  catch (const bad_cmd_get& e) {
+    bufferlist bl;
+    mon.reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
+  }
+  return true;
+}
+
+// Dump the committed version range, the current monmap and the members of
+// the current quorum to the formatter.
+void MonmapMonitor::dump_info(Formatter *f)
+{
+  f->dump_unsigned("monmap_first_committed", get_first_committed());
+  f->dump_unsigned("monmap_last_committed", get_last_committed());
+
+  f->open_object_section("monmap");
+  mon.monmap->dump(f);
+  f->close_section();
+
+  f->open_array_section("quorum");
+  for (const int rank : mon.get_quorum()) {
+    f->dump_int("mon", rank);
+  }
+  f->close_section();
+}
+
+/**
+ * Handle the read-only monmap commands: "mon stat", "mon getmap",
+ * "mon dump" and "mon feature ls".
+ *
+ * @param op the request carrying an MMonCommand
+ * @returns true when a reply (success or error) has been sent, false when
+ *          the command prefix is not one of those handled here
+ */
+bool MonmapMonitor::preprocess_command(MonOpRequestRef op)
+{
+  auto m = op->get_req<MMonCommand>();
+  // -1 means "not handled here"; every handled branch overwrites it
+  int r = -1;
+  bufferlist rdata;
+  stringstream ss;
+
+  cmdmap_t cmdmap;
+  if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
+    string rs = ss.str();
+    mon.reply_command(op, -EINVAL, rs, rdata, get_last_committed());
+    return true;
+  }
+
+  string prefix;
+  cmd_getval(cmdmap, "prefix", prefix);
+
+  MonSession *session = op->get_session();
+  if (!session) {
+    mon.reply_command(op, -EACCES, "access denied", get_last_committed());
+    return true;
+  }
+
+  string format = cmd_getval_or<string>(cmdmap, "format", "plain");
+  // the code below treats a null formatter as "use plain-text output"
+  boost::scoped_ptr<Formatter> f(Formatter::create(format));
+
+  if (prefix == "mon stat") {
+    if (f) {
+      f->open_object_section("monmap");
+      mon.monmap->dump_summary(f.get());
+      f->dump_string("leader", mon.get_leader_name());
+      f->open_array_section("quorum");
+      for (auto rank: mon.get_quorum()) {
+        std::string name = mon.monmap->get_name(rank);
+        f->open_object_section("mon");
+        f->dump_int("rank", rank);
+        f->dump_string("name", name);
+        f->close_section(); // mon
+      }
+      f->close_section(); // quorum
+      f->close_section(); // monmap
+      f->flush(ss);
+    } else {
+      mon.monmap->print_summary(ss);
+      ss << ", election epoch " << mon.get_epoch() << ", leader "
+         << mon.get_leader() << " " << mon.get_leader_name()
+         << ", quorum " << mon.get_quorum()
+         << " " << mon.get_quorum_names();
+    }
+
+    // the summary is the payload; the status line is left empty
+    rdata.append(ss);
+    ss.str("");
+    r = 0;
+
+  } else if (prefix == "mon getmap" ||
+             prefix == "mon dump") {
+
+    epoch_t epoch;
+    int64_t epochnum = cmd_getval_or<int64_t>(cmdmap, "epoch", 0);
+    epoch = epochnum;
+
+    // epoch 0 (the default) means the current map; otherwise load the
+    // requested historic map from the store
+    MonMap *p = mon.monmap;
+    // owns the historic map, if any, so that it is released on every exit
+    // path, including an exception thrown while decoding or encoding
+    unique_ptr<MonMap> historic;
+    if (epoch) {
+      bufferlist bl;
+      r = get_version(epoch, bl);
+      if (r == -ENOENT) {
+        ss << "there is no map for epoch " << epoch;
+        goto reply;
+      }
+      ceph_assert(r == 0);
+      ceph_assert(bl.length() > 0);
+      historic = std::make_unique<MonMap>();
+      historic->decode(bl);
+      p = historic.get();
+    }
+
+    ceph_assert(p);
+
+    if (prefix == "mon getmap") {
+      // binary map, encoded for the requesting connection's features
+      p->encode(rdata, m->get_connection()->get_features());
+      r = 0;
+      ss << "got monmap epoch " << p->get_epoch();
+    } else if (prefix == "mon dump") {
+      stringstream ds;
+      if (f) {
+        f->open_object_section("monmap");
+        p->dump(f.get());
+        f->open_array_section("quorum");
+        for (const int rank : mon.get_quorum()) {
+          f->dump_int("mon", rank);
+        }
+        f->close_section();
+        f->close_section();
+        f->flush(ds);
+        r = 0;
+      } else {
+        p->print(ds);
+        r = 0;
+      }
+      rdata.append(ds);
+      ss << "dumped monmap epoch " << p->get_epoch();
+    }
+
+  } else if (prefix == "mon feature ls") {
+
+    bool list_with_value = false;
+    cmd_getval_compat_cephbool(cmdmap, "with_value", list_with_value);
+
+    MonMap *p = mon.monmap;
+
+    // list features
+    mon_feature_t supported = ceph::features::mon::get_supported();
+    mon_feature_t persistent = ceph::features::mon::get_persistent();
+    mon_feature_t required = p->get_required_features();
+
+    stringstream ds;
+    // emit one feature set, either through the formatter (m_str is the
+    // section name) or as plain text into ds (m_str is unused)
+    auto print_feature = [&](mon_feature_t& m_features, const char* m_str) {
+      if (f) {
+        if (list_with_value)
+          m_features.dump_with_value(f.get(), m_str);
+        else
+          m_features.dump(f.get(), m_str);
+      } else {
+        if (list_with_value)
+          m_features.print_with_value(ds);
+        else
+          m_features.print(ds);
+      }
+    };
+
+    if (f) {
+      f->open_object_section("features");
+
+      f->open_object_section("all");
+      print_feature(supported, "supported");
+      print_feature(persistent, "persistent");
+      f->close_section(); // all
+
+      f->open_object_section("monmap");
+      print_feature(p->persistent_features, "persistent");
+      print_feature(p->optional_features, "optional");
+      print_feature(required, "required");
+      f->close_section(); // monmap
+
+      f->close_section(); // features
+      f->flush(ds);
+
+    } else {
+      ds << "all features" << std::endl
+         << "\tsupported: ";
+      print_feature(supported, nullptr);
+      ds << std::endl
+         << "\tpersistent: ";
+      print_feature(persistent, nullptr);
+      ds << std::endl
+         << std::endl;
+
+      ds << "on current monmap (epoch "
+         << p->get_epoch() << ")" << std::endl
+         << "\tpersistent: ";
+      print_feature(p->persistent_features, nullptr);
+      ds << std::endl
+         // omit optional features in plain-text
+         // makes it easier to read, and they're, currently, empty.
+         << "\trequired: ";
+      print_feature(required, nullptr);
+      ds << std::endl;
+    }
+    rdata.append(ds);
+    r = 0;
+  }
+
+reply:
+  if (r != -1) {
+    // only the first line of the status text is sent back
+    string rs;
+    getline(ss, rs);
+
+    mon.reply_command(op, r, rs, rdata, get_last_committed());
+    return true;
+  } else
+    return false;
+}
+
+
+// Write-side dispatch: MSG_MON_JOIN goes to prepare_join(), and
+// MSG_MON_COMMAND to prepare_command(). A malformed command argument
+// (bad_cmd_get) is answered with -EINVAL and yields false. Any other
+// message type aborts.
+bool MonmapMonitor::prepare_update(MonOpRequestRef op)
+{
+  auto m = op->get_req<PaxosServiceMessage>();
+  dout(7) << __func__ << " " << *m << " from " << m->get_orig_source_inst() << dendl;
+
+  const auto msg_type = m->get_type();
+  if (msg_type == MSG_MON_JOIN) {
+    return prepare_join(op);
+  }
+  if (msg_type != MSG_MON_COMMAND) {
+    ceph_abort();
+    return false;
+  }
+
+  try {
+    return prepare_command(op);
+  } catch (const bad_cmd_get& e) {
+    bufferlist bl;
+    mon.reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
+  }
+  return false;
+}
+
+bool MonmapMonitor::prepare_command(MonOpRequestRef op)
+{
+ auto m = op->get_req<MMonCommand>();
+ stringstream ss;
+ int err;
+ MonSession *session = nullptr;
+
+
+ /* We should follow the following rules:
+ *
+ * - 'monmap' is the current, consistent version of the monmap
+ * - 'pending_map' is the uncommitted version of the monmap
+ *
+ * All checks for the current state must be made against 'monmap'.
+ * All changes are made against 'pending_map'.
+ *
+ * If there are concurrent operations modifying 'pending_map', please
+ * follow the following rules.
+ *
+ * - if pending_map has already been changed, the second operation must
+ * wait for the proposal to finish and be run again; This is the easiest
+ * path to guarantee correctness but may impact performance (i.e., it
+ * will take longer for the user to get a reply).
+ *
+ * - if the result of the second operation can be guaranteed to be
+ * idempotent, the operation may reply to the user once the proposal
+ * finishes; still needs to wait for the proposal to finish.
+ *
+ * - An operation _NEVER_ returns to the user based on pending state.
+ *
+ * If an operation does not modify current stable monmap, it may be
+ * serialized before current pending map, regardless of any change that
+ * has been made to the pending map -- remember, pending is uncommitted
+ * state, thus we are not bound by it.
+ */
+
+ ceph_assert(mon.monmap);
+ MonMap &monmap = *mon.monmap;
+
+ cmdmap_t cmdmap;
+ string prefix;
+ if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+ cmd_getval(cmdmap, "prefix", prefix);
+
+ session = op->get_session();
+ if (!session) {
+ err = -EACCES;
+ goto reply_no_propose;
+ }
+
+ /* Please note:
+ *
+ * Adding or removing monitors may lead to loss of quorum.
+ *
+ * Because quorum may be lost, it's important to reply something
+ * to the user, lest she end up waiting forever for a reply. And
+ * no reply will ever be sent until quorum is formed again.
+ *
+ * On the other hand, this means we're leaking uncommitted state
+ * to the user. As such, please be mindful of the reply message.
+ *
+ * e.g., 'adding monitor mon.foo' is okay ('adding' is an on-going
+ * operation and conveys its not-yet-permanent nature); whereas
+ * 'added monitor mon.foo' presumes the action has successfully
+ * completed and state has been committed, which may not be true.
+ */
+
+
+ if (prefix == "mon add") {
+ string name;
+ cmd_getval(cmdmap, "name", name);
+ string addrstr;
+ cmd_getval(cmdmap, "addr", addrstr);
+ entity_addr_t addr;
+ bufferlist rdata;
+
+ if (!addr.parse(addrstr)) {
+ err = -EINVAL;
+ ss << "addr " << addrstr << "does not parse";
+ goto reply_no_propose;
+ }
+
+ vector<string> locationvec;
+ map<string, string> loc;
+ cmd_getval(cmdmap, "location", locationvec);
+ CrushWrapper::parse_loc_map(locationvec, &loc);
+ if (locationvec.size() &&
+ !mon.get_quorum_mon_features().contains_all(
+ ceph::features::mon::FEATURE_PINGING)) {
+ err = -ENOTSUP;
+ ss << "Not all monitors support adding monitors with a location; please upgrade first!";
+ goto reply_no_propose;
+ }
+ if (locationvec.size() && !loc.size()) {
+ ss << "We could not parse your input location to anything real; " << locationvec
+ << " turned into an empty map!";
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+
+ dout(10) << "mon add setting location for " << name << " to " << loc << dendl;
+
+ // TODO: validate location in crush map
+ if (monmap.stretch_mode_enabled && !loc.size()) {
+ ss << "We are in stretch mode and new monitors must have a location, but "
+ << "could not parse your input location to anything real; " << locationvec
+ << " turned into an empty map!";
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+ // TODO: validate location against any existing stretch config
+
+ entity_addrvec_t addrs;
+ if (monmap.persistent_features.contains_all(
+ ceph::features::mon::FEATURE_NAUTILUS)) {
+ if (addr.get_port() == CEPH_MON_PORT_IANA) {
+ addr.set_type(entity_addr_t::TYPE_MSGR2);
+ }
+ if (addr.get_port() == CEPH_MON_PORT_LEGACY) {
+ // if they specified the *old* default they probably don't care
+ addr.set_port(0);
+ }
+ if (addr.get_port()) {
+ addrs.v.push_back(addr);
+ } else {
+ addr.set_type(entity_addr_t::TYPE_MSGR2);
+ addr.set_port(CEPH_MON_PORT_IANA);
+ addrs.v.push_back(addr);
+ addr.set_type(entity_addr_t::TYPE_LEGACY);
+ addr.set_port(CEPH_MON_PORT_LEGACY);
+ addrs.v.push_back(addr);
+ }
+ } else {
+ if (addr.get_port() == 0) {
+ addr.set_port(CEPH_MON_PORT_LEGACY);
+ }
+ addr.set_type(entity_addr_t::TYPE_LEGACY);
+ addrs.v.push_back(addr);
+ }
+ dout(20) << __func__ << " addr " << addr << " -> addrs " << addrs << dendl;
+
+ /**
+ * If we have a monitor with the same name and different addr, then EEXIST
+ * If we have a monitor with the same addr and different name, then EEXIST
+ * If we have a monitor with the same addr and same name, then wait for
+ * the proposal to finish and return success.
+ * If we don't have the monitor, add it.
+ */
+
+ err = 0;
+ if (!ss.str().empty())
+ ss << "; ";
+
+ do {
+ if (monmap.contains(name)) {
+ if (monmap.get_addrs(name) == addrs) {
+ // stable map contains monitor with the same name at the same address.
+ // serialize before current pending map.
+ err = 0; // for clarity; this has already been set above.
+ ss << "mon." << name << " at " << addrs << " already exists";
+ goto reply_no_propose;
+ } else {
+ ss << "mon." << name
+ << " already exists at address " << monmap.get_addrs(name);
+ }
+ } else if (monmap.contains(addrs)) {
+ // we established on the previous branch that name is different
+ ss << "mon." << monmap.get_name(addrs)
+ << " already exists at address " << addr;
+ } else {
+ // go ahead and add
+ break;
+ }
+ err = -EEXIST;
+ goto reply_no_propose;
+ } while (false);
+
+ if (pending_map.stretch_mode_enabled) {
+
+ }
+
+ /* Given there's no delay between proposals on the MonmapMonitor (see
+ * MonmapMonitor::should_propose()), there is no point in checking for
+ * a mismatch between name and addr on pending_map.
+ *
+ * Once we established the monitor does not exist in the committed state,
+ * we can simply go ahead and add the monitor.
+ */
+
+ pending_map.add(name, addrs);
+ pending_map.mon_info[name].crush_loc = loc;
+ pending_map.last_changed = ceph_clock_now();
+ ss << "adding mon." << name << " at " << addrs;
+ dout(0) << __func__ << " proposing new mon." << name << dendl;
+
+ } else if (prefix == "mon remove" ||
+ prefix == "mon rm") {
+ string name;
+ cmd_getval(cmdmap, "name", name);
+ if (!monmap.contains(name)) {
+ err = 0;
+ ss << "mon." << name << " does not exist or has already been removed";
+ goto reply_no_propose;
+ }
+
+ if (monmap.size() == 1) {
+ err = -EINVAL;
+ ss << "error: refusing removal of last monitor " << name;
+ goto reply_no_propose;
+ }
+
+ if (pending_map.stretch_mode_enabled &&
+ name == pending_map.tiebreaker_mon) {
+ err = -EINVAL;
+ ss << "you cannot remove stretch mode's tiebreaker monitor";
+ goto reply_no_propose;
+ }
+ /* At the time of writing, there is no risk of races when multiple clients
+ * attempt to use the same name. The reason is simple but may not be
+ * obvious.
+ *
+ * In a nutshell, we do not collate proposals on the MonmapMonitor. As
+ * soon as we return 'true' below, PaxosService::dispatch() will check if
+ * the service should propose, and - if so - the service will be marked as
+ * 'proposing' and a proposal will be triggered. The PaxosService class
+ * guarantees that once a service is marked 'proposing' no further writes
+ * will be handled.
+ *
+ * The decision on whether the service should propose or not is, in this
+ * case, made by MonmapMonitor::should_propose(), which always considers
+ * the proposal delay being 0.0 seconds. This is key for PaxosService to
+ * trigger the proposal immediately.
+ * 0.0 seconds of delay.
+ *
+ * From the above, there's no point in performing further checks on the
+ * pending_map, as we don't ever have multiple proposals in-flight in
+ * this service. As we've established the committed state contains the
+ * monitor, we can simply go ahead and remove it.
+ *
+ * Please note that the code hinges on all of the above to be true. It
+ * has been true since time immemorial and we don't see a good reason
+ * to make it sturdier at this time - mainly because we don't think it's
+ * going to change any time soon, lest for any bug that may be unwillingly
+ * introduced.
+ */
+
+ entity_addrvec_t addrs = pending_map.get_addrs(name);
+ pending_map.remove(name);
+ pending_map.disallowed_leaders.erase(name);
+ pending_map.last_changed = ceph_clock_now();
+ } else if (prefix == "mon feature set") {
+
+ /* PLEASE NOTE:
+ *
+ * We currently only support setting/unsetting persistent features.
+ * This is by design, given at the moment we still don't have optional
+ * features, and, as such, there is no point introducing an interface
+ * to manipulate them. This allows us to provide a cleaner, more
+ * intuitive interface to the user, modifying solely persistent
+ * features.
+ *
+ * In the future we should consider adding another interface to handle
+ * optional features/flags; e.g., 'mon feature flag set/unset', or
+ * 'mon flag set/unset'.
+ */
+ string feature_name;
+ if (!cmd_getval(cmdmap, "feature_name", feature_name)) {
+ ss << "missing required feature name";
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+
+ mon_feature_t feature;
+ feature = ceph::features::mon::get_feature_by_name(feature_name);
+ if (feature == ceph::features::mon::FEATURE_NONE) {
+ ss << "unknown feature '" << feature_name << "'";
+ err = -ENOENT;
+ goto reply_no_propose;
+ }
+
+ bool sure = false;
+ cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
+ if (!sure) {
+ ss << "please specify '--yes-i-really-mean-it' if you "
+ << "really, **really** want to set feature '"
+ << feature << "' in the monmap.";
+ err = -EPERM;
+ goto reply_no_propose;
+ }
+
+ if (!mon.get_quorum_mon_features().contains_all(feature)) {
+ ss << "current quorum does not support feature '" << feature
+ << "'; supported features: "
+ << mon.get_quorum_mon_features();
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+
+ ss << "setting feature '" << feature << "'";
+
+ if (monmap.persistent_features.contains_all(feature)) {
+ err = 0;
+ ss << " feature '" << feature << "' already set on monmap";
+ goto reply_no_propose;
+ }
+
+ pending_map.persistent_features.set_feature(feature);
+ pending_map.last_changed = ceph_clock_now();
+
+ dout(1) << __func__ << " " << ss.str() << "; new features will be: "
+ << "persistent = " << pending_map.persistent_features
+ // output optional nevertheless, for auditing purposes.
+ << ", optional = " << pending_map.optional_features << dendl;
+
+ } else if (prefix == "mon set-rank") {
+ string name;
+ int64_t rank;
+ if (!cmd_getval(cmdmap, "name", name) ||
+ !cmd_getval(cmdmap, "rank", rank)) {
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+ int oldrank = pending_map.get_rank(name);
+ if (oldrank < 0) {
+ ss << "mon." << name << " does not exist in monmap";
+ err = -ENOENT;
+ goto reply_no_propose;
+ }
+ pending_map.set_rank(name, rank);
+ pending_map.last_changed = ceph_clock_now();
+ } else if (prefix == "mon set-addrs") {
+ string name;
+ string addrs;
+ if (!cmd_getval(cmdmap, "name", name) ||
+ !cmd_getval(cmdmap, "addrs", addrs)) {
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+ if (!pending_map.contains(name)) {
+ ss << "mon." << name << " does not exist";
+ err = -ENOENT;
+ goto reply_no_propose;
+ }
+ entity_addrvec_t av;
+ if (!av.parse(addrs.c_str(), nullptr)) {
+ ss << "failed to parse addrs '" << addrs << "'";
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+ for (auto& a : av.v) {
+ a.set_nonce(0);
+ if (!a.get_port()) {
+ ss << "monitor must bind to a non-zero port, not " << a;
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+ }
+ pending_map.set_addrvec(name, av);
+ pending_map.last_changed = ceph_clock_now();
+ } else if (prefix == "mon set-weight") {
+ string name;
+ int64_t weight;
+ if (!cmd_getval(cmdmap, "name", name) ||
+ !cmd_getval(cmdmap, "weight", weight)) {
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+ if (!pending_map.contains(name)) {
+ ss << "mon." << name << " does not exist";
+ err = -ENOENT;
+ goto reply_no_propose;
+ }
+ pending_map.set_weight(name, weight);
+ pending_map.last_changed = ceph_clock_now();
+ } else if (prefix == "mon enable-msgr2") {
+ if (!monmap.get_required_features().contains_all(
+ ceph::features::mon::FEATURE_NAUTILUS)) {
+ err = -EACCES;
+ ss << "all monitors must be running nautilus to enable v2";
+ goto reply_no_propose;
+ }
+ err = -EALREADY;
+ for (auto& i : pending_map.mon_info) {
+ if (i.second.public_addrs.v.size() == 1 &&
+ i.second.public_addrs.front().is_legacy() &&
+ i.second.public_addrs.front().get_port() == CEPH_MON_PORT_LEGACY) {
+ entity_addrvec_t av;
+ entity_addr_t a = i.second.public_addrs.front();
+ a.set_type(entity_addr_t::TYPE_MSGR2);
+ a.set_port(CEPH_MON_PORT_IANA);
+ av.v.push_back(a);
+ av.v.push_back(i.second.public_addrs.front());
+ dout(10) << " setting mon." << i.first
+ << " addrs " << i.second.public_addrs
+ << " -> " << av << dendl;
+ pending_map.set_addrvec(i.first, av);
+ pending_map.last_changed = ceph_clock_now();
+ err = 0;
+ }
+ }
+ if (err == -EALREADY) {
+ err = 0;
+ ss << "all monitors have already enabled msrg2";
+ goto reply_no_propose;
+ }
+ } else if (prefix == "mon set election_strategy") {
+ if (!mon.get_quorum_mon_features().contains_all(
+ ceph::features::mon::FEATURE_PINGING)) {
+ err = -ENOTSUP;
+ ss << "Not all monitors support changing election strategies; please upgrade first!";
+ goto reply_no_propose;
+ }
+ string strat;
+ MonMap::election_strategy strategy;
+ if (!cmd_getval(cmdmap, "strategy", strat)) {
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+ if (strat == "classic") {
+ strategy = MonMap::CLASSIC;
+ } else if (strat == "disallow") {
+ strategy = MonMap::DISALLOW;
+ } else if (strat == "connectivity") {
+ strategy = MonMap::CONNECTIVITY;
+ } else {
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+ if (strategy == pending_map.strategy) {
+ err = 0;
+ goto reply_no_propose;
+ }
+ pending_map.strategy = strategy;
+ pending_map.last_changed = ceph_clock_now();
+ } else if (prefix == "mon add disallowed_leader") {
+ if (!mon.get_quorum_mon_features().contains_all(
+ ceph::features::mon::FEATURE_PINGING)) {
+ err = -ENOTSUP;
+ ss << "Not all monitors support changing election strategies; please upgrade first!";
+ goto reply_no_propose;
+ }
+ string name;
+ if (!cmd_getval(cmdmap, "name", name)) {
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+ if (pending_map.strategy != MonMap::DISALLOW &&
+ pending_map.strategy != MonMap::CONNECTIVITY) {
+ ss << "You cannot disallow monitors in your current election mode";
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+ if (!pending_map.contains(name)) {
+ ss << "mon." << name << " does not exist";
+ err = -ENOENT;
+ goto reply_no_propose;
+ }
+ if (pending_map.disallowed_leaders.count(name)) {
+ ss << "mon." << name << " is already disallowed";
+ err = 0;
+ goto reply_no_propose;
+ }
+ if (pending_map.disallowed_leaders.size() == pending_map.size() - 1) {
+ ss << "mon." << name << " is the only remaining allowed leader!";
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+ pending_map.disallowed_leaders.insert(name);
+ pending_map.last_changed = ceph_clock_now();
+ } else if (prefix == "mon rm disallowed_leader") {
+ if (!mon.get_quorum_mon_features().contains_all(
+ ceph::features::mon::FEATURE_PINGING)) {
+ err = -ENOTSUP;
+ ss << "Not all monitors support changing election strategies; please upgrade first!";
+ goto reply_no_propose;
+ }
+ string name;
+ if (!cmd_getval(cmdmap, "name", name)) {
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+ if (pending_map.strategy != MonMap::DISALLOW &&
+ pending_map.strategy != MonMap::CONNECTIVITY) {
+ ss << "You cannot disallow monitors in your current election mode";
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+ if (!pending_map.contains(name)) {
+ ss << "mon." << name << " does not exist";
+ err = -ENOENT;
+ goto reply_no_propose;
+ }
+ if (!pending_map.disallowed_leaders.count(name)) {
+ ss << "mon." << name << " is already allowed";
+ err = 0;
+ goto reply_no_propose;
+ }
+ pending_map.disallowed_leaders.erase(name);
+ pending_map.last_changed = ceph_clock_now();
+ } else if (prefix == "mon set_location") {
+ if (!mon.get_quorum_mon_features().contains_all(
+ ceph::features::mon::FEATURE_PINGING)) {
+ err = -ENOTSUP;
+ ss << "Not all monitors support monitor locations; please upgrade first!";
+ goto reply_no_propose;
+ }
+ string name;
+ if (!cmd_getval(cmdmap, "name", name)) {
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+ if (!pending_map.contains(name)) {
+ ss << "mon." << name << " does not exist";
+ err = -ENOENT;
+ goto reply_no_propose;
+ }
+
+ vector<string> argvec;
+ map<string, string> loc;
+ cmd_getval(cmdmap, "args", argvec);
+ CrushWrapper::parse_loc_map(argvec, &loc);
+
+ dout(10) << "mon set_location for " << name << " to " << loc << dendl;
+
+ // TODO: validate location in crush map
+ if (!loc.size()) {
+ ss << "We could not parse your input location to anything real; " << argvec
+ << " turned into an empty map!";
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+ // TODO: validate location against any existing stretch config
+ pending_map.mon_info[name].crush_loc = loc;
+ pending_map.last_changed = ceph_clock_now();
+ } else if (prefix == "mon set_new_tiebreaker") {
+ if (!pending_map.stretch_mode_enabled) {
+ err = -EINVAL;
+ ss << "Stretch mode is not enabled, so there is no tiebreaker";
+ goto reply_no_propose;
+ }
+ string name;
+ if (!cmd_getval(cmdmap, "name", name)) {
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+ bool sure = false;
+ cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
+
+ const auto &existing_tiebreaker_info_i = pending_map.mon_info.find(pending_map.tiebreaker_mon);
+ const auto &new_tiebreaker_info_i = pending_map.mon_info.find(name);
+ if (new_tiebreaker_info_i == pending_map.mon_info.end()) {
+ ss << "mon." << name << " does not exist";
+ err = -ENOENT;
+ goto reply_no_propose;
+ }
+ const auto& new_info = new_tiebreaker_info_i->second;
+ if (new_info.crush_loc.empty()) {
+ ss << "mon." << name << " does not have a location specified";
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+
+ if (!mon.osdmon()->is_readable()) {
+ dout(10) << __func__
+ << ": waiting for osdmon readable to inspect crush barrier"
+ << dendl;
+ mon.osdmon()->wait_for_readable(op, new Monitor::C_RetryMessage(&mon, op));
+ return false; /* do not propose, yet */
+ }
+ int32_t stretch_divider_id = mon.osdmon()->osdmap.stretch_mode_bucket;
+ string stretch_bucket_divider = mon.osdmon()->osdmap.crush->
+ get_type_name(stretch_divider_id);
+
+ const auto& new_loc_i = new_info.crush_loc.find(stretch_bucket_divider);
+ if (new_loc_i == new_info.crush_loc.end()) {
+ ss << "mon." << name << " has a specificed location, but not a "
+ << stretch_bucket_divider << ", which is the stretch divider";
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+ const string& new_loc = new_loc_i->second;
+ set<string> matching_mons;
+ for (const auto& mii : pending_map.mon_info) {
+ const auto& other_loc_i = mii.second.crush_loc.find(stretch_bucket_divider);
+ if (mii.first == name) {
+ continue;
+ }
+ if (other_loc_i == mii.second.crush_loc.end()) { // huh
+ continue;
+ }
+ const string& other_loc = other_loc_i->second;
+ if (other_loc == new_loc &&
+ mii.first != existing_tiebreaker_info_i->first) {
+ matching_mons.insert(mii.first);
+ }
+ }
+ if (!matching_mons.empty()) {
+ ss << "mon." << name << " has location " << new_loc_i->second
+ << ", which matches mons " << matching_mons << " on the "
+ << stretch_bucket_divider << " dividing bucket for stretch mode. "
+ "Pass --yes-i-really-mean-it if you're sure you want to do this."
+ "(You really don't.)";
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+ pending_map.tiebreaker_mon = name;
+ pending_map.disallowed_leaders.insert(name);
+ pending_map.last_changed = ceph_clock_now();
+ } else if (prefix == "mon enable_stretch_mode") {
+ if (!mon.osdmon()->is_writeable()) {
+ dout(10) << __func__
+ << ": waiting for osdmon writeable for stretch mode" << dendl;
+ mon.osdmon()->wait_for_writeable(op, new Monitor::C_RetryMessage(&mon, op));
+ return false; /* do not propose, yet */
+ }
+ {
+ if (monmap.stretch_mode_enabled) {
+ ss << "stretch mode is already engaged";
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+ if (pending_map.stretch_mode_enabled) {
+ ss << "stretch mode currently committing";
+ err = 0;
+ goto reply_no_propose;
+ }
+ string tiebreaker_mon;
+ if (!cmd_getval(cmdmap, "tiebreaker_mon", tiebreaker_mon)) {
+ ss << "must specify a tiebreaker monitor";
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+ string new_crush_rule;
+ if (!cmd_getval(cmdmap, "new_crush_rule", new_crush_rule)) {
+ ss << "must specify a new crush rule that spreads out copies over multiple sites";
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+ string dividing_bucket;
+ if (!cmd_getval(cmdmap, "dividing_bucket", dividing_bucket)) {
+ ss << "must specify a dividing bucket";
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+ //okay, initial arguments make sense, check pools and cluster state
+ err = mon.osdmon()->check_cluster_features(CEPH_FEATUREMASK_STRETCH_MODE, ss);
+ if (err)
+ goto reply_no_propose;
+ struct Plugger {
+ Paxos &p;
+ Plugger(Paxos &p) : p(p) { p.plug(); }
+ ~Plugger() { p.unplug(); }
+ } plugger(paxos);
+
+ set<pg_pool_t*> pools;
+ bool okay = false;
+ int errcode = 0;
+
+ mon.osdmon()->try_enable_stretch_mode_pools(ss, &okay, &errcode,
+ &pools, new_crush_rule);
+ if (!okay) {
+ err = errcode;
+ goto reply_no_propose;
+ }
+ try_enable_stretch_mode(ss, &okay, &errcode, false,
+ tiebreaker_mon, dividing_bucket);
+ if (!okay) {
+ err = errcode;
+ goto reply_no_propose;
+ }
+ mon.osdmon()->try_enable_stretch_mode(ss, &okay, &errcode, false,
+ dividing_bucket, 2, pools, new_crush_rule);
+ if (!okay) {
+ err = errcode;
+ goto reply_no_propose;
+ }
+ // everything looks good, actually commit the changes!
+ try_enable_stretch_mode(ss, &okay, &errcode, true,
+ tiebreaker_mon, dividing_bucket);
+ mon.osdmon()->try_enable_stretch_mode(ss, &okay, &errcode, true,
+ dividing_bucket,
+ 2, // right now we only support 2 sites
+ pools, new_crush_rule);
+ ceph_assert(okay == true);
+ }
+ request_proposal(mon.osdmon());
+ } else {
+ ss << "unknown command " << prefix;
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+
+ err = 0;
+ goto reply_propose;
+
+reply_no_propose:
+ {
+ string rs;
+ getline(ss, rs);
+ if (err < 0 && rs.size() == 0)
+ rs = cpp_strerror(err);
+ mon.reply_command(op, err, rs, get_last_committed());
+ return false;
+ }
+
+reply_propose:
+ {
+ string rs;
+ getline(ss, rs);
+ wait_for_commit(op, new Monitor::C_Command(mon, op, err, rs, get_last_committed() + 1));
+ return true;
+ }
+}
+
+ /**
+  * Validate the monitor-side preconditions for stretch mode and, when
+  * commit is true, record the result in pending_map.
+  *
+  * Checks performed, in order: the pending election strategy is
+  * CONNECTIVITY; tiebreaker_mon exists in pending_map; every monitor has a
+  * crush_loc entry for dividing_bucket; the non-tiebreaker monitors span
+  * exactly two distinct values of that entry; and the tiebreaker's value
+  * matches neither of them.
+  *
+  * Every failure path asserts !commit, so a commit=true call is only valid
+  * after an equivalent commit=false call has succeeded.
+  *
+  * @param ss              receives a human-readable reason on failure
+  * @param okay            set to false on entry and to true only on success
+  * @param errcode         set to a negative errno on failure; not written on success
+  * @param commit          true to apply the change to pending_map, false to validate only
+  * @param tiebreaker_mon  name of the monitor to declare tiebreaker
+  * @param dividing_bucket crush_loc key (bucket type) that divides the cluster
+  */
+ void MonmapMonitor::try_enable_stretch_mode(stringstream& ss, bool *okay,
+                                             int *errcode, bool commit,
+                                             const string& tiebreaker_mon,
+                                             const string& dividing_bucket)
+ {
+   dout(20) << __func__ << dendl;
+   *okay = false;
+   if (pending_map.strategy != MonMap::CONNECTIVITY) {
+     ss << "Monitors must use the connectivity strategy to enable stretch mode";
+     *errcode = -EINVAL;
+     ceph_assert(!commit);
+     return;
+   }
+   if (!pending_map.contains(tiebreaker_mon)) {
+     // leading space before "does" keeps the monitor name readable
+     ss << "mon " << tiebreaker_mon << " does not seem to exist";
+     *errcode = -ENOENT;
+     ceph_assert(!commit);
+     return;
+   }
+   // monitor name -> value of its dividing_bucket location entry
+   // NOTE(review): locations are read from mon.monmap while the existence
+   // check above uses pending_map -- confirm this asymmetry is intended.
+   map<string,string> buckets;
+   for (const auto&mii : mon.monmap->mon_info) {
+     const auto& mi = mii.second;
+     const auto& bi = mi.crush_loc.find(dividing_bucket);
+     if (bi == mi.crush_loc.end()) {
+       ss << "Could not find location entry for " << dividing_bucket
+          << " on monitor " << mi.name;
+       *errcode = -EINVAL;
+       ceph_assert(!commit);
+       return;
+     }
+     buckets[mii.first] = bi->second;
+   }
+   string bucket1, bucket2, tiebreaker_bucket;
+   for (auto& i : buckets) {
+     if (i.first == tiebreaker_mon) {
+       tiebreaker_bucket = i.second;
+       continue;
+     }
+     if (bucket1.empty()) {
+       bucket1 = i.second;
+     }
+     if (bucket1 != i.second &&
+         bucket2.empty()) {
+       bucket2 = i.second;
+     }
+     if (bucket1 != i.second &&
+         bucket2 != i.second) {
+       ss << "There are too many monitor buckets for stretch mode, found "
+          << bucket1 << "," << bucket2 << "," << i.second;
+       *errcode = -EINVAL;
+       ceph_assert(!commit);
+       return;
+     }
+   }
+   if (bucket1.empty() || bucket2.empty()) {
+     // bucket2 is only ever filled after bucket1, so bucket1 is the only
+     // bucket that can have been found when we get here; report it (or
+     // "none") instead of the bucket that is known to be empty.
+     ss << "There are not enough monitor buckets for stretch mode;"
+        << " must have at least 2 plus the tiebreaker but only found "
+        << (bucket1.empty() ? string("none") : bucket1);
+     *errcode = -EINVAL;
+     ceph_assert(!commit);
+     return;
+   }
+   if (tiebreaker_bucket == bucket1 ||
+       tiebreaker_bucket == bucket2) {
+     ss << "The named tiebreaker monitor " << tiebreaker_mon
+        << " is in the same CRUSH bucket " << tiebreaker_bucket
+        << " as other monitors";
+     *errcode = -EINVAL;
+     ceph_assert(!commit);
+     return;
+   }
+   if (commit) {
+     // the tiebreaker is also added to the disallowed-leader set
+     pending_map.disallowed_leaders.insert(tiebreaker_mon);
+     pending_map.tiebreaker_mon = tiebreaker_mon;
+     pending_map.stretch_mode_enabled = true;
+   }
+   *okay = true;
+ }
+
+ // Add every name in dead_mons to pending_map.stretch_marked_down_mons and
+ // immediately propose the pending map.
+ void MonmapMonitor::trigger_degraded_stretch_mode(const set<string>& dead_mons)
+ {
+ dout(20) << __func__ << dendl;
+ pending_map.stretch_marked_down_mons.insert(dead_mons.begin(), dead_mons.end());
+ propose_pending();
+ }
+
+ // Empty pending_map.stretch_marked_down_mons and immediately propose the
+ // pending map.
+ void MonmapMonitor::trigger_healthy_stretch_mode()
+ {
+ dout(20) << __func__ << dendl;
+ pending_map.stretch_marked_down_mons.clear();
+ propose_pending();
+ }
+
+ // Screen an MMonJoin request against pending_map.
+ //
+ // Returns true when nothing needs to change: the sender lacks mon w+x
+ // caps, the name or address is already registered with a compatible
+ // crush location, or stretch mode is on and no crush location is
+ // available from either the message or the existing entry.
+ // Returns false otherwise.
+ bool MonmapMonitor::preprocess_join(MonOpRequestRef op)
+ {
+   auto join = op->get_req<MMonJoin>();
+   dout(10) << __func__ << " " << join->name << " at " << join->addrs << dendl;
+
+   MonSession *session = op->get_session();
+   const bool capable =
+     session && session->is_capable("mon", MON_CAP_W | MON_CAP_X);
+   if (!capable) {
+     dout(10) << " insufficient caps" << dendl;
+     return true;
+   }
+
+   // 1) is the name already present with a real (non-blank) address?
+   const auto by_name = pending_map.mon_info.find(join->name);
+   if (by_name != pending_map.mon_info.end()) {
+     const auto& known = by_name->second;
+     if (!known.public_addrs.front().is_blank_ip() &&
+         (!join->force_loc || join->crush_loc == known.crush_loc)) {
+       dout(10) << " already have " << join->name << dendl;
+       return true;
+     }
+   }
+
+   // 2) is the address already present under the same name?
+   string addr_name;
+   if (pending_map.contains(join->addrs)) {
+     addr_name = pending_map.get_name(join->addrs);
+   }
+   const bool addr_known = !addr_name.empty();
+   if (addr_known && addr_name == join->name) {
+     const bool loc_compatible =
+       !join->force_loc ||
+       join->crush_loc.empty() ||
+       pending_map.mon_info[addr_name].crush_loc == join->crush_loc;
+     if (loc_compatible) {
+       dout(10) << " already have " << join->addrs << dendl;
+       return true;
+     }
+   }
+
+   // 3) stretch mode needs a crush location from somewhere
+   if (pending_map.stretch_mode_enabled && join->crush_loc.empty()) {
+     const bool no_stored_loc =
+       !addr_known || pending_map.mon_info[addr_name].crush_loc.empty();
+     if (no_stored_loc) {
+       dout(10) << "stretch mode engaged but no source of crush_loc" << dendl;
+       mon.clog->info() << join->name << " attempted to join from " << join->name
+                        << ' ' << join->addrs
+                        << "; but lacks a crush_location for stretch mode";
+       return true;
+     }
+   }
+   return false;
+ }
+
+ // Apply an MMonJoin request to pending_map: drop any entry that already
+ // owns the joining address or name, add the new (name, addrs) entry, and
+ // pick its crush location. The location from the message wins when
+ // force_loc is set or when the address had no previous location;
+ // otherwise the location of the entry that previously owned the address
+ // is carried over. Always returns true.
+ bool MonmapMonitor::prepare_join(MonOpRequestRef op)
+ {
+   auto join = op->get_req<MMonJoin>();
+   dout(0) << "adding/updating " << join->name
+           << " at " << join->addrs << " to monitor cluster" << dendl;
+
+   map<string,string> existing_loc;
+   if (pending_map.contains(join->addrs)) {
+     const string previous_owner = pending_map.get_name(join->addrs);
+     existing_loc = pending_map.mon_info[previous_owner].crush_loc;
+     pending_map.remove(previous_owner);
+   }
+   if (pending_map.contains(join->name)) {
+     pending_map.remove(join->name);
+   }
+   pending_map.add(join->name, join->addrs);
+
+   auto& joined = pending_map.mon_info[join->name];
+   if (join->force_loc || existing_loc.empty()) {
+     joined.crush_loc = join->crush_loc;
+   } else {
+     joined.crush_loc = existing_loc;
+   }
+   pending_map.last_changed = ceph_clock_now();
+   return true;
+ }
+
+ // Always asks for a proposal with no delay: writes 0.0 into delay and
+ // returns true.
+ bool MonmapMonitor::should_propose(double& delay)
+ {
+ delay = 0.0;
+ return true;
+ }
+
+ // Read the last committed monmap version from the store into bl.
+ // Returns 0 on success, -ENOENT when the store has no entry for that
+ // version, or the negative error returned by get_version().
+ int MonmapMonitor::get_monmap(bufferlist &bl)
+ {
+   const version_t latest_ver = get_last_committed();
+   dout(10) << __func__ << " ver " << latest_ver << dendl;
+
+   const bool stored =
+     mon.store->exists(get_service_name(), stringify(latest_ver));
+   if (!stored) {
+     return -ENOENT;
+   }
+
+   const int r = get_version(latest_ver, bl);
+   if (r >= 0) {
+     return 0;
+   }
+   dout(1) << __func__ << " error obtaining monmap: "
+           << cpp_strerror(r) << dendl;
+   return r;
+ }
+
+ // Run check_sub() on every subscription registered under the "monmap"
+ // type in the monitor's session map; does nothing if there are none.
+ // NOTE(review): check_sub() can call session_map.remove_sub() for one-time
+ // subscriptions, via its own with_session_map() call, while this loop is
+ // still iterating the same list inside with_session_map() -- confirm that
+ // both the removal during iteration and the nested call are safe.
+ void MonmapMonitor::check_subs()
+ {
+ const string type = "monmap";
+ mon.with_session_map([this, &type](const MonSessionMap& session_map) {
+ auto subs = session_map.subs.find(type);
+ if (subs == session_map.subs.end())
+ return;
+ for (auto sub : *subs->second) {
+ check_sub(sub);
+ }
+ });
+ }
+
+ // If the subscriber is waiting for an epoch we already have
+ // (sub->next <= current monmap epoch), send it the latest monmap. A
+ // one-time subscription is then removed from the session map; a
+ // persistent one is advanced to wait for the next epoch.
+ void MonmapMonitor::check_sub(Subscription *sub)
+ {
+   const auto epoch = mon.monmap->get_epoch();
+   dout(10) << __func__
+            << " monmap next " << sub->next
+            << " have " << epoch << dendl;
+
+   if (sub->next > epoch) {
+     // nothing new for this subscriber yet
+     return;
+   }
+
+   mon.send_latest_monmap(sub->session->con.get());
+   if (!sub->onetime) {
+     sub->next = epoch + 1;
+     return;
+   }
+   mon.with_session_map([sub](MonSessionMap& session_map) {
+     session_map.remove_sub(sub);
+   });
+ }
+
+ // Periodic work, performed only when this service is active and this
+ // monitor is the leader: if the committed monmap has an empty "created"
+ // stamp, fill it in. The stamp is taken from the first readable committed
+ // monmap version whose last_changed is non-zero, falling back to the
+ // current time, and the change is proposed immediately.
+ void MonmapMonitor::tick()
+ {
+   const bool in_charge = is_active() && mon.is_leader();
+   if (!in_charge) {
+     return;
+   }
+   if (!mon.monmap->created.is_zero()) {
+     return;
+   }
+
+   dout(10) << __func__ << " detected empty created stamp" << dendl;
+   utime_t ctime;
+   for (version_t v = 1; v <= get_last_committed(); ++v) {
+     bufferlist bl;
+     if (get_version(v, bl) < 0) {
+       // version not readable; try the next one
+       continue;
+     }
+     MonMap m;
+     auto p = bl.cbegin();
+     decode(m, p);
+     if (m.last_changed.is_zero()) {
+       continue;
+     }
+     dout(10) << __func__ << " first monmap with last_changed is "
+              << v << " with " << m.last_changed << dendl;
+     ctime = m.last_changed;
+     break;
+   }
+   if (ctime.is_zero()) {
+     ctime = ceph_clock_now();
+   }
+   dout(10) << __func__ << " updating created stamp to " << ctime << dendl;
+   pending_map.created = ctime;
+   propose_pending();
+ }
diff --git a/src/mon/MonmapMonitor.h b/src/mon/MonmapMonitor.h
new file mode 100644
index 000000000..dc75e9620
--- /dev/null
+++ b/src/mon/MonmapMonitor.h
@@ -0,0 +1,111 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2009 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+/*
+ * The Monmap Monitor is used to track the monitors in the cluster.
+ */
+
+#ifndef CEPH_MONMAPMONITOR_H
+#define CEPH_MONMAPMONITOR_H
+
+#include <map>
+#include <set>
+
+#include "include/types.h"
+#include "msg/Messenger.h"
+
+#include "PaxosService.h"
+#include "MonMap.h"
+#include "MonitorDBStore.h"
+
+ // PaxosService specialization that tracks the monitors in the cluster.
+ // Changes are staged in pending_map.
+ class MonmapMonitor : public PaxosService {
+ public:
+ MonmapMonitor(Monitor &mn, Paxos &p, const std::string& service_name)
+ : PaxosService(mn, p, service_name)
+ {
+ }
+ MonMap pending_map; //the pending map awaiting passage
+
+ void create_initial() override;
+
+ void update_from_paxos(bool *need_bootstrap) override;
+
+ void create_pending() override;
+
+ void encode_pending(MonitorDBStore::TransactionRef t) override;
+ // we always encode the full map; we have no use for full versions
+ void encode_full(MonitorDBStore::TransactionRef t) override { }
+
+ void on_active() override;
+ void apply_mon_features(const mon_feature_t& features,
+ ceph_release_t min_mon_release);
+
+ void dump_info(ceph::Formatter *f);
+
+ bool preprocess_query(MonOpRequestRef op) override;
+ bool prepare_update(MonOpRequestRef op) override;
+
+ // MMonJoin handling: preprocess_join() screens the request against
+ // pending_map, prepare_join() applies it to pending_map.
+ bool preprocess_join(MonOpRequestRef op);
+ bool prepare_join(MonOpRequestRef op);
+
+ bool preprocess_command(MonOpRequestRef op);
+ bool prepare_command(MonOpRequestRef op);
+
+ // Read the last committed monmap version into bl; returns 0 on success
+ // or a negative errno (-ENOENT if the store has no such version).
+ int get_monmap(ceph::buffer::list &bl);
+
+ /*
+ * Since monitors are pretty important, this implementation always
+ * sets the delay to 0.0 and returns true.
+ */
+ bool should_propose(double& delay) override;
+
+ // Send the latest monmap to a subscriber whose requested epoch is
+ // already available.
+ void check_sub(Subscription *sub);
+
+ void tick() override;
+
+private:
+ // Run check_sub() on every "monmap" subscription.
+ void check_subs();
+ ceph::buffer::list monmap_bl;
+ /**
+ * Check validity of inputs and monitor state to
+ * engage stretch mode. Designed to be used with
+ * OSDMonitor::try_enable_stretch_mode() where we call both twice,
+ * first with commit=false to validate.
+ * @param ss: a stringstream to write errors into
+ * @param okay: Filled to true if okay, false if validation fails
+ * @param errcode: filled with -errno if there's a problem
+ * @param commit: true if we should commit the change, false if just testing
+ * @param tiebreaker_mon: the name of the monitor to declare tiebreaker
+ * @param dividing_bucket: the bucket type (eg 'dc') that divides the cluster
+ */
+ void try_enable_stretch_mode(std::stringstream& ss, bool *okay,
+ int *errcode, bool commit,
+ const std::string& tiebreaker_mon,
+ const std::string& dividing_bucket);
+
+public:
+ /**
+ * Set us to degraded stretch mode. Put the dead_mons in
+ * the MonMap.
+ */
+ void trigger_degraded_stretch_mode(const std::set<std::string>& dead_mons);
+ /**
+ * Set us to healthy stretch mode: clear out the
+ * down list to allow any non-tiebreaker mon to be the leader again.
+ */
+ void trigger_healthy_stretch_mode();
+ };
+
+
+#endif
diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc
new file mode 100644
index 000000000..a1593fe14
--- /dev/null
+++ b/src/mon/OSDMonitor.cc
@@ -0,0 +1,15036 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
+ * Copyright (C) 2014 Red Hat <contact@redhat.com>
+ *
+ * Author: Loic Dachary <loic@dachary.org>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <algorithm>
+#include <boost/algorithm/string.hpp>
+#include <experimental/iterator>
+#include <locale>
+#include <sstream>
+
+#include "mon/OSDMonitor.h"
+#include "mon/Monitor.h"
+#include "mon/MDSMonitor.h"
+#include "mon/MgrStatMonitor.h"
+#include "mon/AuthMonitor.h"
+#include "mon/KVMonitor.h"
+
+#include "mon/MonitorDBStore.h"
+#include "mon/Session.h"
+
+#include "crush/CrushWrapper.h"
+#include "crush/CrushTester.h"
+#include "crush/CrushTreeDumper.h"
+
+#include "messages/MOSDBeacon.h"
+#include "messages/MOSDFailure.h"
+#include "messages/MOSDMarkMeDown.h"
+#include "messages/MOSDMarkMeDead.h"
+#include "messages/MOSDFull.h"
+#include "messages/MOSDMap.h"
+#include "messages/MMonGetOSDMap.h"
+#include "messages/MOSDBoot.h"
+#include "messages/MOSDAlive.h"
+#include "messages/MPoolOp.h"
+#include "messages/MPoolOpReply.h"
+#include "messages/MOSDPGCreate2.h"
+#include "messages/MOSDPGCreated.h"
+#include "messages/MOSDPGTemp.h"
+#include "messages/MOSDPGReadyToMerge.h"
+#include "messages/MMonCommand.h"
+#include "messages/MRemoveSnaps.h"
+#include "messages/MRoute.h"
+#include "messages/MMonGetPurgedSnaps.h"
+#include "messages/MMonGetPurgedSnapsReply.h"
+
+#include "common/TextTable.h"
+#include "common/Timer.h"
+#include "common/ceph_argparse.h"
+#include "common/perf_counters.h"
+#include "common/PriorityCache.h"
+#include "common/strtol.h"
+#include "common/numa.h"
+
+#include "common/config.h"
+#include "common/errno.h"
+
+#include "erasure-code/ErasureCodePlugin.h"
+#include "compressor/Compressor.h"
+#include "common/Checksummer.h"
+
+#include "include/compat.h"
+#include "include/ceph_assert.h"
+#include "include/stringify.h"
+#include "include/util.h"
+#include "common/cmdparse.h"
+#include "include/str_list.h"
+#include "include/str_map.h"
+#include "include/scope_guard.h"
+#include "perfglue/heap_profiler.h"
+
+#include "auth/cephx/CephxKeyServer.h"
+#include "osd/OSDCap.h"
+
+#include "json_spirit/json_spirit_reader.h"
+
+#include <boost/algorithm/string/predicate.hpp>
+
+using std::dec;
+using std::hex;
+using std::list;
+using std::map;
+using std::make_pair;
+using std::ostringstream;
+using std::pair;
+using std::set;
+using std::string;
+using std::stringstream;
+using std::to_string;
+using std::vector;
+
+using ceph::bufferlist;
+using ceph::decode;
+using ceph::encode;
+using ceph::ErasureCodeInterfaceRef;
+using ceph::ErasureCodePluginRegistry;
+using ceph::ErasureCodeProfile;
+using ceph::Formatter;
+using ceph::JSONFormatter;
+using ceph::make_message;
+
+#define dout_subsys ceph_subsys_mon
+static const string OSD_PG_CREATING_PREFIX("osd_pg_creating");
+static const string OSD_METADATA_PREFIX("osd_metadata");
+static const string OSD_SNAP_PREFIX("osd_snap");
+
+/*
+
+ OSD snapshot metadata
+ ---------------------
+
+ -- starting with mimic, removed in octopus --
+
+ "removed_epoch_%llu_%08lx" % (pool, epoch)
+ -> interval_set<snapid_t>
+
+ "removed_snap_%llu_%016llx" % (pool, last_snap)
+ -> { first_snap, end_snap, epoch } (last_snap = end_snap - 1)
+
+
+ -- starting with mimic --
+
+ "purged_snap_%llu_%016llx" % (pool, last_snap)
+ -> { first_snap, end_snap, epoch } (last_snap = end_snap - 1)
+
+ - note that the {removed,purged}_snap put the last snap in the key so
+ that we can use forward iteration only to search for an epoch in an
+ interval. e.g., to test if epoch N is removed/purged, we'll find a key
+ >= N that either does or doesn't contain the given snap.
+
+
+ -- starting with octopus --
+
+ "purged_epoch_%08lx" % epoch
+ -> map<int64_t,interval_set<snapid_t>>
+
+ */
+using namespace TOPNSPC::common;
+namespace {
+
+ // Common PriorityCache::PriCache implementation for the OSDMonitor's
+ // osdmap caches. Keeps a byte budget per priority, the last committed
+ // size and the cache ratio; subclasses supply the current usage and a
+ // display name. The bin-related hooks are no-ops.
+ struct OSDMemCache : public PriorityCache::PriCache {
+   OSDMonitor *osdmon;
+   int64_t cache_bytes[PriorityCache::Priority::LAST+1] = {0};
+   int64_t committed_bytes = 0;
+   double cache_ratio = 0;
+
+   OSDMemCache(OSDMonitor *m) : osdmon(m) {}
+
+   // Bytes currently used by the underlying cache.
+   virtual uint64_t _get_used_bytes() const = 0;
+
+   // Additional bytes wanted at priority pri beyond what is already
+   // assigned to it; -EOPNOTSUPP for any priority other than PRI1.
+   virtual int64_t request_cache_bytes(
+     PriorityCache::Priority pri, uint64_t total_cache) const {
+     const int64_t assigned = get_cache_bytes(pri);
+
+     // All cache items are currently set to have PRI1 priority
+     if (pri != PriorityCache::Priority::PRI1) {
+       return -EOPNOTSUPP;
+     }
+     const int64_t request = _get_used_bytes();
+     if (request > assigned) {
+       return request - assigned;
+     }
+     return 0;
+   }
+
+   // Bytes assigned to a single priority.
+   virtual int64_t get_cache_bytes(PriorityCache::Priority pri) const {
+     return cache_bytes[pri];
+   }
+
+   // Bytes assigned across all priorities.
+   virtual int64_t get_cache_bytes() const {
+     int64_t sum = 0;
+     for (int idx = 0; idx <= PriorityCache::Priority::LAST; ++idx) {
+       sum += get_cache_bytes(static_cast<PriorityCache::Priority>(idx));
+     }
+     return sum;
+   }
+
+   virtual void set_cache_bytes(PriorityCache::Priority pri, int64_t bytes) {
+     cache_bytes[pri] = bytes;
+   }
+   virtual void add_cache_bytes(PriorityCache::Priority pri, int64_t bytes) {
+     cache_bytes[pri] += bytes;
+   }
+
+   // Store and return the chunked size for the current total assignment.
+   virtual int64_t commit_cache_size(uint64_t total_cache) {
+     const int64_t wanted = get_cache_bytes();
+     committed_bytes = PriorityCache::get_chunk(wanted, total_cache);
+     return committed_bytes;
+   }
+   virtual int64_t get_committed_size() const {
+     return committed_bytes;
+   }
+
+   virtual double get_cache_ratio() const {
+     return cache_ratio;
+   }
+   virtual void set_cache_ratio(double ratio) {
+     cache_ratio = ratio;
+   }
+
+   // Bins are not used by these caches.
+   virtual void shift_bins() {
+   }
+   virtual void import_bins(const std::vector<uint64_t> &bins) {
+   }
+   virtual void set_bins(PriorityCache::Priority pri, uint64_t end_bin) {
+   }
+   virtual uint64_t get_bins(PriorityCache::Priority pri) const {
+     return 0;
+   }
+
+   virtual string get_cache_name() const = 0;
+ };
+
+ // OSDMemCache view of OSDMonitor::inc_osd_cache.
+ struct IncCache : public OSDMemCache {
+   IncCache(OSDMonitor *m) : OSDMemCache(m) {}
+
+   virtual uint64_t _get_used_bytes() const {
+     const auto& cache = osdmon->inc_osd_cache;
+     return cache.get_bytes();
+   }
+
+   virtual string get_cache_name() const {
+     return "OSDMap Inc Cache";
+   }
+
+   // Number of entries currently held in the cache.
+   uint64_t _get_num_osdmaps() const {
+     const auto& cache = osdmon->inc_osd_cache;
+     return cache.get_size();
+   }
+ };
+
+ // OSDMemCache view of OSDMonitor::full_osd_cache.
+ struct FullCache : public OSDMemCache {
+   FullCache(OSDMonitor *m) : OSDMemCache(m) {}
+
+   virtual uint64_t _get_used_bytes() const {
+     const auto& cache = osdmon->full_osd_cache;
+     return cache.get_bytes();
+   }
+
+   virtual string get_cache_name() const {
+     return "OSDMap Full Cache";
+   }
+
+   // Number of entries currently held in the cache.
+   uint64_t _get_num_osdmaps() const {
+     const auto& cache = osdmon->full_osd_cache;
+     return cache.get_size();
+   }
+ };
+
+std::shared_ptr<IncCache> inc_cache;
+std::shared_ptr<FullCache> full_cache;
+
+const uint32_t MAX_POOL_APPLICATIONS = 4;
+const uint32_t MAX_POOL_APPLICATION_KEYS = 64;
+const uint32_t MAX_POOL_APPLICATION_LENGTH = 128;
+
+ // True when the grant allows OSD writes and either matches everything or
+ // names exactly the given pool. A null pool_name can only be satisfied by
+ // a match-all grant.
+ // Note: this doesn't include support for the application tag match
+ bool is_osd_writable(const OSDCapGrant& grant, const std::string* pool_name) {
+   if ((grant.spec.allow & OSD_CAP_W) == 0) {
+     return false;
+   }
+
+   auto& match = grant.match;
+   if (match.is_match_all()) {
+     return true;
+   }
+   if (pool_name == nullptr) {
+     return false;
+   }
+   const auto& granted_pool = match.pool_namespace.pool_name;
+   return !granted_pool.empty() && granted_pool == *pool_name;
+ }
+
+ // Decide whether entity_name may perform unmanaged-snapshot pool ops.
+ //
+ // Permission is granted when the mon caps allow the
+ // "osd pool op unmanaged-snap" command (restricted to *pool_name when a
+ // pool is given), or when the entity's OSD caps from the key server are
+ // allow-all or contain a grant that is_osd_writable() accepts for the
+ // pool. Missing, corrupt or unparsable OSD caps deny.
+ bool is_unmanaged_snap_op_permitted(CephContext* cct,
+                                     const KeyServer& key_server,
+                                     const EntityName& entity_name,
+                                     const MonCap& mon_caps,
+                                     const entity_addr_t& peer_socket_addr,
+                                     const std::string* pool_name)
+ {
+   using CommandArgs = std::map<std::string, std::string>;
+
+   // pool DNE, require unrestricted cap (empty argument map)
+   CommandArgs cmd_args;
+   if (pool_name != nullptr) {
+     cmd_args.emplace("poolname", *pool_name);
+   }
+   const bool mon_allows = mon_caps.is_capable(
+     cct, entity_name, "osd",
+     "osd pool op unmanaged-snap",
+     cmd_args,
+     false, true, false,
+     peer_socket_addr);
+   if (mon_allows) {
+     return true;
+   }
+
+   AuthCapsInfo caps_info;
+   const bool have_caps = key_server.get_service_caps(
+     entity_name, CEPH_ENTITY_TYPE_OSD, caps_info);
+   if (!have_caps) {
+     dout(10) << "unable to locate OSD cap data for " << entity_name
+              << " in auth db" << dendl;
+     return false;
+   }
+
+   string caps_str;
+   if (caps_info.caps.length() > 0) {
+     auto p = caps_info.caps.cbegin();
+     try {
+       decode(caps_str, p);
+     } catch (const ceph::buffer::error &err) {
+       derr << "corrupt OSD cap data for " << entity_name << " in auth db"
+            << dendl;
+       return false;
+     }
+   }
+
+   OSDCap osd_cap;
+   if (!osd_cap.parse(caps_str, nullptr)) {
+     dout(10) << "unable to parse OSD cap data for " << entity_name
+              << " in auth db" << dendl;
+     return false;
+   }
+
+   // if the entity has write permissions in one or all pools, permit
+   // usage of unmanaged-snapshots
+   if (osd_cap.allow_all()) {
+     return true;
+   }
+
+   for (auto& grant : osd_cap.grants) {
+     if (!grant.profile.is_valid()) {
+       // plain grant: check it directly
+       if (is_osd_writable(grant, pool_name)) {
+         return true;
+       }
+       continue;
+     }
+     // profile grant: check each grant the profile expands to
+     for (auto& profile_grant : grant.profile_grants) {
+       if (is_osd_writable(profile_grant, pool_name)) {
+         return true;
+       }
+     }
+   }
+
+   return false;
+ }
+
+} // anonymous namespace
+
+ // Record last_epoch_clean for placement seed ps of a pool that currently
+ // has pg_num PGs. Reports for ps >= pg_num and reports that do not advance
+ // the stored epoch are ignored. Keeps `floor` (lowest tracked epoch) and
+ // `next_missing` (first slot still at 0, scanning from the previous
+ // value) up to date.
+ void LastEpochClean::Lec::report(unsigned pg_num, ps_t ps,
+                                  epoch_t last_epoch_clean)
+ {
+   if (ps >= pg_num) {
+     // removed PG
+     return;
+   }
+   epoch_by_pg.resize(pg_num, 0);
+
+   const auto previous = epoch_by_pg[ps];
+   if (last_epoch_clean <= previous) {
+     // stale lec
+     return;
+   }
+   epoch_by_pg[ps] = last_epoch_clean;
+
+   if (last_epoch_clean < floor) {
+     floor = last_epoch_clean;
+   } else if (last_epoch_clean > floor && previous == floor) {
+     // the entry that sat at the floor moved up; recompute the minimum
+     floor = *std::min_element(epoch_by_pg.begin(), epoch_by_pg.end());
+   }
+
+   if (ps == next_missing) {
+     // advance past every slot that has been reported
+     while (next_missing < epoch_by_pg.size() &&
+            epoch_by_pg[next_missing] != 0) {
+       ++next_missing;
+     }
+   }
+ }
+
+ // Drop the per-pool report state for the given pool id, if any.
+ void LastEpochClean::remove_pool(uint64_t pool)
+ {
+ report_by_pool.erase(pool);
+ }
+
+ // Forward a last_epoch_clean report to the per-pool tracker for pg's pool,
+ // creating that tracker on first use.
+ void LastEpochClean::report(unsigned pg_num, const pg_t& pg,
+                             epoch_t last_epoch_clean)
+ {
+   report_by_pool[pg.pool()].report(pg_num, pg.ps(), last_epoch_clean);
+ }
+
+ // Lowest reported floor across all pools in `latest`, capped at the map's
+ // own epoch. Returns 0 as soon as a pool has no report state or still has
+ // a PG slot without a report (next_missing < pg_num).
+ epoch_t LastEpochClean::get_lower_bound(const OSDMap& latest) const
+ {
+   auto lower_bound = latest.get_epoch();
+   for (const auto& [pool_id, pool_info] : latest.get_pools()) {
+     const auto it = report_by_pool.find(pool_id);
+     if (it == report_by_pool.end()) {
+       return 0;
+     }
+     const auto& lec = it->second;
+     if (lec.next_missing < pool_info.get_pg_num()) {
+       return 0;
+     }
+     if (lec.floor < lower_bound) {
+       lower_bound = lec.floor;
+     }
+   }
+   return lower_bound;
+ }
+
+ // Emit a "per_pool" array with one {poolid, floor} object per tracked pool.
+ void LastEpochClean::dump(Formatter *f) const
+ {
+   f->open_array_section("per_pool");
+   for (const auto& entry : report_by_pool) {
+     f->open_object_section("pool");
+     f->dump_unsigned("poolid", entry.first);
+     f->dump_unsigned("floor", entry.second.floor);
+     f->close_section();
+   }
+   f->close_section();
+ }
+
+ // Completion context that records when it was created and, on a
+ // non-negative result, logs how long the mapping for `epoch` took and asks
+ // the OSDMonitor to refresh creating PGs and pg-create subscriptions.
+ class C_UpdateCreatingPGs : public Context {
+ public:
+   OSDMonitor *osdmon;
+   utime_t start;
+   epoch_t epoch;
+   C_UpdateCreatingPGs(OSDMonitor *osdmon, epoch_t e) :
+     osdmon(osdmon), start(ceph_clock_now()), epoch(e) {}
+   void finish(int r) override {
+     if (r < 0) {
+       // failed result: nothing to update
+       return;
+     }
+     utime_t end = ceph_clock_now();
+     dout(10) << "osdmap epoch " << epoch << " mapping took "
+              << (end - start) << " seconds" << dendl;
+     osdmon->update_creating_pgs();
+     osdmon->check_pg_creates_subs();
+   }
+ };
+
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, mon, osdmap)
+/// Write the log prefix "mon.<name>@<rank>(<state>).osd e<epoch> " to *_dout.
+static ostream& _prefix(std::ostream *_dout, Monitor &mon, const OSDMap& osdmap) {
+  std::ostream& out = *_dout;
+  out << "mon." << mon.name << "@" << mon.rank;
+  out << "(" << mon.get_state_name() << ").osd e" << osdmap.get_epoch() << " ";
+  return out;
+}
+
+/**
+ * Build the OSD monitor service: size both osdmap LRU caches from
+ * mon_osd_cache_size, create the inc/full cache objects, register as a config
+ * observer and apply the initial cache sizes.
+ */
+OSDMonitor::OSDMonitor(
+  CephContext *cct,
+  Monitor &mn,
+  Paxos &p,
+  const string& service_name)
+  : PaxosService(mn, p, service_name),
+    cct(cct),
+    inc_osd_cache(g_conf()->mon_osd_cache_size),
+    full_osd_cache(g_conf()->mon_osd_cache_size),
+    has_osdmap_manifest(false),
+    mapper(mn.cct, &mn.cpu_tp)
+{
+  inc_cache = std::make_shared<IncCache>(this);
+  full_cache = std::make_shared<FullCache>(this);
+  cct->_conf.add_observer(this);
+
+  // a failure here is only reported; construction continues
+  if (_set_cache_sizes() < 0) {
+    derr << __func__ << " using default osd cache size - mon_osd_cache_size ("
+         << g_conf()->mon_osd_cache_size
+         << ") without priority cache management"
+         << dendl;
+  }
+}
+
+/// Return the null-terminated list of config keys handled by
+/// handle_conf_change().
+const char **OSDMonitor::get_tracked_conf_keys() const
+{
+  static const char* tracked_keys[] = {
+    "mon_memory_target",
+    "mon_memory_autotune",
+    "rocksdb_cache_size",
+    nullptr
+  };
+  return tracked_keys;
+}
+
+/**
+ * React to changes of the tracked config keys.
+ *
+ * A change of mon_memory_autotune re-evaluates cache autotuning; a change of
+ * mon_memory_target or rocksdb_cache_size re-applies the cache settings and
+ * logs an error if that fails.
+ */
+void OSDMonitor::handle_conf_change(const ConfigProxy& conf,
+                                    const std::set<std::string> &changed)
+{
+  dout(10) << __func__ << " " << changed << dendl;
+
+  if (changed.count("mon_memory_autotune") > 0) {
+    _set_cache_autotuning();
+  }
+
+  const bool sizes_changed =
+    changed.count("mon_memory_target") > 0 ||
+    changed.count("rocksdb_cache_size") > 0;
+  if (!sizes_changed) {
+    return;
+  }
+
+  if (_update_mon_cache_settings() < 0) {
+    derr << __func__ << " mon_memory_target:"
+         << g_conf()->mon_memory_target
+         << " rocksdb_cache_size:"
+         << g_conf()->rocksdb_cache_size
+         << ". Unable to update cache size."
+         << dendl;
+  }
+}
+
+/**
+ * Bring the priority cache manager (pcm) in line with the current value of
+ * the mon_memory_autotune option.
+ *
+ * - option off: drop pcm (under balancer_lock) and clear the cached
+ *   mon_memory_autotune flag.
+ * - option on and no pcm yet: try to register the caches with a new pcm and
+ *   set the cached flag according to the outcome.
+ */
+void OSDMonitor::_set_cache_autotuning()
+{
+  if (!g_conf()->mon_memory_autotune) {
+    // Disable cache autotuning
+    std::lock_guard l(balancer_lock);
+    pcm = nullptr;
+    // Keep the cached flag in sync with the option.  update_from_paxos()
+    // re-registers the caches whenever the flag is set and pcm is null, so
+    // leaving it true here would silently turn autotuning back on with the
+    // next applied incremental.
+    mon_memory_autotune = false;
+    return;
+  }
+
+  if (pcm == nullptr) {
+    int r = register_cache_with_pcm();
+    if (r < 0) {
+      dout(10) << __func__
+               << " Error while registering osdmon caches with pcm."
+               << " Cache auto tuning not enabled."
+               << dendl;
+      mon_memory_autotune = false;
+    } else {
+      mon_memory_autotune = true;
+    }
+  }
+}
+
+/**
+ * Apply new mon_memory_target / rocksdb_cache_size values.
+ *
+ * @return 0 on success; -EINVAL if the configured sizes are invalid, if
+ *         neither pcm nor the rocksdb cache is available, or if the cache
+ *         ratios cannot be set (the previous sizes are then restored).
+ */
+int OSDMonitor::_update_mon_cache_settings()
+{
+  const bool invalid_sizes =
+    g_conf()->mon_memory_target <= 0 ||
+    g_conf()->mon_memory_target < mon_memory_min ||
+    g_conf()->rocksdb_cache_size <= 0;
+  if (invalid_sizes) {
+    return -EINVAL;
+  }
+
+  if (pcm == nullptr && rocksdb_binned_kv_cache == nullptr) {
+    derr << __func__ << " not using pcm and rocksdb" << dendl;
+    return -EINVAL;
+  }
+
+  // remember the current sizes so they can be put back on failure
+  const uint64_t prev_mon_memory_target = mon_memory_target;
+  const uint64_t prev_rocksdb_cache_size = rocksdb_cache_size;
+
+  // Set the new pcm memory cache sizes
+  mon_memory_target = g_conf()->mon_memory_target;
+  rocksdb_cache_size = g_conf()->rocksdb_cache_size;
+
+  if (_set_cache_ratios() < 0) {
+    derr << __func__ << " Cache ratios for pcm could not be set."
+         << " Review the kv (rocksdb) and mon_memory_target sizes."
+         << dendl;
+    mon_memory_target = prev_mon_memory_target;
+    rocksdb_cache_size = prev_rocksdb_cache_size;
+    return -EINVAL;
+  }
+
+  if (!mon_memory_autotune || pcm == nullptr) {
+    return 0;
+  }
+
+  // max defaults to min and is only raised when the fragmentation-adjusted
+  // target leaves room above base + min
+  uint64_t base = mon_memory_base;
+  double fragmentation = mon_memory_fragmentation;
+  uint64_t target = mon_memory_target;
+  uint64_t min = mon_memory_min;
+  uint64_t max = min;
+  uint64_t ltarget = (1.0 - fragmentation) * target;
+  if (ltarget > base + min) {
+    max = ltarget - base;
+  }
+
+  std::lock_guard l(balancer_lock);
+  // set pcm cache levels
+  pcm->set_target_memory(target);
+  pcm->set_min_memory(min);
+  pcm->set_max_memory(max);
+  // tune memory based on new values
+  pcm->tune_memory();
+  pcm->balance();
+  _set_new_cache_sizes();
+  dout(1) << __func__ << " Updated mon cache setting."
+          << " target: " << target
+          << " min: " << min
+          << " max: " << max
+          << dendl;
+  return 0;
+}
+
+/**
+ * When mon_memory_autotune is on, load the cache sizing options into the
+ * members and give both osdmap LRU caches their initial byte budget.
+ *
+ * @return 0 on success or when autotune is off; -EINVAL if
+ *         mon_memory_target or mon_memory_min is not positive.
+ */
+int OSDMonitor::_set_cache_sizes()
+{
+  if (!g_conf()->mon_memory_autotune) {
+    return 0;
+  }
+
+  // set the new osdmon cache targets to be managed by pcm
+  mon_osd_cache_size = g_conf()->mon_osd_cache_size;
+  rocksdb_cache_size = g_conf()->rocksdb_cache_size;
+  mon_memory_base = cct->_conf.get_val<Option::size_t>("osd_memory_base");
+  mon_memory_fragmentation = cct->_conf.get_val<double>("osd_memory_expected_fragmentation");
+  mon_memory_target = g_conf()->mon_memory_target;
+  mon_memory_min = g_conf()->mon_osd_cache_size_min;
+
+  const bool bad_sizes = (mon_memory_target <= 0 || mon_memory_min <= 0);
+  if (bad_sizes) {
+    derr << __func__ << " mon_memory_target:" << mon_memory_target
+         << " mon_memory_min:" << mon_memory_min
+         << ". Invalid size option(s) provided."
+         << dendl;
+    return -EINVAL;
+  }
+
+  // Set the initial inc and full LRU cache sizes
+  inc_osd_cache.set_bytes(mon_memory_min);
+  full_osd_cache.set_bytes(mon_memory_min);
+  mon_memory_autotune = g_conf()->mon_memory_autotune;
+  return 0;
+}
+
+/// True when the pending incremental carries a non-empty encoded crush map.
+bool OSDMonitor::_have_pending_crush()
+{
+  const bool has_crush = pending_inc.crush.length() != 0;
+  return has_crush;
+}
+
+/// Return a reference to the crush map held by the current osdmap.
+CrushWrapper &OSDMonitor::_get_stable_crush()
+{
+  CrushWrapper& stable = *osdmap.crush;
+  return stable;
+}
+
+/**
+ * Return a decoded copy of the crush map that will be in effect: the one in
+ * the pending incremental if present, otherwise a re-encoded copy of the
+ * current osdmap's crush map.
+ */
+CrushWrapper OSDMonitor::_get_pending_crush()
+{
+  bufferlist encoded;
+  if (pending_inc.crush.length() > 0) {
+    encoded = pending_inc.crush;
+  } else {
+    osdmap.crush->encode(encoded, CEPH_FEATURES_SUPPORTED_DEFAULT);
+  }
+
+  CrushWrapper result;
+  auto it = encoded.cbegin();
+  result.decode(it);
+  return result;
+}
+
+/**
+ * Build the epoch-1 osdmap and encode it into pending_inc.fullmap.
+ *
+ * The map is decoded from the "mkfs"/"osdmap" store key when that key holds
+ * data, otherwise built with build_simple().  Flags, fullness ratios,
+ * require_osd_release and require_min_compat_client are then set from the
+ * configuration.  Aborts if mon_osd_initial_require_min_compat_client does
+ * not name a valid release.
+ */
+void OSDMonitor::create_initial()
+{
+  dout(10) << "create_initial for " << mon.monmap->fsid << dendl;
+
+  OSDMap newmap;
+
+  bufferlist mkfs_bl;
+  mon.store->get("mkfs", "osdmap", mkfs_bl);
+
+  if (mkfs_bl.length() == 0) {
+    newmap.build_simple(cct, 0, mon.monmap->fsid, 0);
+  } else {
+    newmap.decode(mkfs_bl);
+    newmap.set_fsid(mon.monmap->fsid);
+  }
+  newmap.set_epoch(1);
+  newmap.created = newmap.modified = ceph_clock_now();
+
+  // new clusters should sort bitwise by default.
+  newmap.set_flag(CEPH_OSDMAP_SORTBITWISE);
+
+  newmap.flags |= (CEPH_OSDMAP_RECOVERY_DELETES |
+                   CEPH_OSDMAP_PURGED_SNAPDIRS |
+                   CEPH_OSDMAP_PGLOG_HARDLIMIT);
+
+  // ratios above 1.0 are divided by 100
+  const auto percent_to_fraction = [](auto& ratio) {
+    if (ratio > 1.0) {
+      ratio /= 100;
+    }
+  };
+  newmap.full_ratio = g_conf()->mon_osd_full_ratio;
+  percent_to_fraction(newmap.full_ratio);
+  newmap.backfillfull_ratio = g_conf()->mon_osd_backfillfull_ratio;
+  percent_to_fraction(newmap.backfillfull_ratio);
+  newmap.nearfull_ratio = g_conf()->mon_osd_nearfull_ratio;
+  percent_to_fraction(newmap.nearfull_ratio);
+
+  // new cluster should require latest by default
+  if (!g_conf().get_val<bool>("mon_debug_no_require_squid")) {
+    newmap.require_osd_release = ceph_release_t::squid;
+  } else if (g_conf().get_val<bool>("mon_debug_no_require_reef")) {
+    derr << __func__ << " mon_debug_no_require_squid and reef=true" << dendl;
+    newmap.require_osd_release = ceph_release_t::quincy;
+  } else {
+    derr << __func__ << " mon_debug_no_require_squid=true" << dendl;
+    newmap.require_osd_release = ceph_release_t::reef;
+  }
+
+  ceph_release_t r = ceph_release_from_name(g_conf()->mon_osd_initial_require_min_compat_client);
+  if (!r) {
+    ceph_abort_msg("mon_osd_initial_require_min_compat_client is not valid");
+  }
+  newmap.require_min_compat_client = r;
+
+  // encode into pending incremental
+  const uint64_t features = newmap.get_encoding_features();
+  newmap.encode(pending_inc.fullmap,
+                features | CEPH_FEATURE_RESERVED);
+  pending_inc.full_crc = newmap.get_crc();
+  dout(20) << " full crc " << pending_inc.full_crc << dendl;
+}
+
+/// Add the four store prefixes used by this service to @p s.
+void OSDMonitor::get_store_prefixes(std::set<string>& s) const
+{
+  s.emplace(service_name);
+  s.emplace(OSD_PG_CREATING_PREFIX);
+  s.emplace(OSD_METADATA_PREFIX);
+  s.emplace(OSD_SNAP_PREFIX);
+}
+
+void OSDMonitor::update_from_paxos(bool *need_bootstrap)
+{ /*
+ * Bring the in-memory osdmap up to the last committed paxos version.
+ *
+ * Steps, in order:
+ *  - reload the osdmap manifest (done on every call);
+ *  - return early if the last committed version equals osdmap.epoch;
+ *  - abort and drop any mapping job that is still pending;
+ *  - locate the newest stored full map and load it if it is newer than the
+ *    in-memory map;
+ *  - load creating_pgs from the store;
+ *  - apply each missing incremental, storing a full map for the epoch when
+ *    the store does not already have one, and replacing the in-memory map
+ *    with the stored one when its crc disagrees with the incremental;
+ *  - refresh down_pending_out, subscriptions, logger, failures, messenger
+ *    features, the mapping job (non-leaders only) and stretch-mode state.
+ *
+ * need_bootstrap is not referenced in this function.
+ */
+ // we really don't care if the version has been updated, because we may
+ // have trimmed without having increased the last committed; yet, we may
+ // need to update the in-memory manifest.
+ load_osdmap_manifest();
+
+ version_t version = get_last_committed();
+ if (version == osdmap.epoch)
+ return;
+ ceph_assert(version > osdmap.epoch);
+
+ dout(15) << "update_from_paxos paxos e " << version
+ << ", my e " << osdmap.epoch << dendl;
+
+ int prev_num_up_osd = osdmap.num_up_osd; // only read by the stretch-mode checks at the end
+
+ if (mapping_job) {
+ if (!mapping_job->is_done()) {
+ dout(1) << __func__ << " mapping job "
+ << mapping_job.get() << " did not complete, "
+ << mapping_job->shards << " left, canceling" << dendl;
+ mapping_job->abort();
+ }
+ mapping_job.reset();
+ }
+
+ load_health();
+
+ /*
+ * We will possibly have a stashed latest that *we* wrote, and we will
+ * always be sure to have the oldest full map in the first..last range
+ * due to encode_trim_extra(), which includes the oldest full map in the trim
+ * transaction.
+ *
+ * encode_trim_extra() does not however write the full map's
+ * version to 'full_latest'. This is only done when we are building the
+ * full maps from the incremental versions. But don't panic! We make sure
+ * that the following conditions find whichever full map version is newer.
+ */
+ version_t latest_full = get_version_latest_full();
+ if (latest_full == 0 && get_first_committed() > 1)
+ latest_full = get_first_committed();
+
+ if (get_first_committed() > 1 &&
+ latest_full < get_first_committed()) {
+ // the monitor could be just sync'ed with its peer, and the latest_full key
+ // is not encoded in the paxos commits in encode_pending(), so we need to
+ // make sure we get it pointing to a proper version.
+ version_t lc = get_last_committed();
+ version_t fc = get_first_committed();
+
+ dout(10) << __func__ << " looking for valid full map in interval"
+ << " [" << fc << ", " << lc << "]" << dendl;
+
+ latest_full = 0;
+ for (version_t v = lc; v >= fc; v--) { // newest first; fc > 1 in this branch, so v stops above 0
+ string full_key = "full_" + stringify(v);
+ if (mon.store->exists(get_service_name(), full_key)) {
+ dout(10) << __func__ << " found latest full map v " << v << dendl;
+ latest_full = v;
+ break;
+ }
+ }
+
+ ceph_assert(latest_full > 0);
+ auto t(std::make_shared<MonitorDBStore::Transaction>());
+ put_version_latest_full(t, latest_full);
+ mon.store->apply_transaction(t);
+ dout(10) << __func__ << " updated the on-disk full map version to "
+ << latest_full << dendl;
+ }
+
+ if ((latest_full > 0) && (latest_full > osdmap.epoch)) {
+ bufferlist latest_bl;
+ get_version_full(latest_full, latest_bl);
+ ceph_assert(latest_bl.length() != 0);
+ dout(7) << __func__ << " loading latest full map e" << latest_full << dendl;
+ osdmap = OSDMap();
+ osdmap.decode(latest_bl);
+ }
+
+ bufferlist bl;
+ if (!mon.store->get(OSD_PG_CREATING_PREFIX, "creating", bl)) { // a zero return is treated as "key found"
+ auto p = bl.cbegin();
+ std::lock_guard<std::mutex> l(creating_pgs_lock);
+ creating_pgs.decode(p);
+ dout(7) << __func__ << " loading creating_pgs last_scan_epoch "
+ << creating_pgs.last_scan_epoch
+ << " with " << creating_pgs.pgs.size() << " pgs" << dendl;
+ } else {
+ dout(1) << __func__ << " missing creating pgs; upgrade from post-kraken?"
+ << dendl;
+ }
+
+ // walk through incrementals
+ MonitorDBStore::TransactionRef t; // created lazily inside the loop
+ size_t tx_size = 0; // bytes of full maps encoded since the last flush
+ while (version > osdmap.epoch) {
+ bufferlist inc_bl;
+ int err = get_version(osdmap.epoch+1, inc_bl);
+ ceph_assert(err == 0);
+ ceph_assert(inc_bl.length());
+ // set priority cache manager levels if the osdmap is
+ // being populated for the first time.
+ if (mon_memory_autotune && pcm == nullptr) {
+ int r = register_cache_with_pcm();
+ if (r < 0) {
+ dout(10) << __func__
+ << " Error while registering osdmon caches with pcm."
+ << " Proceeding without cache auto tuning."
+ << dendl;
+ }
+ }
+
+ dout(7) << "update_from_paxos applying incremental " << osdmap.epoch+1
+ << dendl;
+ OSDMap::Incremental inc(inc_bl);
+ err = osdmap.apply_incremental(inc);
+ ceph_assert(err == 0);
+
+ if (!t)
+ t.reset(new MonitorDBStore::Transaction);
+
+ // Write out the full map for all past epochs. Encode the full
+ // map with the same features as the incremental. If we don't
+ // know, use the quorum features. If we don't know those either,
+ // encode with all features.
+ uint64_t f = inc.encode_features;
+ if (!f)
+ f = mon.get_quorum_con_features();
+ if (!f)
+ f = -1; // converts to a uint64_t with every bit set
+ bufferlist full_bl;
+ osdmap.encode(full_bl, f | CEPH_FEATURE_RESERVED);
+ tx_size += full_bl.length();
+
+ bufferlist orig_full_bl;
+ get_version_full(osdmap.epoch, orig_full_bl);
+ dout(20) << __func__ << " mon is running version: " << ceph_version_to_str() << dendl;
+ if (orig_full_bl.length()) {
+ // the primary provided the full map
+ ceph_assert(inc.have_crc);
+ if (inc.full_crc != osdmap.crc) {
+ // This will happen if the mons were running mixed versions in
+ // the past or some other circumstance made the full encoded
+ // maps divergent. Reloading here will bring us back into
+ // sync with the primary for this and all future maps. OSDs
+ // will also be brought back into sync when they discover the
+ // crc mismatch and request a full map from a mon.
+ derr << __func__ << " full map CRC mismatch,"
+ << " might be because mons are running mixed versions ..."
+ << " resetting to canonical" << dendl;
+
+ dout(20) << __func__ << " canonical crc: " << inc.full_crc
+ << " my crc: " << osdmap.crc << dendl;
+
+ dout(20) << __func__ << " my (bad) full osdmap:\n"; // NOTE(review): no dendl here; the statements up to `*_dout << dendl` presumably run inside the scope opened by dout(), which would explain why jf can be declared again below -- confirm against the dout macro
+ JSONFormatter jf(true);
+ jf.dump_object("osdmap", osdmap);
+ jf.flush(*_dout);
+ *_dout << "\nhexdump:\n";
+ full_bl.hexdump(*_dout);
+ *_dout << dendl;
+
+ osdmap = OSDMap();
+ osdmap.decode(orig_full_bl);
+
+ dout(20) << __func__ << " canonical full osdmap:\n";
+ JSONFormatter jf(true);
+ jf.dump_object("osdmap", osdmap);
+ jf.flush(*_dout);
+ *_dout << "\nhexdump:\n";
+ orig_full_bl.hexdump(*_dout);
+ *_dout << dendl;
+ }
+ } else {
+ ceph_assert(!inc.have_crc);
+ put_version_full(t, osdmap.epoch, full_bl);
+ }
+ put_version_latest_full(t, osdmap.epoch);
+
+ // share
+ dout(1) << osdmap << dendl;
+
+ if (osdmap.epoch == 1) {
+ t->erase("mkfs", "osdmap");
+ }
+
+ if (tx_size > g_conf()->mon_sync_max_payload_size*2) { // flush early so one transaction does not grow without bound
+ mon.store->apply_transaction(t);
+ t = MonitorDBStore::TransactionRef();
+ tx_size = 0;
+ }
+ for (auto [osd, state] : inc.new_state) {
+ if (state & CEPH_OSD_UP) {
+ // could be marked up *or* down, but we're too lazy to check which
+ last_osd_report.erase(osd);
+ }
+ }
+ for (auto [osd, weight] : inc.new_weight) {
+ if (weight == CEPH_OSD_OUT) {
+ // manually marked out, so drop it
+ osd_epochs.erase(osd);
+ }
+ }
+ }
+
+ if (t) { // apply whatever is left after the last in-loop flush
+ mon.store->apply_transaction(t);
+ }
+
+ bool marked_osd_down = false; // set when an osd is newly added to down_pending_out
+ for (int o = 0; o < osdmap.get_max_osd(); o++) {
+ if (osdmap.is_out(o))
+ continue;
+ auto found = down_pending_out.find(o);
+ if (osdmap.is_down(o)) {
+ // populate down -> out map
+ if (found == down_pending_out.end()) {
+ dout(10) << " adding osd." << o << " to down_pending_out map" << dendl;
+ down_pending_out[o] = ceph_clock_now();
+ marked_osd_down = true;
+ }
+ } else {
+ if (found != down_pending_out.end()) {
+ dout(10) << " removing osd." << o << " from down_pending_out map" << dendl;
+ down_pending_out.erase(found);
+ }
+ }
+ }
+ // XXX: need to trim MonSession connected with a osd whose id > max_osd?
+
+ check_osdmap_subs();
+ check_pg_creates_subs();
+
+ share_map_with_random_osd();
+ update_logger();
+ process_failures();
+
+ // make sure our feature bits reflect the latest map
+ update_msgr_features();
+
+ if (!mon.is_leader()) {
+ // will be called by on_active() on the leader, avoid doing so twice
+ start_mapping();
+ }
+ if (osdmap.stretch_mode_enabled) {
+ dout(20) << "Stretch mode enabled in this map" << dendl;
+ mon.try_engage_stretch_mode();
+ if (osdmap.degraded_stretch_mode) {
+ dout(20) << "Degraded stretch mode set in this map" << dendl;
+ if (!osdmap.recovering_stretch_mode) {
+ mon.set_degraded_stretch_mode();
+ dout(20) << "prev_num_up_osd: " << prev_num_up_osd << dendl;
+ dout(20) << "osdmap.num_up_osd: " << osdmap.num_up_osd << dendl;
+ dout(20) << "osdmap.num_osd: " << osdmap.num_osd << dendl;
+ dout(20) << "mon_stretch_cluster_recovery_ratio: " << cct->_conf.get_val<double>("mon_stretch_cluster_recovery_ratio") << dendl;
+ if (prev_num_up_osd < osdmap.num_up_osd && // more osds up than before this update
+ (osdmap.num_up_osd / (double)osdmap.num_osd) >
+ cct->_conf.get_val<double>("mon_stretch_cluster_recovery_ratio") &&
+ mon.dead_mon_buckets.size() == 0) {
+ // TODO: This works for 2-site clusters when the OSD maps are appropriately
+ // trimmed and everything is "normal" but not if you have a lot of out OSDs
+ // you're ignoring or in some really degenerate failure cases
+
+ dout(10) << "Enabling recovery stretch mode in this map" << dendl;
+ mon.go_recovery_stretch_mode();
+ }
+ } else {
+ mon.set_recovery_stretch_mode();
+ }
+ } else {
+ mon.set_healthy_stretch_mode();
+ }
+ if (marked_osd_down &&
+ (!osdmap.degraded_stretch_mode || osdmap.recovering_stretch_mode)) {
+ dout(20) << "Checking degraded stretch mode due to osd changes" << dendl;
+ mon.maybe_go_degraded_stretch_mode();
+ }
+ }
+}
+
+/**
+ * Create the priority cache manager and register the kv, inc and full
+ * caches with it.
+ *
+ * @return 0 on success; -EINVAL if the memory sizes are not positive, the
+ *         store provides no priority cache, or the cache ratios cannot be
+ *         set.
+ */
+int OSDMonitor::register_cache_with_pcm()
+{
+  if (mon_memory_target <= 0 || mon_memory_min <= 0) {
+    derr << __func__ << " Invalid memory size specified for mon caches."
+         << " Caches will not be auto-tuned."
+         << dendl;
+    return -EINVAL;
+  }
+
+  rocksdb_binned_kv_cache = mon.store->get_priority_cache();
+  if (!rocksdb_binned_kv_cache) {
+    derr << __func__ << " not using rocksdb" << dendl;
+    return -EINVAL;
+  }
+
+  if (_set_cache_ratios() < 0) {
+    derr << __func__ << " Cache ratios for pcm could not be set."
+         << " Review the kv (rocksdb) and mon_memory_target sizes."
+         << dendl;
+    return -EINVAL;
+  }
+
+  // Apply the same logic as in bluestore to set the max amount
+  // of memory to use for cache. Assume base memory for OSDMaps
+  // and then add in some overhead for fragmentation.
+  uint64_t base = mon_memory_base;
+  double fragmentation = mon_memory_fragmentation;
+  // For calculating total target memory, consider rocksdb cache size.
+  uint64_t target = mon_memory_target;
+  uint64_t min = mon_memory_min;
+  uint64_t max = min;
+  uint64_t ltarget = (1.0 - fragmentation) * target;
+  if (ltarget > base + min) {
+    max = ltarget - base;
+  }
+
+  pcm = std::make_shared<PriorityCache::Manager>(
+    cct, min, max, target, true);
+  pcm->insert("kv", rocksdb_binned_kv_cache, true);
+  pcm->insert("inc", inc_cache, true);
+  pcm->insert("full", full_cache, true);
+  dout(1) << __func__ << " pcm target: " << target
+          << " pcm max: " << max
+          << " pcm min: " << min
+          << " inc_osd_cache size: " << inc_osd_cache.get_size()
+          << dendl;
+  return 0;
+}
+
+/**
+ * Split the cache budget: the kv cache gets rocksdb_cache_size /
+ * mon_memory_target, and the inc and full caches share the rest equally.
+ *
+ * @return 0 on success; -EINVAL (kv ratio restored) if the kv ratio would be
+ *         >= 1.0.
+ */
+int OSDMonitor::_set_cache_ratios()
+{
+  const double prev_kv_ratio = cache_kv_ratio;
+
+  // Set the cache ratios for kv(rocksdb), inc and full caches
+  cache_kv_ratio =
+    static_cast<double>(rocksdb_cache_size) / static_cast<double>(mon_memory_target);
+  if (cache_kv_ratio >= 1.0) {
+    derr << __func__ << " Cache kv ratio (" << cache_kv_ratio
+         << ") must be in range [0,<1.0]."
+         << dendl;
+    cache_kv_ratio = prev_kv_ratio;
+    return -EINVAL;
+  }
+  rocksdb_binned_kv_cache->set_cache_ratio(cache_kv_ratio);
+
+  cache_full_ratio = (1.0 - cache_kv_ratio) / 2;
+  cache_inc_ratio = cache_full_ratio;
+  inc_cache->set_cache_ratio(cache_inc_ratio);
+  full_cache->set_cache_ratio(cache_full_ratio);
+
+  dout(1) << __func__ << " kv ratio " << cache_kv_ratio
+          << " inc ratio " << cache_inc_ratio
+          << " full ratio " << cache_full_ratio
+          << dendl;
+  return 0;
+}
+
+/**
+ * Abort any existing mapping job and, when the osdmap has pools, start a new
+ * one whose completion is a C_UpdateCreatingPGs for the current epoch.
+ * With no pools, mapping_job is cleared instead.
+ */
+void OSDMonitor::start_mapping()
+{
+  // initiate mapping job
+  if (mapping_job) {
+    dout(10) << __func__ << " canceling previous mapping_job " << mapping_job.get()
+             << dendl;
+    mapping_job->abort();
+  }
+
+  if (osdmap.get_pools().empty()) {
+    dout(10) << __func__ << " no pools, no mapping job" << dendl;
+    mapping_job = nullptr;
+    return;
+  }
+
+  // constructed before the job starts, so its start stamp precedes the update
+  auto fin = new C_UpdateCreatingPGs(this, osdmap.get_epoch());
+  mapping_job = mapping.start_update(osdmap, mapper,
+                                     g_conf()->mon_osd_mapping_pgs_per_chunk);
+  dout(10) << __func__ << " started mapping job " << mapping_job.get()
+           << " at " << fin->start << dendl;
+  mapping_job->set_finish_event(fin);
+}
+
+/**
+ * For each of the OSD, CLIENT, MDS and MON entity types, make the masked part
+ * of the messenger policy's required features equal to what the osdmap
+ * reports for that type.
+ */
+void OSDMonitor::update_msgr_features()
+{
+  const int entity_types[] = {
+    entity_name_t::TYPE_OSD,
+    entity_name_t::TYPE_CLIENT,
+    entity_name_t::TYPE_MDS,
+    entity_name_t::TYPE_MON
+  };
+  for (int type : entity_types) {
+    uint64_t mask;
+    const uint64_t features = osdmap.get_features(type, &mask);
+    ceph::net::Policy policy = mon.messenger->get_policy(type);
+    if ((policy.features_required & mask) == features) {
+      continue;  // already in line with the map
+    }
+    dout(0) << "crush map has features " << features << ", adjusting msgr requires" << dendl;
+    // replace only the bits covered by mask
+    policy.features_required = (policy.features_required & ~mask) | features;
+    mon.messenger->set_policy(type, policy);
+  }
+}
+
+/**
+ * Called when the service becomes active.  The leader logs the osdmap and
+ * converts pool priorities once; any other monitor re-dispatches all queued
+ * failure ops.  Both then start a mapping job.
+ */
+void OSDMonitor::on_active()
+{
+  update_logger();
+
+  if (!mon.is_leader()) {
+    list<MonOpRequestRef> failure_ops;
+    take_all_failures(failure_ops);
+    for (; !failure_ops.empty(); failure_ops.pop_front()) {
+      MonOpRequestRef op = failure_ops.front();
+      op->mark_osdmon_event(__func__);
+      dispatch(op);
+    }
+  } else {
+    mon.clog->debug() << "osdmap " << osdmap;
+    if (!priority_convert) {
+      // Only do this once at start-up
+      convert_pool_priorities();
+      priority_convert = true;
+    }
+  }
+  start_mapping();
+}
+
+void OSDMonitor::on_restart()
+{ // Restart hook: forget every entry recorded in last_osd_report.
+ last_osd_report.clear();
+}
+
+/// Shutdown hook: abort a running mapping job and drop all queued failure
+/// ops without dispatching them.
+void OSDMonitor::on_shutdown()
+{
+  dout(10) << __func__ << dendl;
+
+  if (mapping_job) {
+    dout(10) << __func__ << " canceling previous mapping_job " << mapping_job.get()
+             << dendl;
+    mapping_job->abort();
+  }
+
+  // discard failure info, waiters
+  list<MonOpRequestRef> discarded;
+  take_all_failures(discarded);
+  discarded.clear();
+}
+
+/// Publish the osd counts (total, up, in) and the osdmap epoch to the
+/// cluster logger.
+void OSDMonitor::update_logger()
+{
+  dout(10) << "update_logger" << dendl;
+
+  auto& logger = mon.cluster_logger;
+  logger->set(l_cluster_num_osd, osdmap.get_num_osds());
+  logger->set(l_cluster_num_osd_up, osdmap.get_num_up_osds());
+  logger->set(l_cluster_num_osd_in, osdmap.get_num_in_osds());
+  logger->set(l_cluster_osd_epoch, osdmap.get_epoch());
+}
+
+/**
+ * Start a fresh pending incremental for epoch osdmap.epoch+1 and clear the
+ * pending metadata changes.  Any fullness ratio that is <= 0 in the current
+ * map is re-seeded from the configuration.
+ */
+void OSDMonitor::create_pending()
+{
+  pending_inc = OSDMap::Incremental(osdmap.epoch+1);
+  pending_inc.fsid = mon.monmap->fsid;
+  pending_metadata.clear();
+  pending_metadata_rm.clear();
+
+  dout(10) << "create_pending e " << pending_inc.epoch << dendl;
+
+  // ratios above 1.0 are divided by 100
+  const auto percent_to_fraction = [](auto& ratio) {
+    if (ratio > 1.0) {
+      ratio /= 100;
+    }
+  };
+
+  // safety checks (this shouldn't really happen)
+  if (osdmap.backfillfull_ratio <= 0) {
+    pending_inc.new_backfillfull_ratio = g_conf()->mon_osd_backfillfull_ratio;
+    percent_to_fraction(pending_inc.new_backfillfull_ratio);
+    dout(1) << __func__ << " setting backfillfull_ratio = "
+            << pending_inc.new_backfillfull_ratio << dendl;
+  }
+  if (osdmap.full_ratio <= 0) {
+    pending_inc.new_full_ratio = g_conf()->mon_osd_full_ratio;
+    percent_to_fraction(pending_inc.new_full_ratio);
+    dout(1) << __func__ << " setting full_ratio = "
+            << pending_inc.new_full_ratio << dendl;
+  }
+  if (osdmap.nearfull_ratio <= 0) {
+    pending_inc.new_nearfull_ratio = g_conf()->mon_osd_nearfull_ratio;
+    percent_to_fraction(pending_inc.new_nearfull_ratio);
+    dout(1) << __func__ << " setting nearfull_ratio = "
+            << pending_inc.new_nearfull_ratio << dendl;
+  }
+}
+
+creating_pgs_t
+OSDMonitor::update_pending_pgs(const OSDMap::Incremental& inc,
+ const OSDMap& nextmap)
+{ /*
+ * Compute the creating_pgs state that goes with the incremental `inc`,
+ * where `nextmap` is the map the incremental leads to.
+ *
+ * Works on a copy of creating_pgs (taken under creating_pgs_lock) and
+ * returns that copy; creating_pgs itself is not written here.  It does
+ * clear pending_created_pgs and removes deleted pools from
+ * last_epoch_clean.
+ *
+ * Steps:
+ *  - if the copy has not been scanned up to inc.epoch: queue pools found in
+ *    the current map and in inc.new_pools, drop deleted pools, drop pgs
+ *    noted as created;
+ *  - drop pgs that do not exist in nextmap;
+ *  - move pgs from the per-pool queue into the creating set, up to the
+ *    configured maximum;
+ *  - with min_mon_release >= octopus, initialise or advance each creating
+ *    pg's up/acting sets, history and past_intervals.
+ */
+ dout(10) << __func__ << dendl;
+ creating_pgs_t pending_creatings;
+ {
+ std::lock_guard<std::mutex> l(creating_pgs_lock);
+ pending_creatings = creating_pgs;
+ }
+ // check for new or old pools
+ if (pending_creatings.last_scan_epoch < inc.epoch) {
+ unsigned queued = 0;
+ queued += scan_for_creating_pgs(osdmap.get_pools(),
+ inc.old_pools,
+ inc.modified,
+ &pending_creatings);
+ queued += scan_for_creating_pgs(inc.new_pools,
+ inc.old_pools,
+ inc.modified,
+ &pending_creatings);
+ dout(10) << __func__ << " " << queued << " pools queued" << dendl;
+ for (auto deleted_pool : inc.old_pools) {
+ auto removed = pending_creatings.remove_pool(deleted_pool);
+ dout(10) << __func__ << " " << removed
+ << " pg removed because containing pool deleted: "
+ << deleted_pool << dendl;
+ last_epoch_clean.remove_pool(deleted_pool);
+ }
+ // pgmon updates its creating_pgs in check_osd_map() which is called by
+ // on_active() and check_osd_map() could be delayed if lease expires, so its
+ // creating_pgs could be stale in comparison with the one of osdmon. let's
+ // trim them here. otherwise, they will be added back after being erased.
+ unsigned removed = 0;
+ for (auto& pg : pending_created_pgs) {
+ dout(20) << __func__ << " noting created pg " << pg << dendl;
+ pending_creatings.created_pools.insert(pg.pool());
+ removed += pending_creatings.pgs.erase(pg);
+ }
+ pending_created_pgs.clear();
+ dout(10) << __func__ << " " << removed
+ << " pgs removed because they're created" << dendl;
+ pending_creatings.last_scan_epoch = osdmap.get_epoch(); // the current map's epoch, not inc.epoch
+ }
+
+ // filter out any pgs that shouldn't exist.
+ {
+ auto i = pending_creatings.pgs.begin();
+ while (i != pending_creatings.pgs.end()) {
+ if (!nextmap.pg_exists(i->first)) {
+ dout(10) << __func__ << " removing pg " << i->first
+ << " which should not exist" << dendl;
+ i = pending_creatings.pgs.erase(i);
+ } else {
+ ++i;
+ }
+ }
+ }
+
+ // process queue
+ unsigned max = std::max<int64_t>(1, g_conf()->mon_osd_max_creating_pgs); // never below 1
+ const auto total = pending_creatings.pgs.size(); // size before dequeuing, used by the summary log at the end
+ while (pending_creatings.pgs.size() < max &&
+ !pending_creatings.queue.empty()) {
+ auto p = pending_creatings.queue.begin();
+ int64_t poolid = p->first;
+ dout(10) << __func__ << " pool " << poolid
+ << " created " << p->second.created
+ << " modified " << p->second.modified
+ << " [" << p->second.start << "-" << p->second.end << ")"
+ << dendl;
+ int64_t n = std::min<int64_t>(max - pending_creatings.pgs.size(), // remaining room, capped by what this pool still has queued
+ p->second.end - p->second.start);
+ ps_t first = p->second.start;
+ ps_t end = first + n;
+ for (ps_t ps = first; ps < end; ++ps) {
+ const pg_t pgid{ps, static_cast<uint64_t>(poolid)};
+ // NOTE: use the *current* epoch as the PG creation epoch so that the
+ // OSD does not have to generate a long set of PastIntervals.
+ pending_creatings.pgs.emplace(
+ pgid,
+ creating_pgs_t::pg_create_info(inc.epoch,
+ p->second.modified));
+ dout(10) << __func__ << " adding " << pgid << dendl;
+ }
+ p->second.start = end;
+ if (p->second.done()) {
+ dout(10) << __func__ << " done with queue for " << poolid << dendl;
+ pending_creatings.queue.erase(p);
+ } else {
+ dout(10) << __func__ << " pool " << poolid
+ << " now [" << p->second.start << "-" << p->second.end << ")"
+ << dendl;
+ }
+ }
+ dout(10) << __func__ << " queue remaining: " << pending_creatings.queue.size()
+ << " pools" << dendl;
+
+ if (mon.monmap->min_mon_release >= ceph_release_t::octopus) {
+ // walk creating pgs' history and past_intervals forward
+ for (auto& i : pending_creatings.pgs) {
+ // this mirrors PG::start_peering_interval()
+ pg_t pgid = i.first;
+
+ // this is a bit imprecise, but sufficient?
+ struct min_size_predicate_t : public IsPGRecoverablePredicate {
+ const pg_pool_t *pi;
+ bool operator()(const set<pg_shard_t> &have) const {
+ return have.size() >= pi->min_size;
+ }
+ explicit min_size_predicate_t(const pg_pool_t *i) : pi(i) {}
+ } min_size_predicate(nextmap.get_pg_pool(pgid.pool()));
+
+ vector<int> up, acting;
+ int up_primary, acting_primary;
+ nextmap.pg_to_up_acting_osds(
+ pgid, &up, &up_primary, &acting, &acting_primary);
+ if (i.second.history.epoch_created == 0) { // history has not been initialised for this entry yet
+ // new pg entry, set it up
+ i.second.up = up;
+ i.second.acting = acting;
+ i.second.up_primary = up_primary;
+ i.second.acting_primary = acting_primary;
+ i.second.history = pg_history_t(i.second.create_epoch,
+ i.second.create_stamp);
+ dout(10) << __func__ << " pg " << pgid << " just added, "
+ << " up " << i.second.up
+ << " p " << i.second.up_primary
+ << " acting " << i.second.acting
+ << " p " << i.second.acting_primary
+ << " history " << i.second.history
+ << " past_intervals " << i.second.past_intervals
+ << dendl;
+ } else {
+ std::stringstream debug;
+ if (PastIntervals::check_new_interval(
+ i.second.acting_primary, acting_primary,
+ i.second.acting, acting,
+ i.second.up_primary, up_primary,
+ i.second.up, up,
+ i.second.history.same_interval_since,
+ i.second.history.last_epoch_clean,
+ &nextmap,
+ &osdmap,
+ pgid,
+ min_size_predicate,
+ &i.second.past_intervals,
+ &debug)) {
+ epoch_t e = inc.epoch;
+ i.second.history.same_interval_since = e;
+ if (i.second.up != up) {
+ i.second.history.same_up_since = e;
+ }
+ if (i.second.acting_primary != acting_primary) {
+ i.second.history.same_primary_since = e;
+ }
+ if (pgid.is_split(
+ osdmap.get_pg_num(pgid.pool()),
+ nextmap.get_pg_num(pgid.pool()),
+ nullptr)) {
+ i.second.history.last_epoch_split = e;
+ }
+ dout(10) << __func__ << " pg " << pgid << " new interval,"
+ << " up " << i.second.up << " -> " << up
+ << " p " << i.second.up_primary << " -> " << up_primary
+ << " acting " << i.second.acting << " -> " << acting
+ << " p " << i.second.acting_primary << " -> "
+ << acting_primary
+ << " history " << i.second.history
+ << " past_intervals " << i.second.past_intervals
+ << dendl;
+ dout(20) << " debug: " << debug.str() << dendl;
+ i.second.up = up; // stored mapping is updated only after the old -> new log above
+ i.second.acting = acting;
+ i.second.up_primary = up_primary;
+ i.second.acting_primary = acting_primary;
+ }
+ }
+ }
+ }
+ dout(10) << __func__
+ << " " << (pending_creatings.pgs.size() - total)
+ << "/" << pending_creatings.pgs.size()
+ << " pgs added from queued pools" << dendl;
+ return pending_creatings;
+}
+
+void OSDMonitor::maybe_prime_pg_temp()
+{ /*
+ * Decide whether pg_temp entries should be primed for the pending
+ * incremental and, if so, for which pgs.
+ *
+ * All pgs are considered when the incremental carries a crush map, new up
+ * osds or a weight change that is not a reduction, or when the estimated
+ * number of affected pgs is too large.  Otherwise only pgs acting on the
+ * "interesting" osds (up osds with an UP state change, osds with a weight
+ * reduction) are considered.  Nothing is done if neither applies.
+ *
+ * The work is bounded by mon_osd_prime_pg_temp_max_time in both modes.
+ */
+ bool all = false;
+ if (pending_inc.crush.length()) {
+ dout(10) << __func__ << " new crush map, all" << dendl;
+ all = true;
+ }
+
+ if (!pending_inc.new_up_client.empty()) {
+ dout(10) << __func__ << " new up osds, all" << dendl;
+ all = true;
+ }
+
+ // check for interesting OSDs
+ set<int> osds;
+ for (auto p = pending_inc.new_state.begin();
+ !all && p != pending_inc.new_state.end();
+ ++p) {
+ if ((p->second & CEPH_OSD_UP) &&
+ osdmap.is_up(p->first)) {
+ osds.insert(p->first);
+ }
+ }
+ for (auto p = pending_inc.new_weight.begin();
+ !all && p != pending_inc.new_weight.end();
+ ++p) {
+ if (osdmap.exists(p->first) && p->second < osdmap.get_weight(p->first)) {
+ // weight reduction
+ osds.insert(p->first);
+ } else { // also taken for osds that do not exist in the current map
+ dout(10) << __func__ << " osd." << p->first << " weight increase, all"
+ << dendl;
+ all = true;
+ }
+ }
+
+ if (!all && osds.empty())
+ return;
+
+ if (!all) {
+ unsigned estimate = // pg count of the first interesting osd times the number of such osds
+ mapping.get_osd_acting_pgs(*osds.begin()).size() * osds.size();
+ if (estimate > mapping.get_num_pgs() *
+ g_conf()->mon_osd_prime_pg_temp_max_estimate) {
+ dout(10) << __func__ << " estimate " << estimate << " pgs on "
+ << osds.size() << " osds >= " // NOTE(review): the text says >= while the check above is a strict >
+ << g_conf()->mon_osd_prime_pg_temp_max_estimate << " of total "
+ << mapping.get_num_pgs() << " pgs, all"
+ << dendl;
+ all = true;
+ } else {
+ dout(10) << __func__ << " estimate " << estimate << " pgs on "
+ << osds.size() << " osds" << dendl;
+ }
+ }
+
+ OSDMap next; // the map as it will look once pending_inc is applied
+ next.deepish_copy_from(osdmap);
+ next.apply_incremental(pending_inc);
+
+ if (next.get_pools().empty()) {
+ dout(10) << __func__ << " no pools, no pg_temp priming" << dendl;
+ } else if (all) {
+ PrimeTempJob job(next, this);
+ mapper.queue(&job, g_conf()->mon_osd_mapping_pgs_per_chunk, {});
+ if (job.wait_for(g_conf()->mon_osd_prime_pg_temp_max_time)) {
+ dout(10) << __func__ << " done in " << job.get_duration() << dendl;
+ } else {
+ dout(10) << __func__ << " did not finish in "
+ << g_conf()->mon_osd_prime_pg_temp_max_time
+ << ", stopping" << dendl;
+ job.abort();
+ }
+ } else {
+ dout(10) << __func__ << " " << osds.size() << " interesting osds" << dendl;
+ utime_t stop = ceph_clock_now();
+ stop += g_conf()->mon_osd_prime_pg_temp_max_time;
+ const int chunk = 1000; // the clock is checked once per this many primed pgs
+ int n = chunk;
+ std::unordered_set<pg_t> did_pgs; // pgs already handled via another osd
+ for (auto osd : osds) {
+ auto& pgs = mapping.get_osd_acting_pgs(osd);
+ dout(20) << __func__ << " osd." << osd << " " << pgs << dendl;
+ for (auto pgid : pgs) {
+ if (!did_pgs.insert(pgid).second) {
+ continue;
+ }
+ prime_pg_temp(next, pgid);
+ if (--n <= 0) {
+ n = chunk;
+ if (ceph_clock_now() > stop) {
+ dout(10) << __func__ << " consumed more than "
+ << g_conf()->mon_osd_prime_pg_temp_max_time
+ << " seconds, stopping"
+ << dendl;
+ return;
+ }
+ }
+ }
+ }
+ }
+}
+
+ // Prime pg_temp for one PG. Compares the PG's current mapping (from
+ // `mapping`) with its mapping in `next`; when the acting set would change,
+ // queue the current acting set as a pg_temp entry in pending_inc, or an
+ // empty entry when the new up and acting sets coincide.
+ void OSDMonitor::prime_pg_temp(
+   const OSDMap& next,
+   pg_t pgid)
+ {
+   // TODO: remove this creating_pgs direct access?
+   // PGs still being created, or unknown to the current map, are left alone.
+   if (creating_pgs.pgs.count(pgid) || !osdmap.pg_exists(pgid)) {
+     return;
+   }
+
+   vector<int> cur_up, cur_acting;
+   mapping.get(pgid, &cur_up, nullptr, &cur_acting, nullptr);
+
+   vector<int> new_up, new_acting;
+   int new_up_primary, new_acting_primary;
+   next.pg_to_up_acting_osds(pgid, &new_up, &new_up_primary,
+                             &new_acting, &new_acting_primary);
+
+   const bool acting_unchanged = (cur_acting == new_acting);
+   const bool temp_became_redundant =
+     (cur_up != cur_acting) && (new_up == new_acting);
+   if (acting_unchanged && !temp_became_redundant) {
+     return;  // no change since last epoch
+   }
+
+   if (cur_acting.empty()) {
+     return;  // if previously empty now we can be no worse off
+   }
+   const pg_pool_t *pool = next.get_pg_pool(pgid.pool());
+   if (pool && cur_acting.size() < pool->min_size) {
+     return;  // can be no worse off than before
+   }
+
+   if (new_up == new_acting) {
+     // an empty vector is what gets queued below in this case
+     cur_acting.clear();
+     dout(20) << __func__ << " next_up == next_acting now, clear pg_temp"
+              << dendl;
+   }
+
+   dout(20) << __func__ << " " << pgid << " " << cur_up << "/" << cur_acting
+            << " -> " << new_up << "/" << new_acting
+            << ", priming " << cur_acting
+            << dendl;
+
+   std::lock_guard l(prime_pg_temp_lock);
+   // do not touch a mapping if a change is pending
+   // (emplace leaves an already-present entry untouched)
+   pending_inc.new_pg_temp.emplace(
+     pgid,
+     mempool::osdmap::vector<int>(cur_acting.begin(), cur_acting.end()));
+ }
+
+/**
+ * @note receiving a transaction in this function gives a fair amount of
+ * freedom to the service implementation if it does need it. It shouldn't.
+ */
+void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t)
+{
+ dout(10) << "encode_pending e " << pending_inc.epoch
+ << dendl;
+
+ if (do_prune(t)) {
+ dout(1) << __func__ << " osdmap full prune encoded e"
+ << pending_inc.epoch << dendl;
+ }
+
+ // finalize up pending_inc
+ pending_inc.modified = ceph_clock_now();
+
+ int r = pending_inc.propagate_base_properties_to_tiers(cct, osdmap);
+ ceph_assert(r == 0);
+
+ if (mapping_job) {
+ if (!mapping_job->is_done()) {
+ dout(1) << __func__ << " skipping prime_pg_temp; mapping job "
+ << mapping_job.get() << " did not complete, "
+ << mapping_job->shards << " left" << dendl;
+ mapping_job->abort();
+ } else if (mapping.get_epoch() < osdmap.get_epoch()) {
+ dout(1) << __func__ << " skipping prime_pg_temp; mapping job "
+ << mapping_job.get() << " is prior epoch "
+ << mapping.get_epoch() << dendl;
+ } else {
+ if (g_conf()->mon_osd_prime_pg_temp) {
+ maybe_prime_pg_temp();
+ }
+ }
+ } else if (g_conf()->mon_osd_prime_pg_temp) {
+ dout(1) << __func__ << " skipping prime_pg_temp; mapping job did not start"
+ << dendl;
+ }
+ mapping_job.reset();
+
+ // ensure we don't have blank new_state updates. these are interrpeted as
+ // CEPH_OSD_UP (and almost certainly not what we want!).
+ auto p = pending_inc.new_state.begin();
+ while (p != pending_inc.new_state.end()) {
+ if (p->second == 0) {
+ dout(10) << "new_state for osd." << p->first << " is 0, removing" << dendl;
+ p = pending_inc.new_state.erase(p);
+ } else {
+ if (p->second & CEPH_OSD_UP) {
+ pending_inc.new_last_up_change = pending_inc.modified;
+ }
+ ++p;
+ }
+ }
+ if (!pending_inc.new_up_client.empty()) {
+ pending_inc.new_last_up_change = pending_inc.modified;
+ }
+ for (auto& i : pending_inc.new_weight) {
+ if (i.first >= osdmap.max_osd) {
+ if (i.second) {
+ // new osd is already marked in
+ pending_inc.new_last_in_change = pending_inc.modified;
+ break;
+ }
+ } else if (!!i.second != !!osdmap.osd_weight[i.first]) {
+ // existing osd marked in or out
+ pending_inc.new_last_in_change = pending_inc.modified;
+ break;
+ }
+ }
+
+ {
+ OSDMap tmp;
+ tmp.deepish_copy_from(osdmap);
+ tmp.apply_incremental(pending_inc);
+
+ // clean pg_temp mappings
+ OSDMap::clean_temps(cct, osdmap, tmp, &pending_inc);
+
+ // clean inappropriate pg_upmap/pg_upmap_items (if any)
+ {
+ // check every upmapped pg for now
+ // until we could reliably identify certain cases to ignore,
+ // which is obviously the hard part TBD..
+ vector<pg_t> pgs_to_check;
+ tmp.get_upmap_pgs(&pgs_to_check);
+ if (pgs_to_check.size() <
+ static_cast<uint64_t>(g_conf()->mon_clean_pg_upmaps_per_chunk * 2)) {
+ // not enough pgs, do it inline
+ tmp.clean_pg_upmaps(cct, &pending_inc);
+ } else {
+ CleanUpmapJob job(cct, tmp, pending_inc);
+ mapper.queue(&job, g_conf()->mon_clean_pg_upmaps_per_chunk, pgs_to_check);
+ job.wait();
+ }
+ }
+
+ // update creating pgs first so that we can remove the created pgid and
+ // process the pool flag removal below in the same osdmap epoch.
+ auto pending_creatings = update_pending_pgs(pending_inc, tmp);
+ bufferlist creatings_bl;
+ uint64_t features = CEPH_FEATURES_ALL;
+ if (mon.monmap->min_mon_release < ceph_release_t::octopus) {
+ dout(20) << __func__ << " encoding pending pgs without octopus features"
+ << dendl;
+ features &= ~CEPH_FEATURE_SERVER_OCTOPUS;
+ }
+ encode(pending_creatings, creatings_bl, features);
+ t->put(OSD_PG_CREATING_PREFIX, "creating", creatings_bl);
+
+ // remove any old (or incompat) POOL_CREATING flags
+ for (auto& i : tmp.get_pools()) {
+ if (tmp.require_osd_release < ceph_release_t::nautilus) {
+ // pre-nautilus OSDMaps shouldn't get this flag.
+ if (pending_inc.new_pools.count(i.first)) {
+ pending_inc.new_pools[i.first].flags &= ~pg_pool_t::FLAG_CREATING;
+ }
+ }
+ if (i.second.has_flag(pg_pool_t::FLAG_CREATING) &&
+ !pending_creatings.still_creating_pool(i.first)) {
+ dout(10) << __func__ << " done creating pool " << i.first
+ << ", clearing CREATING flag" << dendl;
+ if (pending_inc.new_pools.count(i.first) == 0) {
+ pending_inc.new_pools[i.first] = i.second;
+ }
+ pending_inc.new_pools[i.first].flags &= ~pg_pool_t::FLAG_CREATING;
+ }
+ }
+
+ // collect which pools are currently affected by
+ // the near/backfill/full osd(s),
+ // and set per-pool near/backfill/full flag instead
+ set<int64_t> full_pool_ids;
+ set<int64_t> backfillfull_pool_ids;
+ set<int64_t> nearfull_pool_ids;
+ tmp.get_full_pools(cct,
+ &full_pool_ids,
+ &backfillfull_pool_ids,
+ &nearfull_pool_ids);
+ if (full_pool_ids.empty() ||
+ backfillfull_pool_ids.empty() ||
+ nearfull_pool_ids.empty()) {
+ // normal case - no nearfull, backfillfull or full osds
+ // try cancel any improper nearfull/backfillfull/full pool
+ // flags first
+ for (auto &pool: tmp.get_pools()) {
+ auto p = pool.first;
+ if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL) &&
+ nearfull_pool_ids.empty()) {
+ dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
+ << "'s nearfull flag" << dendl;
+ if (pending_inc.new_pools.count(p) == 0) {
+ // load original pool info first!
+ pending_inc.new_pools[p] = pool.second;
+ }
+ pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
+ }
+ if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL) &&
+ backfillfull_pool_ids.empty()) {
+ dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
+ << "'s backfillfull flag" << dendl;
+ if (pending_inc.new_pools.count(p) == 0) {
+ pending_inc.new_pools[p] = pool.second;
+ }
+ pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
+ }
+ if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL) &&
+ full_pool_ids.empty()) {
+ if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
+ // set by EQUOTA, skipping
+ continue;
+ }
+ dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
+ << "'s full flag" << dendl;
+ if (pending_inc.new_pools.count(p) == 0) {
+ pending_inc.new_pools[p] = pool.second;
+ }
+ pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_FULL;
+ }
+ }
+ }
+ if (!full_pool_ids.empty()) {
+ dout(10) << __func__ << " marking pool(s) " << full_pool_ids
+ << " as full" << dendl;
+ for (auto &p: full_pool_ids) {
+ if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL)) {
+ continue;
+ }
+ if (pending_inc.new_pools.count(p) == 0) {
+ pending_inc.new_pools[p] = tmp.pools[p];
+ }
+ pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_FULL;
+ pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
+ pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
+ }
+ // cancel FLAG_FULL for pools which are no longer full too
+ for (auto &pool: tmp.get_pools()) {
+ auto p = pool.first;
+ if (full_pool_ids.count(p)) {
+ // skip pools we have just marked as full above
+ continue;
+ }
+ if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL) ||
+ tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
+ // don't touch if currently is not full
+ // or is running out of quota (and hence considered as full)
+ continue;
+ }
+ dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
+ << "'s full flag" << dendl;
+ if (pending_inc.new_pools.count(p) == 0) {
+ pending_inc.new_pools[p] = pool.second;
+ }
+ pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_FULL;
+ }
+ }
+ if (!backfillfull_pool_ids.empty()) {
+ for (auto &p: backfillfull_pool_ids) {
+ if (full_pool_ids.count(p)) {
+ // skip pools we have already considered as full above
+ continue;
+ }
+ if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
+ // make sure FLAG_FULL is truly set, so we are safe not
+ // to set a extra (redundant) FLAG_BACKFILLFULL flag
+ ceph_assert(tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL));
+ continue;
+ }
+ if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
+ // don't bother if pool is already marked as backfillfull
+ continue;
+ }
+ dout(10) << __func__ << " marking pool '" << tmp.pool_name[p]
+ << "'s as backfillfull" << dendl;
+ if (pending_inc.new_pools.count(p) == 0) {
+ pending_inc.new_pools[p] = tmp.pools[p];
+ }
+ pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_BACKFILLFULL;
+ pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
+ }
+ // cancel FLAG_BACKFILLFULL for pools
+ // which are no longer backfillfull too
+ for (auto &pool: tmp.get_pools()) {
+ auto p = pool.first;
+ if (full_pool_ids.count(p) || backfillfull_pool_ids.count(p)) {
+ // skip pools we have just marked as backfillfull/full above
+ continue;
+ }
+ if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
+ // and don't touch if currently is not backfillfull
+ continue;
+ }
+ dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
+ << "'s backfillfull flag" << dendl;
+ if (pending_inc.new_pools.count(p) == 0) {
+ pending_inc.new_pools[p] = pool.second;
+ }
+ pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
+ }
+ }
+ if (!nearfull_pool_ids.empty()) {
+ for (auto &p: nearfull_pool_ids) {
+ if (full_pool_ids.count(p) || backfillfull_pool_ids.count(p)) {
+ continue;
+ }
+ if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
+ // make sure FLAG_FULL is truly set, so we are safe not
+ // to set a extra (redundant) FLAG_NEARFULL flag
+ ceph_assert(tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL));
+ continue;
+ }
+ if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL)) {
+ // don't bother if pool is already marked as nearfull
+ continue;
+ }
+ dout(10) << __func__ << " marking pool '" << tmp.pool_name[p]
+ << "'s as nearfull" << dendl;
+ if (pending_inc.new_pools.count(p) == 0) {
+ pending_inc.new_pools[p] = tmp.pools[p];
+ }
+ pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_NEARFULL;
+ }
+ // cancel FLAG_NEARFULL for pools
+ // which are no longer nearfull too
+ for (auto &pool: tmp.get_pools()) {
+ auto p = pool.first;
+ if (full_pool_ids.count(p) ||
+ backfillfull_pool_ids.count(p) ||
+ nearfull_pool_ids.count(p)) {
+ // skip pools we have just marked as
+ // nearfull/backfillfull/full above
+ continue;
+ }
+ if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL)) {
+ // and don't touch if currently is not nearfull
+ continue;
+ }
+ dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
+ << "'s nearfull flag" << dendl;
+ if (pending_inc.new_pools.count(p) == 0) {
+ pending_inc.new_pools[p] = pool.second;
+ }
+ pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
+ }
+ }
+
+ // min_compat_client?
+ if (!tmp.require_min_compat_client) {
+ auto mv = tmp.get_min_compat_client();
+ dout(1) << __func__ << " setting require_min_compat_client to currently "
+ << "required " << mv << dendl;
+ mon.clog->info() << "setting require_min_compat_client to currently "
+ << "required " << mv;
+ pending_inc.new_require_min_compat_client = mv;
+ }
+
+ if (osdmap.require_osd_release < ceph_release_t::nautilus &&
+ tmp.require_osd_release >= ceph_release_t::nautilus) {
+ dout(10) << __func__ << " first nautilus+ epoch" << dendl;
+ // add creating flags?
+ for (auto& i : tmp.get_pools()) {
+ if (pending_creatings.still_creating_pool(i.first)) {
+ dout(10) << __func__ << " adding CREATING flag to pool " << i.first
+ << dendl;
+ if (pending_inc.new_pools.count(i.first) == 0) {
+ pending_inc.new_pools[i.first] = i.second;
+ }
+ pending_inc.new_pools[i.first].flags |= pg_pool_t::FLAG_CREATING;
+ }
+ }
+ // adjust blocklist items to all be TYPE_ANY
+ for (auto& i : tmp.blocklist) {
+ auto a = i.first;
+ a.set_type(entity_addr_t::TYPE_ANY);
+ pending_inc.new_blocklist[a] = i.second;
+ pending_inc.old_blocklist.push_back(i.first);
+ }
+ }
+
+ if (osdmap.require_osd_release < ceph_release_t::octopus &&
+ tmp.require_osd_release >= ceph_release_t::octopus) {
+ dout(10) << __func__ << " first octopus+ epoch" << dendl;
+
+ // adjust obsoleted cache modes
+ for (auto& [poolid, pi] : tmp.pools) {
+ if (pi.cache_mode == pg_pool_t::CACHEMODE_FORWARD) {
+ if (pending_inc.new_pools.count(poolid) == 0) {
+ pending_inc.new_pools[poolid] = pi;
+ }
+ dout(10) << __func__ << " switching pool " << poolid
+ << " cachemode from forward -> proxy" << dendl;
+ pending_inc.new_pools[poolid].cache_mode = pg_pool_t::CACHEMODE_PROXY;
+ }
+ if (pi.cache_mode == pg_pool_t::CACHEMODE_READFORWARD) {
+ if (pending_inc.new_pools.count(poolid) == 0) {
+ pending_inc.new_pools[poolid] = pi;
+ }
+ dout(10) << __func__ << " switching pool " << poolid
+ << " cachemode from readforward -> readproxy" << dendl;
+ pending_inc.new_pools[poolid].cache_mode =
+ pg_pool_t::CACHEMODE_READPROXY;
+ }
+ }
+
+ // clear removed_snaps for every pool
+ for (auto& [poolid, pi] : tmp.pools) {
+ if (pi.removed_snaps.empty()) {
+ continue;
+ }
+ if (pending_inc.new_pools.count(poolid) == 0) {
+ pending_inc.new_pools[poolid] = pi;
+ }
+ dout(10) << __func__ << " clearing pool " << poolid << " removed_snaps"
+ << dendl;
+ pending_inc.new_pools[poolid].removed_snaps.clear();
+ }
+
+ // create a combined purged snap epoch key for all purged snaps
+ // prior to this epoch, and store it in the current epoch (i.e.,
+ // the last pre-octopus epoch, just prior to the one we're
+ // encoding now).
+ auto it = mon.store->get_iterator(OSD_SNAP_PREFIX);
+ it->lower_bound("purged_snap_");
+ map<int64_t,snap_interval_set_t> combined;
+ while (it->valid()) {
+ if (it->key().find("purged_snap_") != 0) {
+ break;
+ }
+ string k = it->key();
+ long long unsigned pool;
+ int n = sscanf(k.c_str(), "purged_snap_%llu_", &pool);
+ if (n != 1) {
+ derr << __func__ << " invalid purged_snaps key '" << k << "'" << dendl;
+ } else {
+ bufferlist v = it->value();
+ auto p = v.cbegin();
+ snapid_t begin, end;
+ ceph::decode(begin, p);
+ ceph::decode(end, p);
+ combined[pool].insert(begin, end - begin);
+ }
+ it->next();
+ }
+ if (!combined.empty()) {
+ string k = make_purged_snap_epoch_key(pending_inc.epoch - 1);
+ bufferlist v;
+ ceph::encode(combined, v);
+ t->put(OSD_SNAP_PREFIX, k, v);
+ dout(10) << __func__ << " recording pre-octopus purged_snaps in epoch "
+ << (pending_inc.epoch - 1) << ", " << v.length() << " bytes"
+ << dendl;
+ } else {
+ dout(10) << __func__ << " there were no pre-octopus purged snaps"
+ << dendl;
+ }
+
+ // clean out the old removed_snap_ and removed_epoch keys
+ // ('`' is ASCII '_' + 1)
+ t->erase_range(OSD_SNAP_PREFIX, "removed_snap_", "removed_snap`");
+ t->erase_range(OSD_SNAP_PREFIX, "removed_epoch_", "removed_epoch`");
+ }
+ }
+
+ // tell me about it
+ for (auto i = pending_inc.new_state.begin();
+ i != pending_inc.new_state.end();
+ ++i) {
+ int s = i->second ? i->second : CEPH_OSD_UP;
+ if (s & CEPH_OSD_UP) {
+ dout(2) << " osd." << i->first << " DOWN" << dendl;
+ // Reset laggy parameters if failure interval exceeds a threshold.
+ const osd_xinfo_t& xi = osdmap.get_xinfo(i->first);
+ if ((xi.laggy_probability || xi.laggy_interval) && xi.down_stamp.sec()) {
+ int last_failure_interval = pending_inc.modified.sec() - xi.down_stamp.sec();
+ if (grace_interval_threshold_exceeded(last_failure_interval)) {
+ set_default_laggy_params(i->first);
+ }
+ }
+ }
+ if (s & CEPH_OSD_EXISTS)
+ dout(2) << " osd." << i->first << " DNE" << dendl;
+ }
+ for (auto i = pending_inc.new_up_client.begin();
+ i != pending_inc.new_up_client.end();
+ ++i) {
+ //FIXME: insert cluster addresses too
+ dout(2) << " osd." << i->first << " UP " << i->second << dendl;
+ }
+ for (map<int32_t,uint32_t>::iterator i = pending_inc.new_weight.begin();
+ i != pending_inc.new_weight.end();
+ ++i) {
+ if (i->second == CEPH_OSD_OUT) {
+ dout(2) << " osd." << i->first << " OUT" << dendl;
+ } else if (i->second == CEPH_OSD_IN) {
+ dout(2) << " osd." << i->first << " IN" << dendl;
+ } else {
+ dout(2) << " osd." << i->first << " WEIGHT " << hex << i->second << dec << dendl;
+ }
+ }
+
+ // features for osdmap and its incremental
+ uint64_t features;
+
+ // encode full map and determine its crc
+ OSDMap tmp;
+ {
+ tmp.deepish_copy_from(osdmap);
+ tmp.apply_incremental(pending_inc);
+
+ // determine appropriate features
+ features = tmp.get_encoding_features();
+ dout(10) << __func__ << " encoding full map with "
+ << tmp.require_osd_release
+ << " features " << features << dendl;
+
+ // the features should be a subset of the mon quorum's features!
+ ceph_assert((features & ~mon.get_quorum_con_features()) == 0);
+
+ bufferlist fullbl;
+ encode(tmp, fullbl, features | CEPH_FEATURE_RESERVED);
+ pending_inc.full_crc = tmp.get_crc();
+
+ // include full map in the txn. note that old monitors will
+ // overwrite this. new ones will now skip the local full map
+ // encode and reload from this.
+ put_version_full(t, pending_inc.epoch, fullbl);
+ }
+
+ // encode
+ ceph_assert(get_last_committed() + 1 == pending_inc.epoch);
+ bufferlist bl;
+ encode(pending_inc, bl, features | CEPH_FEATURE_RESERVED);
+
+ dout(20) << __func__ << " mon is running version: "
+ << ceph_version_to_str() << dendl;
+ dout(20) << " full_crc " << tmp.get_crc()
+ << " inc_crc " << pending_inc.inc_crc << dendl;
+
+ /* put everything in the transaction */
+ put_version(t, pending_inc.epoch, bl);
+ put_last_committed(t, pending_inc.epoch);
+
+ // metadata, too!
+ for (map<int,bufferlist>::iterator p = pending_metadata.begin();
+ p != pending_metadata.end();
+ ++p) {
+ Metadata m;
+ auto mp = p->second.cbegin();
+ decode(m, mp);
+ t->put(OSD_METADATA_PREFIX, stringify(p->first), p->second);
+ }
+ for (set<int>::iterator p = pending_metadata_rm.begin();
+ p != pending_metadata_rm.end();
+ ++p) {
+ t->erase(OSD_METADATA_PREFIX, stringify(*p));
+ }
+ pending_metadata.clear();
+ pending_metadata_rm.clear();
+
+ // purged_snaps
+ if (tmp.require_osd_release >= ceph_release_t::octopus &&
+ !pending_inc.new_purged_snaps.empty()) {
+ // all snaps purged this epoch (across all pools)
+ string k = make_purged_snap_epoch_key(pending_inc.epoch);
+ bufferlist v;
+ encode(pending_inc.new_purged_snaps, v);
+ t->put(OSD_SNAP_PREFIX, k, v);
+ }
+ for (auto& i : pending_inc.new_purged_snaps) {
+ for (auto q = i.second.begin();
+ q != i.second.end();
+ ++q) {
+ insert_purged_snap_update(i.first, q.get_start(), q.get_end(),
+ pending_inc.epoch,
+ t);
+ }
+ }
+
+ // health
+ health_check_map_t next;
+ tmp.check_health(cct, &next);
+ encode_health(next, t);
+}
+
+ // Read the stored metadata blob for `osd` and decode it into `m`.
+ // Returns the store's negative error code if the read fails, -EIO (with a
+ // message on *err when err is non-null) if the blob cannot be decoded,
+ // and 0 on success.
+ int OSDMonitor::load_metadata(int osd, map<string, string>& m, ostream *err)
+ {
+   bufferlist raw;
+   if (int r = mon.store->get(OSD_METADATA_PREFIX, stringify(osd), raw); r < 0) {
+     return r;
+   }
+
+   try {
+     auto cursor = raw.cbegin();
+     decode(m, cursor);
+     return 0;
+   } catch (ceph::buffer::error&) {
+     if (err != nullptr) {
+       *err << "osd." << osd << " metadata is corrupt";
+     }
+     return -EIO;
+   }
+ }
+
+ // Tally, over all up OSDs, the values of metadata key `field` into *out.
+ // An OSD whose metadata lacks the key (including when loading it failed,
+ // since the load result is ignored) is counted under "unknown".
+ void OSDMonitor::count_metadata(const string& field, map<string,int> *out)
+ {
+   const int max_osd = osdmap.get_max_osd();
+   for (int id = 0; id < max_osd; ++id) {
+     if (!osdmap.is_up(id)) {
+       continue;
+     }
+     map<string,string> md;
+     load_metadata(id, md, nullptr);
+     if (auto found = md.find(field); found != md.end()) {
+       ++(*out)[found->second];
+     } else {
+       ++(*out)["unknown"];
+     }
+   }
+ }
+
+ // Formatter flavour of count_metadata(): emits an object section named
+ // after `field` holding one integer entry per distinct value.
+ void OSDMonitor::count_metadata(const string& field, Formatter *f)
+ {
+   map<string,int> tally;
+   count_metadata(field, &tally);
+
+   f->open_object_section(field.c_str());
+   for (auto it = tally.begin(); it != tally.end(); ++it) {
+     f->dump_int(it->first.c_str(), it->second);
+   }
+   f->close_section();
+ }
+
+ // For every up OSD whose metadata carries "ceph_version_short", append
+ // "osd.<id>" to the list kept under that version string in `versions`.
+ void OSDMonitor::get_versions(std::map<string, list<string>> &versions)
+ {
+   const int max_osd = osdmap.get_max_osd();
+   for (int id = 0; id < max_osd; ++id) {
+     if (!osdmap.is_up(id)) {
+       continue;
+     }
+     map<string,string> md;
+     load_metadata(id, md, nullptr);
+     auto ver = md.find("ceph_version_short");
+     if (ver != md.end()) {
+       versions[ver->second].push_back(string("osd.") + stringify(id));
+     }
+   }
+ }
+
+ // Fetch the "osd_objectstore" metadata value of `osd` into *type.
+ // Returns a negative error from load_metadata(), -ENOENT when the key is
+ // absent, and 0 on success.
+ int OSDMonitor::get_osd_objectstore_type(int osd, string *type)
+ {
+   map<string, string> md;
+   if (int r = load_metadata(osd, md, nullptr); r < 0) {
+     return r;
+   }
+
+   auto entry = md.find("osd_objectstore");
+   if (entry != md.end()) {
+     *type = entry->second;
+     return 0;
+   }
+   return -ENOENT;
+ }
+
+
+ // Spot-check whether the OSDs currently serving `pool` use bluestore.
+ //
+ // Looks at the up set of at most the first 8 PGs of the pool. An OSD whose
+ // objectstore type cannot be determined is accepted. Returns false on the
+ // first OSD reporting another objectstore; in that case a description is
+ // written to *err when err is non-null. Returns true otherwise.
+ bool OSDMonitor::is_pool_currently_all_bluestore(int64_t pool_id,
+                                                  const pg_pool_t &pool,
+                                                  ostream *err)
+ {
+   // just check a few pgs for efficiency - this can't give a guarantee anyway,
+   // since filestore osds could always join the pool later
+   set<int> checked_osds;
+   for (unsigned ps = 0; ps < std::min(8u, pool.get_pg_num()); ++ps) {
+     vector<int> up, acting;
+     pg_t pgid(ps, pool_id);
+     osdmap.pg_to_up_acting_osds(pgid, up, acting);
+     for (int osd : up) {
+       if (checked_osds.count(osd)) {
+         continue;  // already vetted via an earlier pg
+       }
+       string objectstore_type;
+       int r = get_osd_objectstore_type(osd, &objectstore_type);
+       // allow with missing metadata, e.g. due to an osd never booting yet
+       if (r < 0 || objectstore_type == "bluestore") {
+         checked_osds.insert(osd);
+         continue;
+       }
+       // err is treated as optional, matching load_metadata(); the original
+       // dereferenced it unconditionally
+       if (err) {
+         *err << "osd." << osd << " uses " << objectstore_type;
+       }
+       return false;
+     }
+   }
+   return true;
+ }
+
+ // Dump every metadata key/value of `osd` to `f` as strings.
+ // Any non-zero result of load_metadata() is returned unchanged.
+ int OSDMonitor::dump_osd_metadata(int osd, Formatter *f, ostream *err)
+ {
+   map<string,string> md;
+   const int r = load_metadata(osd, md, err);
+   if (r != 0) {
+     return r;
+   }
+   for (const auto& kv : md) {
+     f->dump_string(kv.first.c_str(), kv.second);
+   }
+   return 0;
+ }
+
+ // Group OSD ids by the "hostname" found in their metadata and hand the
+ // result to dump_services(). OSDs whose metadata cannot be loaded, or that
+ // have no hostname entry, are left out.
+ void OSDMonitor::print_nodes(Formatter *f)
+ {
+   map<string, list<int> > by_host;  // hostname => osd
+   const int max_osd = osdmap.get_max_osd();
+   for (int id = 0; id < max_osd; id++) {
+     map<string, string> md;
+     if (load_metadata(id, md, nullptr) != 0) {
+       continue;
+     }
+     auto host = md.find("hostname");
+     if (host != md.end()) {
+       by_host[host->second].push_back(id);
+     }
+     // else: not likely though
+   }
+
+   dump_services(f, by_host, "osd");
+ }
+
+ // Send the latest incremental (previous epoch .. current epoch) to one
+ // randomly chosen OSD session, if any OSD is up and has a session.
+ void OSDMonitor::share_map_with_random_osd()
+ {
+   if (osdmap.get_num_up_osds() == 0) {
+     dout(10) << __func__ << " no up osds, don't share with anyone" << dendl;
+     return;
+   }
+
+   MonSession *session = mon.session_map.get_random_osd_session(&osdmap);
+   if (session == nullptr) {
+     dout(10) << __func__ << " no up osd on our session map" << dendl;
+     return;
+   }
+
+   dout(10) << "committed, telling random " << session->name
+            << " all about it" << dendl;
+
+   // get feature of the peer
+   // use quorum_con_features, if it's an anonymous connection.
+   uint64_t features = session->con_features;
+   if (!features) {
+     features = mon.get_quorum_con_features();
+   }
+
+   // whatev, they'll request more if they need it
+   const auto latest = osdmap.get_epoch();
+   MOSDMap *m = build_incremental(latest - 1, latest, features);
+   session->con->send_message(m);
+   // NOTE: do *not* record osd has up to this epoch (as we do
+   // elsewhere) as they may still need to request older values.
+ }
+
+ // Compute the epoch up to which osdmaps may be trimmed.
+ // Returns 0 (no trim) when there is no quorum, when pgs are still being
+ // created, when 'mon_debug_block_osdmap_trim' is set, or when the computed
+ // floor is not past the first committed version.
+ version_t OSDMonitor::get_trim_to() const
+ {
+   if (mon.get_quorum().empty()) {
+     dout(10) << __func__ << " quorum not formed, trim_to = 0" << dendl;
+     return 0;
+   }
+
+   {
+     // hold the lock only while peeking at creating_pgs
+     std::lock_guard<std::mutex> l(creating_pgs_lock);
+     if (!creating_pgs.pgs.empty()) {
+       dout(10) << __func__ << " pgs creating, trim_to = 0" << dendl;
+       return 0;
+     }
+   }
+
+   if (g_conf().get_val<bool>("mon_debug_block_osdmap_trim")) {
+     dout(0) << __func__
+             << " blocking osdmap trim"
+             << " ('mon_debug_block_osdmap_trim' set to 'true')"
+             << " trim_to = 0" << dendl;
+     return 0;
+   }
+
+   epoch_t floor = get_min_last_epoch_clean();
+   dout(10) << " min_last_epoch_clean " << floor << dendl;
+
+   // an explicit override wins, provided it is below last_committed
+   if (g_conf()->mon_osd_force_trim_to > 0 &&
+       g_conf()->mon_osd_force_trim_to < (int)get_last_committed()) {
+     floor = g_conf()->mon_osd_force_trim_to;
+     dout(10) << __func__
+              << " explicit mon_osd_force_trim_to = " << floor << dendl;
+   }
+
+   // always keep at least mon_min_osdmap_epochs epochs around
+   unsigned min = g_conf()->mon_min_osdmap_epochs;
+   if (floor + min > get_last_committed()) {
+     floor = (min < get_last_committed()) ? get_last_committed() - min : 0;
+   }
+
+   if (floor <= get_first_committed()) {
+     dout(10) << __func__ << " trim_to = 0" << dendl;
+     return 0;
+   }
+   dout(10) << __func__ << " trim_to = " << floor << dendl;
+   return floor;
+ }
+
+ // Lowest epoch that must be kept: the last_epoch_clean lower bound,
+ // lowered further to the smallest epoch recorded in osd_epochs.
+ epoch_t OSDMonitor::get_min_last_epoch_clean() const
+ {
+   auto lowest = last_epoch_clean.get_lower_bound(osdmap);
+   // also scan osd epochs
+   // don't trim past the oldest reported osd epoch
+   for (const auto& entry : osd_epochs) {
+     if (entry.second < lowest) {
+       lowest = entry.second;
+     }
+   }
+   return lowest;
+ }
+
+ // Extra work when trimming to `first`: re-put the full map for `first`
+ // into `tx`, and update the prune manifest when one is loaded and `first`
+ // lies beyond its first pinned epoch.
+ void OSDMonitor::encode_trim_extra(MonitorDBStore::TransactionRef tx,
+                                    version_t first)
+ {
+   dout(10) << __func__ << " including full map for e " << first << dendl;
+
+   bufferlist full_bl;
+   get_version_full(first, full_bl);
+   put_version_full(tx, first, full_bl);
+
+   if (!has_osdmap_manifest) {
+     return;
+   }
+   if (first > osdmap_manifest.get_first_pinned()) {
+     _prune_update_trimmed(tx, first);
+   }
+ }
+
+
+/* full osdmap prune
+ *
+ * for more information, please refer to doc/dev/mon-osdmap-prune.rst
+ */
+
+ // Sync the in-memory osdmap manifest with the store: reload it when the
+ // "osdmap_manifest" key exists, otherwise drop any in-memory copy.
+ // Aborts if the key exists but cannot be read.
+ void OSDMonitor::load_osdmap_manifest()
+ {
+   if (!mon.store->exists(get_service_name(), "osdmap_manifest")) {
+     // nothing on disk; forget whatever we may still hold in memory
+     if (has_osdmap_manifest) {
+       dout(20) << __func__
+                << " dropping osdmap manifest from memory." << dendl;
+       osdmap_manifest = osdmap_manifest_t();
+       has_osdmap_manifest = false;
+     }
+     return;
+   }
+
+   dout(20) << __func__
+            << " osdmap manifest detected in store; reload." << dendl;
+
+   bufferlist encoded;
+   if (get_value("osdmap_manifest", encoded) < 0) {
+     derr << __func__ << " unable to read osdmap version manifest" << dendl;
+     ceph_abort_msg("error reading manifest");
+   }
+   osdmap_manifest.decode(encoded);
+   has_osdmap_manifest = true;
+
+   dout(10) << __func__ << " store osdmap manifest pinned ("
+            << osdmap_manifest.get_first_pinned()
+            << " .. "
+            << osdmap_manifest.get_last_pinned()
+            << ")"
+            << dendl;
+ }
+
+ // Decide whether a full-osdmap prune pass should run now, based on the
+ // committed version range, the configured minimums/interval and the last
+ // pinned epoch of the manifest. Returns true only if every check passes.
+ bool OSDMonitor::should_prune() const
+ {
+   const version_t first = get_first_committed();
+   const version_t last = get_last_committed();
+   const version_t min_osdmap_epochs =
+     g_conf().get_val<int64_t>("mon_min_osdmap_epochs");
+   const version_t prune_min =
+     g_conf().get_val<uint64_t>("mon_osdmap_full_prune_min");
+   const version_t prune_interval =
+     g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
+   const version_t last_pinned = osdmap_manifest.get_last_pinned();
+   const version_t last_to_pin = last - min_osdmap_epochs;
+
+   // Make it or break it constraints.
+   //
+   // If any of these conditions fails, we will not prune, regardless of
+   // whether we have an on-disk manifest with an on-going pruning state.
+
+   const version_t held = last - first;
+   if (held <= min_osdmap_epochs) {
+     // between the first and last committed epochs, we don't have
+     // enough epochs to trim, much less to prune.
+     dout(10) << __func__
+              << " currently holding only " << held
+              << " epochs (min osdmap epochs: " << min_osdmap_epochs
+              << "); do not prune."
+              << dendl;
+     return false;
+   }
+
+   const version_t prunable = last_to_pin - first;
+   if (prunable < prune_min) {
+     // between the first committed epoch and the last epoch we would prune,
+     // we simply don't have enough versions over the minimum to prune maps.
+     dout(10) << __func__
+              << " could only prune " << prunable
+              << " epochs (" << first << ".." << last_to_pin
+              << "), which is less than the required minimum ("
+              << prune_min << ")"
+              << dendl;
+     return false;
+   }
+
+   if (has_osdmap_manifest && last_pinned >= last_to_pin) {
+     dout(10) << __func__
+              << " we have pruned as far as we can; do not prune."
+              << dendl;
+     return false;
+   }
+
+   if (last_pinned + prune_interval > last_to_pin) {
+     dout(10) << __func__
+              << " not enough epochs to form an interval (last pinned: "
+              << last_pinned << ", last to pin: "
+              << last_to_pin << ", interval: " << prune_interval << ")"
+              << dendl;
+     return false;
+   }
+
+   dout(15) << __func__
+            << " should prune (" << last_pinned << ".." << last_to_pin << ")"
+            << " lc (" << first << ".." << last << ")"
+            << dendl;
+   return true;
+ }
+
+ // After trimming to `first`, rewrite the stored manifest so that `first`
+ // is its first pinned epoch; erase the stored manifest instead when no
+ // useful pinned range remains. Works on a copy; the in-memory
+ // osdmap_manifest is not modified here.
+ void OSDMonitor::_prune_update_trimmed(
+   MonitorDBStore::TransactionRef tx,
+   version_t first)
+ {
+   dout(10) << __func__
+            << " first " << first
+            << " last_pinned " << osdmap_manifest.get_last_pinned()
+            << dendl;
+
+   osdmap_manifest_t updated = osdmap_manifest;
+
+   if (!updated.is_pinned(first)) {
+     updated.pin(first);
+   }
+
+   // drop every pinned epoch that precedes `first`
+   set<version_t>::iterator keep_from = updated.pinned.find(first);
+   updated.pinned.erase(updated.pinned.begin(), keep_from);
+   ceph_assert(updated.get_first_pinned() == first);
+
+   const bool exhausted =
+     updated.get_last_pinned() == first+1 || updated.pinned.size() == 1;
+   if (exhausted) {
+     // we reached the end of the line, as pinned maps go; clean up our
+     // manifest, and let `should_prune()` decide whether we should prune
+     // again.
+     tx->erase(get_service_name(), "osdmap_manifest");
+     return;
+   }
+
+   bufferlist encoded;
+   updated.encode(encoded);
+   tx->put(get_service_name(), "osdmap_manifest", encoded);
+ }
+
+ // Seed `manifest` with the epoch a new prune pass starts from: the last
+ // pinned epoch when a manifest is loaded, otherwise the first committed
+ // version. Asserts the invariants of whichever case applies.
+ void OSDMonitor::prune_init(osdmap_manifest_t& manifest)
+ {
+   dout(1) << __func__ << dendl;
+
+   // verify constraints on stable in-memory state
+   if (has_osdmap_manifest) {
+     // we must have pruned in the past AND its state is still relevant
+     // (i.e., even if we trimmed, we still hold pinned maps in the manifest,
+     // and thus we still hold a manifest in the store).
+     ceph_assert(!osdmap_manifest.pinned.empty());
+     ceph_assert(osdmap_manifest.get_first_pinned() == get_first_committed());
+     ceph_assert(osdmap_manifest.get_last_pinned() < get_last_committed());
+
+     dout(10) << __func__
+              << " first_pinned " << osdmap_manifest.get_first_pinned()
+              << " last_pinned " << osdmap_manifest.get_last_pinned()
+              << dendl;
+
+     manifest.pin(osdmap_manifest.get_last_pinned());
+     return;
+   }
+
+   // we must have never pruned, OR if we pruned the state must no longer
+   // be relevant (i.e., the state must have been removed alongside with
+   // the trim that *must* have removed past the last pinned map in a
+   // previous prune).
+   ceph_assert(osdmap_manifest.pinned.empty());
+   ceph_assert(!mon.store->exists(get_service_name(), "osdmap_manifest"));
+   manifest.pin(get_first_committed());
+ }
+
+ // Validate the full-osdmap-prune options against each other.
+ //
+ // Checks 'mon_osdmap_full_prune_interval', 'mon_osdmap_full_prune_min' and
+ // 'mon_osdmap_full_prune_txsize'. Every violated constraint is logged via
+ // derr; returns true only when none is violated.
+ bool OSDMonitor::_prune_sanitize_options() const
+ {
+   uint64_t prune_interval =
+     g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
+   uint64_t prune_min =
+     g_conf().get_val<uint64_t>("mon_osdmap_full_prune_min");
+   uint64_t txsize =
+     g_conf().get_val<uint64_t>("mon_osdmap_full_prune_txsize");
+
+   bool r = true;
+
+   if (prune_interval == 0) {
+     derr << __func__
+          << " prune is enabled BUT prune interval is zero; abort."
+          << dendl;
+     r = false;
+   } else if (prune_interval == 1) {
+     derr << __func__
+          << " prune interval is equal to one, which essentially means"
+             " no pruning; abort."
+          << dendl;
+     r = false;
+   }
+   if (prune_min == 0) {
+     derr << __func__
+          << " prune is enabled BUT prune min is zero; abort."
+          << dendl;
+     r = false;
+   }
+   if (prune_interval > prune_min) {
+     derr << __func__
+          << " impossible to ascertain proper prune interval because"
+          << " it is greater than the minimum prune epochs"
+          << " (min: " << prune_min << ", interval: " << prune_interval << ")"
+          << dendl;
+     r = false;
+   }
+
+   // prune_interval is unsigned, so `prune_interval - 1` wraps to 2^64-1
+   // when the interval is zero. That case is already rejected above, so
+   // skip the comparison instead of logging a bogus txsize error.
+   if (prune_interval > 0 && txsize < prune_interval - 1) {
+     derr << __func__
+          << " 'mon_osdmap_full_prune_txsize' (" << txsize
+          << ") < 'mon_osdmap_full_prune_interval-1' (" << prune_interval - 1
+          << "); abort." << dendl;
+     r = false;
+   }
+   return r;
+ }
+
+/// @returns the current value of the 'mon_osdmap_full_prune_enabled' option.
+bool OSDMonitor::is_prune_enabled() const
+{
+  const bool enabled =
+    g_conf().get_val<bool>("mon_osdmap_full_prune_enabled");
+  return enabled;
+}
+
+/// @returns true if the required monitor features include
+/// FEATURE_OSDMAP_PRUNE.
+bool OSDMonitor::is_prune_supported() const
+{
+  const auto& required = mon.get_required_mon_features();
+  return required.contains_any(ceph::features::mon::FEATURE_OSDMAP_PRUNE);
+}
+
+/** do_prune
+ *
+ * Pins one full osdmap every 'mon_osdmap_full_prune_interval' versions and
+ * queues the erasure of the unpinned full maps in between, bounded by
+ * 'mon_osdmap_full_prune_txsize' removals per call. The updated manifest
+ * is written to the "osdmap_manifest" key in the same transaction.
+ *
+ * Does nothing when pruning is disabled, when the options do not pass
+ * _prune_sanitize_options(), or when should_prune() says no.
+ *
+ * @param tx transaction that receives the erase operations and the
+ *           updated manifest.
+ * @returns true if has side-effects; false otherwise.
+ */
+bool OSDMonitor::do_prune(MonitorDBStore::TransactionRef tx)
+{
+  bool enabled = is_prune_enabled();
+
+  dout(1) << __func__ << " osdmap full prune "
+          << ( enabled ? "enabled" : "disabled")
+          << dendl;
+
+  if (!enabled || !_prune_sanitize_options() || !should_prune()) {
+    return false;
+  }
+
+  // we are beyond the minimum prune versions, we need to remove maps because
+  // otherwise the store will grow unbounded and we may end up having issues
+  // with available disk space or store hangs.
+
+  // we will not pin all versions. We will leave a buffer number of versions.
+  // this allows the monitor to trim maps without caring too much about
+  // pinned maps, and then allows us to use another ceph-mon without these
+  // capabilities, without having to repair the store.
+
+  // work on a copy; the member manifest is left untouched here
+  osdmap_manifest_t manifest = osdmap_manifest;
+
+  version_t first = get_first_committed();
+  version_t last = get_last_committed();
+
+  version_t last_to_pin = last - g_conf()->mon_min_osdmap_epochs;
+  // NOTE: sampled before prune_init() pins its starting version, so the
+  // value logged below is the manifest's state prior to this pass.
+  version_t last_pinned = manifest.get_last_pinned();
+  uint64_t prune_interval =
+    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
+  uint64_t txsize =
+    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_txsize");
+
+  prune_init(manifest);
+
+  // we need to get rid of some osdmaps
+
+  dout(5) << __func__
+          << " lc (" << first << " .. " << last << ")"
+          << " last_pinned " << last_pinned
+          << " interval " << prune_interval
+          << " last_to_pin " << last_to_pin
+          << dendl;
+
+  // We will be erasing maps as we go.
+  //
+  // We will erase all maps between `last_pinned` and the `next_to_pin`.
+  //
+  // If `next_to_pin` happens to be greater than `last_to_pin`, then
+  // we stop pruning. We could prune the maps between `next_to_pin` and
+  // `last_to_pin`, but by not doing it we end up with neater pruned
+  // intervals, aligned with `prune_interval`. Besides, this should not be a
+  // problem as long as `prune_interval` is set to a sane value, instead of
+  // hundreds or thousands of maps.
+
+  auto map_exists = [this](version_t v) {
+    string k = mon.store->combine_strings("full", v);
+    return mon.store->exists(get_service_name(), k);
+  };
+
+  // 'interval' represents the number of maps from the last pinned
+  // i.e., if we pinned version 1 and have an interval of 10, we're pinning
+  // version 11 next; all intermediate versions will be removed.
+  //
+  // 'txsize' represents the maximum number of versions we'll be removing in
+  // this iteration. If 'txsize' is large enough to perform multiple passes
+  // pinning and removing maps, we will do so; if not, we'll do at least one
+  // pass. We are quite relaxed about honouring 'txsize', but we'll always
+  // ensure that we never go *over* the maximum.
+
+  // e.g., if we pin 1 and 11, we're removing versions [2..10]; i.e., 9 maps.
+  uint64_t removal_interval = prune_interval - 1;
+
+  if (txsize < removal_interval) {
+    // closing parenthesis added: the message used to end with an open "("
+    dout(5) << __func__
+            << " setting txsize to removal interval size ("
+            << removal_interval << " versions)"
+            << dendl;
+    txsize = removal_interval;
+  }
+  ceph_assert(removal_interval > 0);
+
+  uint64_t num_pruned = 0;
+  // each pass removes exactly 'removal_interval' maps; stop before a pass
+  // would exceed 'txsize'.
+  while (num_pruned + removal_interval <= txsize) {
+    last_pinned = manifest.get_last_pinned();
+
+    if (last_pinned + prune_interval > last_to_pin) {
+      break;
+    }
+    ceph_assert(last_pinned < last_to_pin);
+
+    version_t next_pinned = last_pinned + prune_interval;
+    ceph_assert(next_pinned <= last_to_pin);
+    manifest.pin(next_pinned);
+
+    dout(20) << __func__
+             << " last_pinned " << last_pinned
+             << " next_pinned " << next_pinned
+             << " num_pruned " << num_pruned
+             << " removal interval (" << (last_pinned+1)
+             << ".." << (next_pinned-1) << ")"
+             << " txsize " << txsize << dendl;
+
+    // both ends of the interval must still be present as full maps
+    ceph_assert(map_exists(last_pinned));
+    ceph_assert(map_exists(next_pinned));
+
+    for (version_t v = last_pinned+1; v < next_pinned; ++v) {
+      ceph_assert(!manifest.is_pinned(v));
+
+      dout(20) << __func__ << " pruning full osdmap e" << v << dendl;
+      string full_key = mon.store->combine_strings("full", v);
+      tx->erase(get_service_name(), full_key);
+      ++num_pruned;
+    }
+  }
+
+  ceph_assert(num_pruned > 0);
+
+  // persist the updated manifest together with the erasures
+  bufferlist bl;
+  manifest.encode(bl);
+  tx->put(get_service_name(), "osdmap_manifest", bl);
+
+  return true;
+}
+
+
+// -------------
+
+/**
+ * Dispatch an incoming request to the matching preprocess_* handler,
+ * selected by message type. Unknown types abort.
+ *
+ * @returns whatever the selected handler returns; true after replying
+ *          -EINVAL to a command that raised bad_cmd_get.
+ */
+bool OSDMonitor::preprocess_query(MonOpRequestRef op)
+{
+  op->mark_osdmon_event(__func__);
+  Message *req = op->get_req();
+  dout(10) << "preprocess_query " << *req << " from " << req->get_orig_source_inst() << dendl;
+
+  const auto type = req->get_type();
+
+  // commands are the only case that needs exception handling
+  if (type == MSG_MON_COMMAND) {
+    try {
+      return preprocess_command(op);
+    } catch (const bad_cmd_get& e) {
+      bufferlist empty;
+      mon.reply_command(op, -EINVAL, e.what(), empty, get_last_committed());
+      return true;
+    }
+  }
+
+  switch (type) {
+  // READs
+  case CEPH_MSG_MON_GET_OSDMAP:
+    return preprocess_get_osdmap(op);
+
+  // damp updates
+  case MSG_OSD_MARK_ME_DOWN:
+    return preprocess_mark_me_down(op);
+  case MSG_OSD_MARK_ME_DEAD:
+    return preprocess_mark_me_dead(op);
+  case MSG_OSD_FULL:
+    return preprocess_full(op);
+  case MSG_OSD_FAILURE:
+    return preprocess_failure(op);
+  case MSG_OSD_BOOT:
+    return preprocess_boot(op);
+  case MSG_OSD_ALIVE:
+    return preprocess_alive(op);
+  case MSG_OSD_PG_CREATED:
+    return preprocess_pg_created(op);
+  case MSG_OSD_PG_READY_TO_MERGE:
+    return preprocess_pg_ready_to_merge(op);
+  case MSG_OSD_PGTEMP:
+    return preprocess_pgtemp(op);
+  case MSG_OSD_BEACON:
+    return preprocess_beacon(op);
+
+  case CEPH_MSG_POOLOP:
+    return preprocess_pool_op(op);
+
+  case MSG_REMOVE_SNAPS:
+    return preprocess_remove_snaps(op);
+
+  case MSG_MON_GET_PURGED_SNAPS:
+    return preprocess_get_purged_snaps(op);
+
+  default:
+    ceph_abort();
+    return false;
+  }
+}
+
+/**
+ * Dispatch an incoming request to the matching prepare_* handler, selected
+ * by message type. Unknown types abort.
+ *
+ * @returns whatever the selected handler returns; false (nothing to
+ *          propose) after replying -EINVAL to a command that raised
+ *          bad_cmd_get.
+ */
+bool OSDMonitor::prepare_update(MonOpRequestRef op)
+{
+  op->mark_osdmon_event(__func__);
+  Message *req = op->get_req();
+  dout(7) << "prepare_update " << *req << " from " << req->get_orig_source_inst() << dendl;
+
+  const auto type = req->get_type();
+
+  // commands are the only case that needs exception handling
+  if (type == MSG_MON_COMMAND) {
+    try {
+      return prepare_command(op);
+    } catch (const bad_cmd_get& e) {
+      bufferlist empty;
+      mon.reply_command(op, -EINVAL, e.what(), empty, get_last_committed());
+      return false; /* nothing to propose */
+    }
+  }
+
+  switch (type) {
+  // damp updates
+  case MSG_OSD_MARK_ME_DOWN:
+    return prepare_mark_me_down(op);
+  case MSG_OSD_MARK_ME_DEAD:
+    return prepare_mark_me_dead(op);
+  case MSG_OSD_FULL:
+    return prepare_full(op);
+  case MSG_OSD_FAILURE:
+    return prepare_failure(op);
+  case MSG_OSD_BOOT:
+    return prepare_boot(op);
+  case MSG_OSD_ALIVE:
+    return prepare_alive(op);
+  case MSG_OSD_PG_CREATED:
+    return prepare_pg_created(op);
+  case MSG_OSD_PGTEMP:
+    return prepare_pgtemp(op);
+  case MSG_OSD_PG_READY_TO_MERGE:
+    return prepare_pg_ready_to_merge(op);
+  case MSG_OSD_BEACON:
+    return prepare_beacon(op);
+
+  case CEPH_MSG_POOLOP:
+    return prepare_pool_op(op);
+
+  case MSG_REMOVE_SNAPS:
+    return prepare_remove_snaps(op);
+
+  default:
+    ceph_abort();
+  }
+
+  return false;
+}
+
+/**
+ * Decide whether the pending incremental should be proposed.
+ *
+ * @param delay set to 0.0 when the weight adjustment is applied here;
+ *              otherwise left to PaxosService::should_propose().
+ * @returns true to propose.
+ */
+bool OSDMonitor::should_propose(double& delay)
+{
+  dout(10) << "should_propose" << dendl;
+
+  // if full map, propose immediately! any subsequent changes will be clobbered.
+  if (pending_inc.fullmap.length()) {
+    return true;
+  }
+
+  // adjust osd weights? only once a weight is known for every osd slot
+  const bool have_all_weights =
+    !osd_weight.empty() &&
+    osd_weight.size() == (unsigned)osdmap.get_max_osd();
+  if (!have_all_weights) {
+    return PaxosService::should_propose(delay);
+  }
+
+  dout(0) << " adjusting osd weights based on " << osd_weight << dendl;
+  osdmap.adjust_osd_weights(osd_weight, pending_inc);
+  delay = 0.0;
+  osd_weight.clear();
+  return true;
+}
+
+
+
+// ---------------------------
+// READs
+
+/**
+ * Answer an MMonGetOSDMap request with the requested full and incremental
+ * maps, clamped to the committed range and limited by
+ * 'osd_map_message_max' (map count) and 'osd_map_message_max_bytes'.
+ * Both limits are shared between the full and the incremental maps.
+ *
+ * @returns true always; the reply is sent from here.
+ */
+bool OSDMonitor::preprocess_get_osdmap(MonOpRequestRef op)
+{
+  op->mark_osdmon_event(__func__);
+  auto m = op->get_req<MMonGetOSDMap>();
+
+  // prefer the session's connection features over the quorum's
+  uint64_t features = mon.get_quorum_con_features();
+  if (op->get_session() && op->get_session()->con_features) {
+    features = op->get_session()->con_features;
+  }
+
+  dout(10) << __func__ << " " << *m << dendl;
+  MOSDMap *reply = new MOSDMap(mon.monmap->fsid, features);
+  const epoch_t first = get_first_committed();
+  const epoch_t last = osdmap.get_epoch();
+  int maps_left = g_conf()->osd_map_message_max;
+  ssize_t bytes_left = g_conf()->osd_map_message_max_bytes;
+
+  // full maps
+  epoch_t e = std::max(first, m->get_full_first());
+  while (e <= std::min(last, m->get_full_last()) &&
+         maps_left > 0 && bytes_left > 0) {
+    bufferlist& bl = reply->maps[e];
+    int r = get_version_full(e, features, bl);
+    ceph_assert(r >= 0);
+    bytes_left -= bl.length();
+    ++e;
+    --maps_left;
+  }
+
+  // incremental maps
+  e = std::max(first, m->get_inc_first());
+  while (e <= std::min(last, m->get_inc_last()) &&
+         maps_left > 0 && bytes_left > 0) {
+    bufferlist& bl = reply->incremental_maps[e];
+    int r = get_version(e, features, bl);
+    ceph_assert(r >= 0);
+    bytes_left -= bl.length();
+    ++e;
+    --maps_left;
+  }
+
+  reply->cluster_osdmap_trim_lower_bound = first;
+  reply->newest_map = last;
+  mon.send_reply(op, reply);
+  return true;
+}
+
+
+// ---------------------------
+// UPDATEs
+
+// failure --
+
+/**
+ * Check whether a request must be rejected based on its origin.
+ *
+ * Shared by the failure, mark-me-down and mark-me-dead preprocessors.
+ *
+ * @param op   the request being checked.
+ * @param fsid cluster fsid carried by the message.
+ * @returns true if the request must be dropped (no session, missing
+ *          "osd" execute capability, or fsid mismatch); false if the
+ *          source is acceptable. Note the inverted sense: true means
+ *          "reject".
+ */
+bool OSDMonitor::check_source(MonOpRequestRef op, uuid_d fsid) {
+  // check permissions
+  MonSession *session = op->get_session();
+  if (!session)
+    return true;
+  if (!session->is_capable("osd", MON_CAP_X)) {
+    // log the actual request: this helper is not specific to MOSDFailure
+    dout(0) << "got " << *op->get_req()
+            << " from entity with insufficient caps "
+            << session->caps << dendl;
+    return true;
+  }
+  if (fsid != mon.monmap->fsid) {
+    dout(0) << "check_source: on fsid " << fsid
+            << " != " << mon.monmap->fsid << dendl;
+    return true;
+  }
+  return false;
+}
+
+
+/**
+ * Filter an MOSDFailure report before it reaches prepare_failure().
+ *
+ * @returns true if the report was handled here (dropped without a reply);
+ *          false if it is a new, valid report that must be prepared.
+ */
+bool OSDMonitor::preprocess_failure(MonOpRequestRef op)
+{
+  op->mark_osdmon_event(__func__);
+  auto m = op->get_req<MOSDFailure>();
+  // who is target_osd
+  const int target = m->get_target_osd();
+
+  // common exit for every rejected report
+  auto drop = [&]() {
+    mon.no_reply(op);
+    return true;
+  };
+  // send newer maps to a reporter whose epoch is behind ours
+  auto resend_if_behind = [&]() {
+    if (m->get_epoch() < osdmap.get_epoch())
+      send_incremental(op, m->get_epoch()+1);
+  };
+
+  // check permissions
+  if (check_source(op, m->fsid)) {
+    return drop();
+  }
+
+  // first, verify the reporting host is valid
+  if (m->get_orig_source().is_osd()) {
+    const int from = m->get_orig_source().num();
+    if (!osdmap.exists(from) ||
+        !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()) ||
+        (osdmap.is_down(from) && m->if_osd_failed())) {
+      dout(5) << "preprocess_failure from dead osd." << from
+              << ", ignoring" << dendl;
+      // unconditional here, unlike the epoch-guarded resends below
+      send_incremental(op, m->get_epoch()+1);
+      return drop();
+    }
+  }
+
+  // weird?
+  if (osdmap.is_down(target)) {
+    dout(5) << "preprocess_failure dne(/dup?): osd." << m->get_target_osd()
+            << " " << m->get_target_addrs()
+            << ", from " << m->get_orig_source() << dendl;
+    resend_if_behind();
+    return drop();
+  }
+  if (osdmap.get_addrs(target) != m->get_target_addrs()) {
+    dout(5) << "preprocess_failure wrong osd: report osd." << m->get_target_osd()
+            << " " << m->get_target_addrs()
+            << " != map's " << osdmap.get_addrs(target)
+            << ", from " << m->get_orig_source() << dendl;
+    resend_if_behind();
+    return drop();
+  }
+
+  // already reported?
+  if (osdmap.is_down(target) ||
+      osdmap.get_up_from(target) > m->get_epoch()) {
+    dout(5) << "preprocess_failure dup/old: osd." << m->get_target_osd()
+            << " " << m->get_target_addrs()
+            << ", from " << m->get_orig_source() << dendl;
+    resend_if_behind();
+    return drop();
+  }
+
+  if (!can_mark_down(target)) {
+    dout(5) << "preprocess_failure ignoring report of osd."
+            << m->get_target_osd() << " " << m->get_target_addrs()
+            << " from " << m->get_orig_source() << dendl;
+    return drop();
+  }
+
+  dout(10) << "preprocess_failure new: osd." << m->get_target_osd()
+           << " " << m->get_target_addrs()
+           << ", from " << m->get_orig_source() << dendl;
+  return false;
+}
+
+/**
+ * Completion that acknowledges an MOSDMarkMeDown request.
+ *
+ * On r == 0 it replies with an MOSDMarkMeDown echoing the request's
+ * fields; on -EAGAIN it re-dispatches the op; any other result aborts.
+ */
+class C_AckMarkedDown : public C_MonOp {
+  OSDMonitor *osdmon;
+public:
+  C_AckMarkedDown(
+    OSDMonitor *osdmon,
+    MonOpRequestRef op)
+    : C_MonOp(op), osdmon(osdmon) {}
+
+  void _finish(int r) override {
+    switch (r) {
+    case 0: {
+      auto m = op->get_req<MOSDMarkMeDown>();
+      auto *ack = new MOSDMarkMeDown(
+        m->fsid,
+        m->target_osd,
+        m->target_addrs,
+        m->get_epoch(),
+        false); // ACK itself does not request an ack
+      osdmon->mon.send_reply(op, ack);
+      break;
+    }
+    case -EAGAIN:
+      osdmon->dispatch(op);
+      break;
+    default:
+      ceph_abort_msgf("C_AckMarkedDown: unknown result %d", r);
+    }
+  }
+  ~C_AckMarkedDown() override {
+  }
+};
+
+/**
+ * Filter an MOSDMarkMeDown request before it reaches
+ * prepare_mark_me_down().
+ *
+ * @returns true if the request was fully handled here (acknowledged
+ *          immediately when an ack was requested); false if the osd must
+ *          actually be marked down.
+ */
+bool OSDMonitor::preprocess_mark_me_down(MonOpRequestRef op)
+{
+  op->mark_osdmon_event(__func__);
+  auto m = op->get_req<MOSDMarkMeDown>();
+  const int from = m->target_osd;
+
+  // nothing to change: ack right away if the sender asked for it
+  auto finish_here = [&]() {
+    if (m->request_ack) {
+      Context *c(new C_AckMarkedDown(this, op));
+      c->complete(0);
+    }
+    return true;
+  };
+
+  // check permissions
+  if (check_source(op, m->fsid)) {
+    return finish_here();
+  }
+
+  // first, verify the reporting host is valid
+  if (!m->get_orig_source().is_osd()) {
+    return finish_here();
+  }
+
+  const bool stale_sender =
+    !osdmap.exists(from) ||
+    osdmap.is_down(from) ||
+    osdmap.get_addrs(from) != m->target_addrs;
+  if (stale_sender) {
+    dout(5) << "preprocess_mark_me_down from dead osd."
+            << from << ", ignoring" << dendl;
+    send_incremental(op, m->get_epoch()+1);
+    return finish_here();
+  }
+
+  // no down might be set
+  if (!can_mark_down(from)) {
+    return finish_here();
+  }
+
+  dout(10) << "MOSDMarkMeDown for: " << m->get_orig_source()
+           << " " << m->target_addrs << dendl;
+  return false;
+}
+
+/**
+ * Queue the state change for an osd that asked to be marked down, and
+ * optionally record its dead epoch.
+ *
+ * @returns true always (there is a pending change).
+ */
+bool OSDMonitor::prepare_mark_me_down(MonOpRequestRef op)
+{
+  op->mark_osdmon_event(__func__);
+  auto m = op->get_req<MOSDMarkMeDown>();
+  const int who = m->target_osd;
+
+  // preprocess_mark_me_down() filters out everything else
+  ceph_assert(osdmap.is_up(who));
+  ceph_assert(osdmap.get_addrs(who) == m->target_addrs);
+
+  const char *how = (m->down_and_dead) ? "down and dead" : "down";
+  mon.clog->info() << "osd." << who << " marked itself " << how;
+
+  pending_inc.new_state[who] = CEPH_OSD_UP;
+  if (m->down_and_dead) {
+    auto& pending_xinfo = pending_inc.new_xinfo;
+    // start from the committed xinfo unless one is already pending
+    if (pending_xinfo.count(who) == 0) {
+      pending_xinfo[who] = osdmap.osd_xinfo[who];
+    }
+    pending_xinfo[who].dead_epoch = m->get_epoch();
+  }
+  if (m->request_ack) {
+    wait_for_finished_proposal(op, new C_AckMarkedDown(this, op));
+  }
+  return true;
+}
+
+/**
+ * Filter an MOSDMarkMeDead request before it reaches
+ * prepare_mark_me_dead().
+ *
+ * @returns true if the request was dropped here (without a reply); false
+ *          if it comes from an existing, down osd and must be prepared.
+ */
+bool OSDMonitor::preprocess_mark_me_dead(MonOpRequestRef op)
+{
+  op->mark_osdmon_event(__func__);
+  auto m = op->get_req<MOSDMarkMeDead>();
+  const int from = m->target_osd;
+
+  // common exit for every rejected request
+  auto drop = [&]() {
+    mon.no_reply(op);
+    return true;
+  };
+
+  // check permissions
+  if (check_source(op, m->fsid)) {
+    return drop();
+  }
+
+  // first, verify the reporting host is valid
+  if (!m->get_orig_source().is_osd()) {
+    return drop();
+  }
+
+  const bool known_and_down = osdmap.exists(from) && osdmap.is_down(from);
+  if (!known_and_down) {
+    dout(5) << __func__ << " from nonexistent or up osd." << from
+            << ", ignoring" << dendl;
+    send_incremental(op, m->get_epoch()+1);
+    return drop();
+  }
+
+  return false;
+}
+
+/**
+ * Record the dead epoch reported by an osd that is already down.
+ *
+ * @returns true always (there is a pending change).
+ */
+bool OSDMonitor::prepare_mark_me_dead(MonOpRequestRef op)
+{
+  op->mark_osdmon_event(__func__);
+  auto m = op->get_req<MOSDMarkMeDead>();
+  const int who = m->target_osd;
+
+  ceph_assert(osdmap.is_down(who));
+
+  mon.clog->info() << "osd." << who << " marked itself dead as of e"
+                   << m->get_epoch();
+
+  auto& pending_xinfo = pending_inc.new_xinfo;
+  // start from the committed xinfo unless one is already pending
+  if (pending_xinfo.count(who) == 0) {
+    pending_xinfo[who] = osdmap.osd_xinfo[who];
+  }
+  pending_xinfo[who].dead_epoch = m->get_epoch();
+
+  auto *on_commit = new LambdaContext(
+    [op, this] (int r) {
+      if (r >= 0) {
+        mon.no_reply(op); // ignore on success
+      }
+    });
+  wait_for_commit(op, on_commit);
+  return true;
+}
+
+/**
+ * Check whether osd @p i may be marked down.
+ *
+ * Refuses when the osd is flagged nodown, when the map has no osds, or
+ * when the up ratio (net of pending markdowns) is below
+ * 'mon_osd_min_up_ratio'.
+ */
+bool OSDMonitor::can_mark_down(int i)
+{
+  if (osdmap.is_nodown(i)) {
+    dout(5) << __func__ << " osd." << i << " is marked as nodown, "
+            << "will not mark it down" << dendl;
+    return false;
+  }
+
+  const int total = osdmap.get_num_osds();
+  if (total == 0) {
+    dout(5) << __func__ << " no osds" << dendl;
+    return false;
+  }
+
+  // account for markdowns that are pending but not yet committed
+  const int still_up =
+    osdmap.get_num_up_osds() - pending_inc.get_net_marked_down(&osdmap);
+  const float up_ratio = (float)still_up / (float)total;
+  if (up_ratio < g_conf()->mon_osd_min_up_ratio) {
+    dout(2) << __func__ << " current up_ratio " << up_ratio << " < min "
+            << g_conf()->mon_osd_min_up_ratio
+            << ", will not mark osd." << i << " down" << dendl;
+    return false;
+  }
+  return true;
+}
+
+/// Check whether osd @p i may be marked up; refuses if it is flagged noup.
+bool OSDMonitor::can_mark_up(int i)
+{
+  const bool blocked = osdmap.is_noup(i);
+  if (!blocked) {
+    return true;
+  }
+
+  dout(5) << __func__ << " osd." << i << " is marked as noup, "
+          << "will not mark it up" << dendl;
+  return false;
+}
+
+/**
+ * Check whether osd @p i may be marked out.
+ *
+ * Refuses when the osd is flagged noout, when the map has no osds, or
+ * when the in ratio (net of pending markouts) is below
+ * 'mon_osd_min_in_ratio'.
+ *
+ * @note besides the noout check, @p i only affects the log output: a
+ * negative value selects the generic "osds" wording in the ratio message.
+ */
+bool OSDMonitor::can_mark_out(int i)
+{
+  if (osdmap.is_noout(i)) {
+    dout(5) << __func__ << " osd." << i << " is marked as noout, "
+            << "will not mark it out" << dendl;
+    return false;
+  }
+
+  const int total = osdmap.get_num_osds();
+  if (total == 0) {
+    dout(5) << __func__ << " no osds" << dendl;
+    return false;
+  }
+
+  // account for markouts that are pending but not yet committed
+  const int still_in =
+    osdmap.get_num_in_osds() - pending_inc.get_net_marked_out(&osdmap);
+  const float in_ratio = (float)still_in / (float)total;
+  if (!(in_ratio < g_conf()->mon_osd_min_in_ratio)) {
+    return true;
+  }
+
+  if (i >= 0) {
+    dout(5) << __func__ << " current in_ratio " << in_ratio << " < min "
+            << g_conf()->mon_osd_min_in_ratio
+            << ", will not mark osd." << i << " out" << dendl;
+  } else {
+    dout(5) << __func__ << " current in_ratio " << in_ratio << " < min "
+            << g_conf()->mon_osd_min_in_ratio
+            << ", will not mark osds out" << dendl;
+  }
+  return false;
+}
+
+/// Check whether osd @p i may be marked in; refuses if it is flagged noin.
+bool OSDMonitor::can_mark_in(int i)
+{
+  const bool blocked = osdmap.is_noin(i);
+  if (!blocked) {
+    return true;
+  }
+
+  dout(5) << __func__ << " osd." << i << " is marked as noin, "
+          << "will not mark it in" << dendl;
+  return false;
+}
+
+/**
+ * Re-evaluate every tracked failure report set.
+ *
+ * Entries that neither lead to a markdown nor are still fresh are dropped.
+ *
+ * @param now current time used for the grace and staleness checks.
+ * @returns true if at least one osd has (or already had) a pending
+ *          failure according to check_failure().
+ */
+bool OSDMonitor::check_failures(utime_t now)
+{
+  bool any_failure = false;
+  for (auto it = failure_info.begin(); it != failure_info.end(); ) {
+    auto& [target_osd, fi] = *it;
+
+    if (can_mark_down(target_osd) &&
+        check_failure(now, target_osd, fi)) {
+      any_failure = true;
+      ++it;
+      continue;
+    }
+
+    if (!is_failure_stale(now, fi)) {
+      ++it;
+      continue;
+    }
+
+    dout(10) << " dropping stale failure_info for osd." << target_osd
+             << " from " << fi.reporters.size() << " reporters"
+             << dendl;
+    it = failure_info.erase(it);
+  }
+  return any_failure;
+}
+
+/**
+ * Compute the grace period an osd gets before failure reports take effect.
+ *
+ * Starts from 'osd_heartbeat_grace'. When
+ * 'mon_osd_adjust_heartbeat_grace' is set, adds a term derived from the
+ * target's own laggy history and the average of the same term over its
+ * reporters, each decayed exponentially with 'mon_osd_laggy_halflife'.
+ *
+ * @param now        current time.
+ * @param target_osd osd the reports are about.
+ * @param fi         failure reports for @p target_osd.
+ * @returns the (possibly extended) grace period.
+ */
+utime_t OSDMonitor::get_grace_time(utime_t now,
+                                   int target_osd,
+                                   failure_info_t& fi) const
+{
+  utime_t orig_grace(g_conf()->osd_heartbeat_grace, 0);
+  if (!g_conf()->mon_osd_adjust_heartbeat_grace) {
+    return orig_grace;
+  }
+  utime_t grace = orig_grace;
+  double halflife = (double)g_conf()->mon_osd_laggy_halflife;
+  double decay_k = ::log(.5) / halflife;
+
+  // scale grace period based on historical probability of 'lagginess'
+  // (false positive failures due to slowness).
+  const osd_xinfo_t& xi = osdmap.get_xinfo(target_osd);
+  const utime_t failed_for = now - fi.get_failed_since();
+  double decay = exp((double)failed_for * decay_k);
+  dout(20) << " halflife " << halflife << " decay_k " << decay_k
+           << " failed_for " << failed_for << " decay " << decay << dendl;
+  double my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability;
+  grace += my_grace;
+
+  // consider the peers reporting a failure a proxy for a potential
+  // 'subcluster' over the overall cluster that is similarly
+  // laggy. this is clearly not true in all cases, but will sometimes
+  // help us localize the grace correction to a subset of the system
+  // (say, a rack with a bad switch) that is unhappy.
+  double peer_grace = 0;
+  for (auto& [reporter, report] : fi.reporters) {
+    if (osdmap.exists(reporter)) {
+      const osd_xinfo_t& peer_xi = osdmap.get_xinfo(reporter);
+      utime_t elapsed = now - peer_xi.down_stamp;
+      double peer_decay = exp((double)elapsed * decay_k);
+      peer_grace +=
+        peer_decay * (double)peer_xi.laggy_interval * peer_xi.laggy_probability;
+    }
+  }
+  // average over the reporters; with no reporters left the sum is 0 and
+  // dividing would produce NaN (0/0), so leave it at 0 in that case.
+  if (!fi.reporters.empty()) {
+    peer_grace /= (double)fi.reporters.size();
+  }
+  grace += peer_grace;
+  dout(10) << " osd." << target_osd << " has "
+           << fi.reporters.size() << " reporters, "
+           << grace << " grace (" << orig_grace << " + " << my_grace
+           << " + " << peer_grace << "), max_failed_since " << fi.get_failed_since()
+           << dendl;
+
+  return grace;
+}
+
+/**
+ * Decide whether the reports in @p fi are enough to mark @p target_osd
+ * down, and queue the markdown if so.
+ *
+ * Reporters that no longer exist in the osdmap are removed from @p fi.
+ * The remaining ones are grouped by their CRUSH ancestor of type
+ * 'mon_osd_reporter_subtree_level' (falling back to the osd itself); at
+ * least 'mon_osd_min_down_reporters' distinct groups are required, and
+ * the osd must have been failed for at least get_grace_time().
+ *
+ * @param now        current time.
+ * @param target_osd osd the reports are about.
+ * @param fi         failure reports for @p target_osd; may be modified.
+ * @returns true if a markdown is pending for @p target_osd (already, or
+ *          as a result of this call); false otherwise.
+ */
+bool OSDMonitor::check_failure(utime_t now, int target_osd, failure_info_t& fi)
+{
+  // already pending failure?
+  if (pending_inc.new_state.count(target_osd) &&
+      pending_inc.new_state[target_osd] & CEPH_OSD_UP) {
+    dout(10) << " already pending failure" << dendl;
+    return true;
+  }
+
+  set<string> reporters_by_subtree;
+  auto reporter_subtree_level = g_conf().get_val<string>("mon_osd_reporter_subtree_level");
+  ceph_assert(fi.reporters.size());
+
+  // Reporters to drop are collected first and removed after the walk.
+  // cancel_report() is defined outside this function and appears to remove
+  // the entry from fi.reporters itself (prepare_failure() tests
+  // fi.reporters.empty() right after calling it), so it must not be called
+  // while an iterator to that entry is still in use.
+  set<int> vanished;
+  for (const auto& entry : fi.reporters) {
+    const int reporter = entry.first;
+    if (!osdmap.exists(reporter)) {
+      vanished.insert(reporter);
+      continue;
+    }
+    // get the parent bucket whose type matches with "reporter_subtree_level".
+    // fall back to OSD if the level doesn't exist.
+    auto reporter_loc = osdmap.crush->get_full_location(reporter);
+    if (auto iter = reporter_loc.find(reporter_subtree_level);
+        iter == reporter_loc.end()) {
+      reporters_by_subtree.insert("osd." + to_string(reporter));
+    } else {
+      reporters_by_subtree.insert(iter->second);
+    }
+  }
+  for (const int reporter : vanished) {
+    fi.cancel_report(reporter);
+    // no-op if cancel_report() already removed the entry
+    fi.reporters.erase(reporter);
+  }
+
+  if (reporters_by_subtree.size() < g_conf().get_val<uint64_t>("mon_osd_min_down_reporters")) {
+    return false;
+  }
+  const utime_t failed_for = now - fi.get_failed_since();
+  const utime_t grace = get_grace_time(now, target_osd, fi);
+  if (failed_for >= grace) {
+    dout(1) << " we have enough reporters to mark osd." << target_osd
+            << " down" << dendl;
+    pending_inc.new_state[target_osd] = CEPH_OSD_UP;
+
+    mon.clog->info() << "osd." << target_osd << " failed ("
+                     << osdmap.crush->get_full_location_ordered_string(
+                          target_osd)
+                     << ") ("
+                     << (int)reporters_by_subtree.size()
+                     << " reporters from different "
+                     << reporter_subtree_level << " after "
+                     << failed_for << " >= grace " << grace << ")";
+    return true;
+  }
+  return false;
+}
+
+/**
+ * Tell whether a set of failure reports is too old to keep.
+ *
+ * If it takes too long to either cancel the report or mark the osd down,
+ * some reporters must have failed to cancel their reports; such reports
+ * are considered stale once they are older than
+ * 'osd_heartbeat_grace' + 'osd_heartbeat_stale'.
+ */
+bool OSDMonitor::is_failure_stale(utime_t now, failure_info_t& fi) const
+{
+  const auto heartbeat_grace = cct->_conf.get_val<int64_t>("osd_heartbeat_grace");
+  const auto heartbeat_stale = cct->_conf.get_val<int64_t>("osd_heartbeat_stale");
+  const utime_t failed_for = now - fi.get_failed_since();
+  return failed_for >= (heartbeat_grace + heartbeat_stale);
+}
+
+/**
+ * Queue an immediate markdown of @p target_osd, bypassing reporter counts
+ * and grace, and record the pending epoch as its dead epoch.
+ *
+ * @param target_osd osd to mark down.
+ * @param by         reporting osd, used in the cluster log message only.
+ */
+void OSDMonitor::force_failure(int target_osd, int by)
+{
+  // already pending failure?
+  const bool already_pending =
+    pending_inc.new_state.count(target_osd) &&
+    pending_inc.new_state[target_osd] & CEPH_OSD_UP;
+  if (already_pending) {
+    dout(10) << " already pending failure" << dendl;
+    return;
+  }
+
+  dout(1) << " we're forcing failure of osd." << target_osd << dendl;
+  pending_inc.new_state[target_osd] = CEPH_OSD_UP;
+
+  auto& pending_xinfo = pending_inc.new_xinfo;
+  // start from the committed xinfo unless one is already pending
+  if (pending_xinfo.count(target_osd) == 0) {
+    pending_xinfo[target_osd] = osdmap.osd_xinfo[target_osd];
+  }
+  pending_xinfo[target_osd].dead_epoch = pending_inc.epoch;
+
+  mon.clog->info() << "osd." << target_osd << " failed ("
+                   << osdmap.crush->get_full_location_ordered_string(target_osd)
+                   << ") (connection refused reported by osd." << by << ")";
+}
+
+/**
+ * Apply an MOSDFailure message: either record a new failure report (or
+ * force the failure when the report is immediate), or cancel an earlier
+ * report from the same reporter.
+ *
+ * @returns true if there is something to propose; false otherwise.
+ */
+bool OSDMonitor::prepare_failure(MonOpRequestRef op)
+{
+  op->mark_osdmon_event(__func__);
+  auto m = op->get_req<MOSDFailure>();
+  dout(1) << "prepare_failure osd." << m->get_target_osd()
+          << " " << m->get_target_addrs()
+          << " from " << m->get_orig_source()
+          << " is reporting failure:" << m->if_osd_failed() << dendl;
+
+  const int target_osd = m->get_target_osd();
+  const int reporter = m->get_orig_source().num();
+  // preprocess_failure() filters out everything else
+  ceph_assert(osdmap.is_up(target_osd));
+  ceph_assert(osdmap.get_addrs(target_osd) == m->get_target_addrs());
+
+  mon.no_reply(op);
+
+  if (!m->if_osd_failed()) {
+    // remove the report
+    mon.clog->debug() << "osd." << m->get_target_osd()
+                      << " failure report canceled by "
+                      << m->get_orig_source();
+    auto found = failure_info.find(target_osd);
+    if (found == failure_info.end()) {
+      dout(10) << " no failure_info for osd." << target_osd << dendl;
+      return false;
+    }
+    failure_info_t& fi = found->second;
+    fi.cancel_report(reporter);
+    if (fi.reporters.empty()) {
+      dout(10) << " removing last failure_info for osd." << target_osd
+               << dendl;
+      failure_info.erase(found);
+    } else {
+      dout(10) << " failure_info for osd." << target_osd << " now "
+               << fi.reporters.size() << " reporters" << dendl;
+    }
+    return false;
+  }
+
+  // calculate failure time
+  utime_t now = ceph_clock_now();
+  utime_t failed_since =
+    m->get_recv_stamp() - utime_t(m->failed_for, 0);
+
+  // add a report
+  if (m->is_immediate()) {
+    mon.clog->debug() << "osd." << m->get_target_osd()
+                      << " reported immediately failed by "
+                      << m->get_orig_source();
+    force_failure(target_osd, reporter);
+    return true;
+  }
+  mon.clog->debug() << "osd." << m->get_target_osd() << " reported failed by "
+                    << m->get_orig_source();
+
+  failure_info_t& fi = failure_info[target_osd];
+  fi.add_report(reporter, failed_since, op);
+  return check_failure(now, target_osd, fi);
+}
+
+/**
+ * Drop the failure records of every osd that is no longer up and send the
+ * latest maps to the ops that reported it.
+ */
+void OSDMonitor::process_failures()
+{
+  for (auto it = failure_info.begin(); it != failure_info.end(); ) {
+    if (osdmap.is_up(it->first)) {
+      ++it;
+      continue;
+    }
+
+    dout(10) << "process_failures osd." << it->first << dendl;
+    list<MonOpRequestRef> pending_ops;
+    it->second.take_report_messages(pending_ops);
+    it = failure_info.erase(it);
+
+    // consume the ops one by one, releasing each as soon as it is handled
+    for (; !pending_ops.empty(); pending_ops.pop_front()) {
+      MonOpRequestRef o = pending_ops.front();
+      if (!o) {
+        continue;
+      }
+      o->mark_event(__func__);
+      MOSDFailure *m = o->get_req<MOSDFailure>();
+      send_latest(o, m->get_epoch());
+      mon.no_reply(o);
+    }
+  }
+}
+
+/**
+ * Hand every op held by the failure records to the caller and forget all
+ * failure records.
+ *
+ * @param ls list that receives the ops.
+ */
+void OSDMonitor::take_all_failures(list<MonOpRequestRef>& ls)
+{
+  dout(10) << __func__ << " on " << failure_info.size() << " osds" << dendl;
+
+  for (auto& entry : failure_info) {
+    entry.second.take_report_messages(ls);
+  }
+  failure_info.clear();
+}
+
+/**
+ * @returns 'mon_osd_laggy_halflife' scaled by a fixed factor of 48; used
+ *          as the threshold in grace_interval_threshold_exceeded().
+ */
+int OSDMonitor::get_grace_interval_threshold()
+{
+  // Scale the halflife period (default: 1_hr) by
+  // a factor (48) to calculate the threshold.
+  const int grace_threshold_factor = 48;
+  const int halflife = g_conf()->mon_osd_laggy_halflife;
+  return halflife * grace_threshold_factor;
+}
+
+/**
+ * @param last_failed_interval interval to test, in the same unit as
+ *        get_grace_interval_threshold().
+ * @returns true (and logs) if the interval is strictly greater than the
+ *          threshold.
+ */
+bool OSDMonitor::grace_interval_threshold_exceeded(int last_failed_interval)
+{
+  const int grace_interval_threshold_secs = get_grace_interval_threshold();
+  if (last_failed_interval <= grace_interval_threshold_secs) {
+    return false;
+  }
+
+  dout(1) << " last_failed_interval " << last_failed_interval
+          << " > grace_interval_threshold_secs " << grace_interval_threshold_secs
+          << dendl;
+  return true;
+}
+
+/**
+ * Reset the laggy statistics of @p target_osd in the pending incremental:
+ * zero probability and interval, and a down stamp equal to the pending
+ * modification time.
+ */
+void OSDMonitor::set_default_laggy_params(int target_osd)
+{
+  auto& pending_xinfo = pending_inc.new_xinfo;
+  // start from the committed xinfo unless one is already pending
+  if (!pending_xinfo.count(target_osd)) {
+    pending_xinfo[target_osd] = osdmap.osd_xinfo[target_osd];
+  }
+
+  osd_xinfo_t& xi = pending_xinfo[target_osd];
+  xi.laggy_interval = 0;
+  xi.laggy_probability = 0.0;
+  xi.down_stamp = pending_inc.modified;
+  dout(20) << __func__ << " reset laggy, now xi " << xi << dendl;
+}
+
+
+// boot --
+
+/**
+ * Filter an MOSDBoot message before it reaches prepare_boot().
+ *
+ * Requests that fail validation are ignored (no response is sent from
+ * here). Duplicate boots are acknowledged via _booted(); boots that are
+ * outdated or blocked by noup get the latest maps instead.
+ *
+ * @returns true if the message was fully handled here; false if the osd
+ *          must be marked up by prepare_boot().
+ */
+bool OSDMonitor::preprocess_boot(MonOpRequestRef op)
+{
+  op->mark_osdmon_event(__func__);
+  auto m = op->get_req<MOSDBoot>();
+  const int from = m->get_orig_source_inst().name.num();
+
+  // check permissions, ignore if failed (no response expected)
+  MonSession *session = op->get_session();
+  if (!session) {
+    return true;
+  }
+  if (!session->is_capable("osd", MON_CAP_X)) {
+    dout(0) << "got preprocess_boot message from entity with insufficient caps"
+            << session->caps << dendl;
+    return true;
+  }
+
+  if (m->sb.cluster_fsid != mon.monmap->fsid) {
+    dout(0) << "preprocess_boot on fsid " << m->sb.cluster_fsid
+            << " != " << mon.monmap->fsid << dendl;
+    return true;
+  }
+
+  if (m->get_orig_source_inst().addr.is_blank_ip()) {
+    dout(0) << "preprocess_boot got blank addr for " << m->get_orig_source_inst() << dendl;
+    return true;
+  }
+
+  ceph_assert(m->get_orig_source_inst().name.is_osd());
+
+  // lower bound of N-2
+  if (!HAVE_FEATURE(m->osd_features, SERVER_QUINCY)) {
+    mon.clog->info() << "disallowing boot of OSD "
+                     << m->get_orig_source_inst()
+                     << " because the osd lacks CEPH_FEATURE_SERVER_QUINCY";
+    return true;
+  }
+
+  // make sure osd versions do not span more than 3 releases
+  if (HAVE_FEATURE(m->osd_features, SERVER_REEF) &&
+      osdmap.require_osd_release < ceph_release_t::pacific) {
+    mon.clog->info() << "disallowing boot of reef+ OSD "
+                     << m->get_orig_source_inst()
+                     << " because require_osd_release < pacific";
+    return true;
+  }
+  if (HAVE_FEATURE(m->osd_features, SERVER_SQUID) &&
+      osdmap.require_osd_release < ceph_release_t::quincy) {
+    mon.clog->info() << "disallowing boot of squid+ OSD "
+                     << m->get_orig_source_inst()
+                     << " because require_osd_release < quincy";
+    return true;
+  }
+
+  // See crimson/osd/osd.cc: OSD::_send_boot
+  if (auto type_iter = m->metadata.find("osd_type");
+      type_iter != m->metadata.end()) {
+    const auto &otype = type_iter->second;
+    // m->metadata["osd_type"] must be "crimson", classic doesn't send osd_type
+    if (otype != "crimson") {
+      derr << __func__ << ": osd " << m->get_orig_source_inst()
+           << " sent non-crimson osd_type field in MOSDBoot: "
+           << otype
+           << " -- booting anyway"
+           << dendl;
+    } else if (!osdmap.get_allow_crimson()) {
+      mon.clog->info()
+        << "Disallowing boot of crimson-osd without allow_crimson "
+        << "OSDMap flag. Run ceph osd set-allow-crimson to set "
+        << "allow_crimson flag. Note that crimson-osd is "
+        << "considered unstable and may result in crashes or "
+        << "data loss. Its usage should be restricted to "
+        << "testing and development.";
+      return true;
+    }
+  }
+
+  if (osdmap.stretch_mode_enabled &&
+      !(m->osd_features & CEPH_FEATUREMASK_STRETCH_MODE)) {
+    mon.clog->info() << "disallowing boot of OSD "
+                     << m->get_orig_source_inst()
+                     << " because stretch mode is on and OSD lacks support";
+    return true;
+  }
+
+  // already booted?
+  if (osdmap.is_up(from) &&
+      osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()) &&
+      osdmap.get_cluster_addrs(from).legacy_equals(m->cluster_addrs)) {
+    // yup.
+    dout(7) << "preprocess_boot dup from " << m->get_orig_source()
+            << " " << m->get_orig_source_addrs()
+            << " =~ " << osdmap.get_addrs(from) << dendl;
+    _booted(op, false);
+    return true;
+  }
+
+  // same id, but a different osd instance than the one we know
+  if (osdmap.exists(from) &&
+      !osdmap.get_uuid(from).is_zero() &&
+      osdmap.get_uuid(from) != m->sb.osd_fsid) {
+    dout(7) << __func__ << " from " << m->get_orig_source_inst()
+            << " clashes with existing osd: different fsid"
+            << " (ours: " << osdmap.get_uuid(from)
+            << " ; theirs: " << m->sb.osd_fsid << ")" << dendl;
+    return true;
+  }
+
+  // boot message predates the osd's last recorded up_from
+  if (osdmap.exists(from) &&
+      osdmap.get_info(from).up_from > m->version &&
+      osdmap.get_most_recent_addrs(from).legacy_equals(
+        m->get_orig_source_addrs())) {
+    dout(7) << "prepare_boot msg from before last up_from, ignoring" << dendl;
+    send_latest(op, m->sb.current_epoch+1);
+    return true;
+  }
+
+  // noup?
+  if (!can_mark_up(from)) {
+    dout(7) << "preprocess_boot ignoring boot from " << m->get_orig_source_inst() << dendl;
+    send_latest(op, m->sb.current_epoch+1);
+    return true;
+  }
+
+  dout(10) << "preprocess_boot from " << m->get_orig_source_inst() << dendl;
+  return false;
+}
+
+// Stage the pending-map changes for an MOSDBoot message.
+//
+// Three cases are handled, based on the committed osdmap and pending_inc:
+//  - the OSD is still marked up in the committed map: queue a state change
+//    for the old instance and retry this message once the proposal finishes;
+//  - an up-mark for this OSD is already pending: just wait and retry;
+//  - otherwise: record the new addresses, uuid, metadata, clean interval,
+//    laggy statistics, features and (optionally) the in-weight, then answer
+//    through C_Booted once the proposal finishes.
+//
+// Returns false only when the OSD id is >= max_osd (nothing was queued);
+// returns true in every other case.
+bool OSDMonitor::prepare_boot(MonOpRequestRef op)
+{
+  op->mark_osdmon_event(__func__);
+  auto m = op->get_req<MOSDBoot>();
+  dout(7) << __func__ << " from " << m->get_source()
+          << " sb " << m->sb
+          << " client_addrs " << m->get_connection()->get_peer_addrs()
+          << " cluster_addrs " << m->cluster_addrs
+          << " hb_back_addrs " << m->hb_back_addrs
+          << " hb_front_addrs " << m->hb_front_addrs
+          << dendl;
+
+  ceph_assert(m->get_orig_source().is_osd());
+  int from = m->get_orig_source().num();
+
+  // does this osd exist?
+  if (from >= osdmap.get_max_osd()) {
+    dout(1) << "boot from osd." << from << " >= max_osd "
+            << osdmap.get_max_osd() << dendl;
+    return false;
+  }
+
+  // Effective old state: the committed state with any pending state change
+  // for this OSD folded in (pending entries are combined with XOR here).
+  int oldstate = osdmap.exists(from) ? osdmap.get_state(from) : CEPH_OSD_NEW;
+  if (pending_inc.new_state.count(from))
+    oldstate ^= pending_inc.new_state[from];
+
+  // already up? mark down first?
+  if (osdmap.is_up(from)) {
+    dout(7) << __func__ << " was up, first marking down osd." << from << " "
+            << osdmap.get_addrs(from) << dendl;
+    // preprocess should have caught these; if not, assert.
+    ceph_assert(!osdmap.get_addrs(from).legacy_equals(
+                  m->get_orig_source_addrs()) ||
+                !osdmap.get_cluster_addrs(from).legacy_equals(m->cluster_addrs));
+    ceph_assert(osdmap.get_uuid(from) == m->sb.osd_fsid);
+
+    // Only queue the UP state change if one is not already pending.
+    if (pending_inc.new_state.count(from) == 0 ||
+        (pending_inc.new_state[from] & CEPH_OSD_UP) == 0) {
+      // mark previous guy down
+      pending_inc.new_state[from] = CEPH_OSD_UP;
+    }
+    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
+  } else if (pending_inc.new_up_client.count(from)) {
+    // already prepared, just wait
+    dout(7) << __func__ << " already prepared, waiting on "
+            << m->get_orig_source_addr() << dendl;
+    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
+  } else {
+    // mark new guy up.
+    pending_inc.new_up_client[from] = m->get_orig_source_addrs();
+    pending_inc.new_up_cluster[from] = m->cluster_addrs;
+    pending_inc.new_hb_back_up[from] = m->hb_back_addrs;
+    pending_inc.new_hb_front_up[from] = m->hb_front_addrs;
+
+    down_pending_out.erase(from);  // if any
+
+    // A zero superblock weight leaves any previously recorded weight alone.
+    if (m->sb.weight)
+      osd_weight[from] = m->sb.weight;
+
+    // set uuid?
+    dout(10) << " setting osd." << from << " uuid to " << m->sb.osd_fsid
+             << dendl;
+    if (!osdmap.exists(from) || osdmap.get_uuid(from) != m->sb.osd_fsid) {
+      // preprocess should have caught this; if not, assert.
+      ceph_assert(!osdmap.exists(from) || osdmap.get_uuid(from).is_zero());
+      pending_inc.new_uuid[from] = m->sb.osd_fsid;
+    }
+
+    // fresh osd?
+    if (m->sb.get_newest_map() == 0 && osdmap.exists(from)) {
+      const osd_info_t& i = osdmap.get_info(from);
+      if (i.up_from > i.lost_at) {
+        dout(10) << " fresh osd; marking lost_at too" << dendl;
+        pending_inc.new_lost[from] = osdmap.get_epoch();
+      }
+    }
+
+    // metadata
+    bufferlist osd_metadata;
+    encode(m->metadata, osd_metadata);
+    pending_metadata[from] = osd_metadata;
+    pending_metadata_rm.erase(from);
+
+    // adjust last clean unmount epoch?
+    // The interval is only replaced when the superblock reports a later
+    // (mounted, clean_thru) pair than the one in the committed map.
+    const osd_info_t& info = osdmap.get_info(from);
+    dout(10) << " old osd_info: " << info << dendl;
+    if (m->sb.mounted > info.last_clean_begin ||
+        (m->sb.mounted == info.last_clean_begin &&
+         m->sb.clean_thru > info.last_clean_end)) {
+      epoch_t begin = m->sb.mounted;
+      epoch_t end = m->sb.clean_thru;
+
+      dout(10) << __func__ << " osd." << from << " last_clean_interval "
+               << "[" << info.last_clean_begin << "," << info.last_clean_end
+               << ") -> [" << begin << "-" << end << ")"
+               << dendl;
+      pending_inc.new_last_clean_interval[from] =
+        pair<epoch_t,epoch_t>(begin, end);
+    }
+
+    // Start from the committed xinfo unless a pending copy already exists.
+    if (pending_inc.new_xinfo.count(from) == 0)
+      pending_inc.new_xinfo[from] = osdmap.osd_xinfo[from];
+    osd_xinfo_t& xi = pending_inc.new_xinfo[from];
+    if (m->boot_epoch == 0) {
+      // boot_epoch == 0 is handled as "not laggy": decay both statistics by
+      // the configured weight.
+      xi.laggy_probability *= (1.0 - g_conf()->mon_osd_laggy_weight);
+      xi.laggy_interval *= (1.0 - g_conf()->mon_osd_laggy_weight);
+      dout(10) << " not laggy, new xi " << xi << dendl;
+    } else {
+      if (xi.down_stamp.sec()) {
+        // Seconds since the recorded down stamp, capped by
+        // mon_osd_laggy_max_interval when that option is non-zero.
+        int interval = ceph_clock_now().sec() -
+          xi.down_stamp.sec();
+        if (g_conf()->mon_osd_laggy_max_interval &&
+            (interval > g_conf()->mon_osd_laggy_max_interval)) {
+          interval = g_conf()->mon_osd_laggy_max_interval;
+        }
+        // Weighted blend of the new sample with the previous value.
+        xi.laggy_interval =
+          interval * g_conf()->mon_osd_laggy_weight +
+          xi.laggy_interval * (1.0 - g_conf()->mon_osd_laggy_weight);
+      }
+      xi.laggy_probability =
+        g_conf()->mon_osd_laggy_weight +
+        xi.laggy_probability * (1.0 - g_conf()->mon_osd_laggy_weight);
+      dout(10) << " laggy, now xi " << xi << dendl;
+    }
+
+    // set features shared by the osd
+    // (fall back to the connection features when the message carries none)
+    if (m->osd_features)
+      xi.features = m->osd_features;
+    else
+      xi.features = m->get_connection()->get_features();
+
+    // mark in?
+    if ((g_conf()->mon_osd_auto_mark_auto_out_in &&
+         (oldstate & CEPH_OSD_AUTOOUT)) ||
+        (g_conf()->mon_osd_auto_mark_new_in && (oldstate & CEPH_OSD_NEW)) ||
+        (g_conf()->mon_osd_auto_mark_in)) {
+      if (can_mark_in(from)) {
+        // Restore a saved weight if there is one, otherwise mark fully in.
+        if (xi.old_weight > 0) {
+          pending_inc.new_weight[from] = xi.old_weight;
+          xi.old_weight = 0;
+        } else {
+          pending_inc.new_weight[from] = CEPH_OSD_IN;
+        }
+      } else {
+        dout(7) << __func__ << " NOIN set, will not mark in "
+                << m->get_orig_source_addr() << dendl;
+      }
+    }
+
+    // wait
+    wait_for_finished_proposal(op, new C_Booted(this, op));
+  }
+  return true;
+}
+
+// Completion step for a boot request: optionally write a cluster-log line,
+// then send the OSD maps starting at the epoch after the one in its
+// superblock.
+void OSDMonitor::_booted(MonOpRequestRef op, bool logit)
+{
+  op->mark_osdmon_event(__func__);
+  auto boot = op->get_req<MOSDBoot>();
+  dout(7) << "_booted " << boot->get_orig_source_inst()
+          << " w " << boot->sb.weight
+          << " from " << boot->sb.current_epoch << dendl;
+
+  if (logit) {
+    mon.clog->info() << boot->get_source() << " "
+                     << boot->get_orig_source_addrs() << " boot";
+  }
+
+  send_latest(op, boot->sb.current_epoch + 1);
+}
+
+
+// -------------
+// full
+
+// Read-only screening of an MOSDFull message.
+//
+// Returns true when the message is fully handled here (dropped, or answered
+// because the committed fullness bits already match the request) and false
+// when it must go on to prepare_full().
+bool OSDMonitor::preprocess_full(MonOpRequestRef op)
+{
+  op->mark_osdmon_event(__func__);
+  auto m = op->get_req<MOSDFull>();
+  int from = m->get_orig_source().num();
+  // Locals are declared before the first "goto ignore" so that no jump
+  // crosses an initialization.
+  set<string> state;       // current state of the osd, for logging
+  set<string> want_state;  // requested fullness state, for logging
+  unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL;
+
+  // check permissions, ignore if failed
+  MonSession *session = op->get_session();
+  if (!session)
+    goto ignore;
+  if (!session->is_capable("osd", MON_CAP_X)) {
+    dout(0) << "MOSDFull from entity with insufficient privileges:"
+            << session->caps << dendl;
+    goto ignore;
+  }
+
+  // ignore a full message from the osd instance that already went down
+  if (!osdmap.exists(from)) {
+    dout(7) << __func__ << " ignoring full message from nonexistent "
+            << m->get_orig_source_inst() << dendl;
+    goto ignore;
+  }
+  if ((!osdmap.is_up(from) &&
+       osdmap.get_most_recent_addrs(from).legacy_equals(
+         m->get_orig_source_addrs())) ||
+      (osdmap.is_up(from) &&
+       !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()))) {
+    dout(7) << __func__ << " ignoring full message from down "
+            << m->get_orig_source_inst() << dendl;
+    goto ignore;
+  }
+
+  OSDMap::calc_state_set(osdmap.get_state(from), state);
+
+  if ((osdmap.get_state(from) & mask) == m->state) {
+    dout(7) << __func__ << " state already " << state << " for osd." << from
+            << " " << m->get_orig_source_inst() << dendl;
+    _reply_map(op, m->version);
+    goto ignore;
+  }
+
+  // Log what the OSD is asking for (the masked requested bits), not the
+  // state it currently has.
+  OSDMap::calc_state_set(m->state & mask, want_state);
+  dout(10) << __func__ << " want state " << want_state << " for osd." << from
+           << " " << m->get_orig_source_inst() << dendl;
+  return false;
+
+ ignore:
+  return true;
+}
+
+// Stage a change of the NEARFULL/BACKFILLFULL/FULL bits for the sending OSD
+// so that, after the pending state change is applied, they equal the
+// requested bits.  Always returns true; the reply is sent after the proposal.
+bool OSDMonitor::prepare_full(MonOpRequestRef op)
+{
+  op->mark_osdmon_event(__func__);
+  auto m = op->get_req<MOSDFull>();
+  const int from = m->get_orig_source().num();
+
+  const unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL;
+  const unsigned want_state = m->state & mask;  // safety first
+
+  // Effective current bits: committed state with any pending change folded in.
+  const auto pending = pending_inc.new_state.find(from);
+  const bool have_pending = (pending != pending_inc.new_state.end());
+  unsigned cur_state = osdmap.get_state(from);
+  if (have_pending) {
+    cur_state ^= pending->second;
+  }
+  cur_state &= mask;
+
+  set<string> want_state_set, cur_state_set;
+  OSDMap::calc_state_set(want_state, want_state_set);
+  OSDMap::calc_state_set(cur_state, cur_state_set);
+
+  if (cur_state == want_state) {
+    dout(7) << __func__ << " osd." << from << " " << cur_state_set
+            << " = wanted " << want_state_set << ", just waiting" << dendl;
+  } else {
+    // Clear any fullness bits already pending, then set exactly the bits
+    // that differ between the committed state and the requested state.
+    if (have_pending) {
+      pending->second &= ~mask;
+    } else {
+      pending_inc.new_state[from] = 0;
+    }
+    pending_inc.new_state[from] |= (osdmap.get_state(from) & mask) ^ want_state;
+    dout(7) << __func__ << " osd." << from << " " << cur_state_set
+            << " -> " << want_state_set << dendl;
+  }
+
+  wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
+  return true;
+}
+
+// -------------
+// alive
+
+// Read-only screening of an MOSDAlive message.
+//
+// Returns true when the message is dropped (no session, insufficient caps,
+// sender not the up instance) or answered directly because the committed
+// up_thru already covers the request; returns false to hand it to
+// prepare_alive().
+bool OSDMonitor::preprocess_alive(MonOpRequestRef op)
+{
+  op->mark_osdmon_event(__func__);
+  auto m = op->get_req<MOSDAlive>();
+  int from = m->get_orig_source().num();
+
+  // check permissions, ignore if failed
+  MonSession *session = op->get_session();
+  if (!session) {
+    return true;
+  }
+  if (!session->is_capable("osd", MON_CAP_X)) {
+    dout(0) << "attempt to send MOSDAlive from entity with insufficient privileges:"
+            << session->caps << dendl;
+    return true;
+  }
+
+  const bool from_up_instance =
+    osdmap.is_up(from) &&
+    osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs());
+  if (!from_up_instance) {
+    dout(7) << "preprocess_alive ignoring alive message from down "
+            << m->get_orig_source() << " " << m->get_orig_source_addrs()
+            << dendl;
+    return true;
+  }
+
+  if (osdmap.get_up_thru(from) >= m->want) {
+    // already satisfied; just answer with maps
+    dout(7) << "preprocess_alive want up_thru " << m->want << " dup from " << m->get_orig_source_inst() << dendl;
+    _reply_map(op, m->version);
+    return true;
+  }
+
+  dout(10) << "preprocess_alive want up_thru " << m->want
+           << " from " << m->get_orig_source_inst() << dendl;
+  return false;
+}
+
+// Stage an up_thru update for the sending OSD using the map version it
+// reports, and reply with maps once the proposal finishes.  Always returns
+// true.
+bool OSDMonitor::prepare_alive(MonOpRequestRef op)
+{
+  op->mark_osdmon_event(__func__);
+  auto alive = op->get_req<MOSDAlive>();
+  int from = alive->get_orig_source().num();
+
+  if (0) {  // we probably don't care much about these
+    mon.clog->debug() << alive->get_orig_source_inst() << " alive";
+  }
+
+  dout(7) << "prepare_alive want up_thru " << alive->want
+          << " have " << alive->version
+          << " from " << alive->get_orig_source_inst() << dendl;
+
+  // set to the latest map the OSD has
+  update_up_thru(from, alive->version);
+  wait_for_finished_proposal(op, new C_ReplyMap(this, op, alive->version));
+  return true;
+}
+
+// Answer a request by sending the requester OSD maps starting at epoch e
+// (delegates to send_latest()).
+void OSDMonitor::_reply_map(MonOpRequestRef op, epoch_t e)
+{
+  op->mark_osdmon_event(__func__);
+  dout(7) << "_reply_map " << e << " from "
+          << op->get_req()->get_orig_source_inst() << dendl;
+  send_latest(op, e);
+}
+
+// pg_created
+// Screening of an MOSDPGCreated message.  The op is flagged as needing no
+// reply; it is dropped (true) without a session or without the required
+// caps, and otherwise always passed on (false).
+bool OSDMonitor::preprocess_pg_created(MonOpRequestRef op)
+{
+  op->mark_osdmon_event(__func__);
+  auto created = op->get_req<MOSDPGCreated>();
+  dout(10) << __func__ << " " << *created << dendl;
+  auto session = op->get_session();
+  mon.no_reply(op);
+
+  if (!session) {
+    dout(10) << __func__ << ": no monitor session!" << dendl;
+    return true;
+  }
+
+  const bool allowed = session->is_capable("osd", MON_CAP_X);
+  if (!allowed) {
+    derr << __func__ << " received from entity "
+         << "with insufficient privileges " << session->caps << dendl;
+    return true;
+  }
+
+  // always forward the "created!" to the leader
+  return false;
+}
+
+// Queue the pgid from an MOSDPGCreated message onto pending_created_pgs.
+//
+// The message is ignored (returns false) unless the sender is an OSD that
+// is up in the osdmap with matching addresses; returns true once the pgid
+// has been queued.
+bool OSDMonitor::prepare_pg_created(MonOpRequestRef op)
+{
+  op->mark_osdmon_event(__func__);
+  auto m = op->get_req<MOSDPGCreated>();
+  dout(10) << __func__ << " " << *m << dendl;
+  auto src = m->get_orig_source();
+  auto from = src.num();
+  if (!src.is_osd() ||
+      !mon.osdmon()->osdmap.is_up(from) ||
+      !mon.osdmon()->osdmap.get_addrs(from).legacy_equals(
+        m->get_orig_source_addrs())) {
+    // Name the message type and the sender: the old text talked about
+    // "stats" and did not say who sent the message.
+    dout(1) << __func__ << " ignoring pg_created from non-active " << src
+            << dendl;
+    return false;
+  }
+  pending_created_pgs.push_back(m->pgid);
+  return true;
+}
+
+// Read-only screening of an MOSDPGReadyToMerge message.
+//
+// The message is dropped (no reply, returns true) unless the sender has a
+// session with sufficient caps, the pool exists, the pgid is the pool's last
+// pg (pg_num == ps + 1) and pg_num_pending is not above ps.  Returns false
+// when it should proceed to prepare_pg_ready_to_merge().
+bool OSDMonitor::preprocess_pg_ready_to_merge(MonOpRequestRef op)
+{
+  op->mark_osdmon_event(__func__);
+  auto m = op->get_req<MOSDPGReadyToMerge>();
+  dout(10) << __func__ << " " << *m << dendl;
+
+  // Shared exit for every rejected message.
+  const auto drop = [&]() {
+    mon.no_reply(op);
+    return true;
+  };
+
+  auto session = op->get_session();
+  if (!session) {
+    dout(10) << __func__ << ": no monitor session!" << dendl;
+    return drop();
+  }
+  if (!session->is_capable("osd", MON_CAP_X)) {
+    derr << __func__ << " received from entity "
+         << "with insufficient privileges " << session->caps << dendl;
+    return drop();
+  }
+
+  const pg_pool_t *pi = osdmap.get_pg_pool(m->pgid.pool());
+  if (!pi) {
+    derr << __func__ << " pool for " << m->pgid << " dne" << dendl;
+    return drop();
+  }
+  if (pi->get_pg_num() <= m->pgid.ps()) {
+    dout(20) << " pg_num " << pi->get_pg_num() << " already < " << m->pgid << dendl;
+    return drop();
+  }
+  if (pi->get_pg_num() != m->pgid.ps() + 1) {
+    derr << " OSD trying to merge wrong pgid " << m->pgid << dendl;
+    return drop();
+  }
+  if (pi->get_pg_num_pending() > m->pgid.ps()) {
+    dout(20) << " pg_num_pending " << pi->get_pg_num_pending() << " > " << m->pgid << dendl;
+    return drop();
+  }
+  return false;
+}
+
+// Stage the pool update for an MOSDPGReadyToMerge message.
+//
+// Works on a copy of the pool (the pending copy if one exists, otherwise the
+// committed one).  If pg_num / pg_num_pending no longer match the message,
+// the message is retried after the current proposal and false is returned.
+// Otherwise pg_num is decremented (ready) or the pending merge is backed off
+// (not ready), the pool copy is stored in pending_inc, and true is returned.
+bool OSDMonitor::prepare_pg_ready_to_merge(MonOpRequestRef op)
+{
+  op->mark_osdmon_event(__func__);
+  auto m = op->get_req<MOSDPGReadyToMerge>();
+  dout(10) << __func__ << " " << *m << dendl;
+
+  pg_pool_t p;
+  if (pending_inc.new_pools.count(m->pgid.pool())) {
+    p = pending_inc.new_pools[m->pgid.pool()];
+  } else {
+    p = *osdmap.get_pg_pool(m->pgid.pool());
+  }
+
+  const bool still_matches =
+    p.get_pg_num() == m->pgid.ps() + 1 &&
+    !(p.get_pg_num_pending() > m->pgid.ps());
+  if (!still_matches) {
+    dout(10) << __func__
+             << " race with concurrent pg_num[_pending] update, will retry"
+             << dendl;
+    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
+    return false;  /* nothing to propose, yet */
+  }
+
+  if (!m->ready) {
+    // back off the merge attempt!
+    p.set_pg_num_pending(p.get_pg_num());
+  } else {
+    p.dec_pg_num(m->pgid,
+                 pending_inc.epoch,
+                 m->source_version,
+                 m->target_version,
+                 m->last_epoch_started,
+                 m->last_epoch_clean);
+    p.last_change = pending_inc.epoch;
+  }
+
+  // force pre-nautilus clients to resend their ops, since they
+  // don't understand pg_num_pending changes form a new interval
+  p.last_force_op_resend_prenautilus = pending_inc.epoch;
+
+  pending_inc.new_pools[m->pgid.pool()] = p;
+
+  // Test hook: with the configured probability, follow a ready merge with a
+  // synthetic "osd pool set ... pg_num_actual" command.
+  auto prob = g_conf().get_val<double>("mon_inject_pg_merge_bounce_probability");
+  const bool inject_bounce =
+    m->ready &&
+    prob > 0 &&
+    prob > static_cast<double>(rand() % 1000)/1000.0;
+  if (!inject_bounce) {
+    wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
+    return true;
+  }
+
+  derr << __func__ << " injecting pg merge pg_num bounce" << dendl;
+  auto n = new MMonCommand(mon.monmap->get_fsid());
+  n->set_connection(m->get_connection());
+  n->cmd = { "{\"prefix\":\"osd pool set\", \"pool\": \"" +
+             osdmap.get_pool_name(m->pgid.pool()) +
+             "\", \"var\": \"pg_num_actual\", \"val\": \"" +
+             stringify(m->pgid.ps() + 1) + "\"}" };
+  MonOpRequestRef nop = mon.op_tracker.create_request<MonOpRequest>(n);
+  nop->set_type_service();
+  wait_for_finished_proposal(op, new C_RetryMessage(this, nop));
+  return true;
+}
+
+
+// -------------
+// pg_temp changes
+
+// Read-only screening of an MOSDPGTemp message.
+//
+// Returns false when the message must go on to prepare_pgtemp(): it is
+// forced, or at least one entry would change pg_temp / primary_temp.
+// Returns true when the message is dropped (no session, insufficient caps,
+// sender not the up instance, every entry ignored) or answered directly
+// because nothing would change.
+bool OSDMonitor::preprocess_pgtemp(MonOpRequestRef op)
+{
+  // Record the op-tracker event like every other handler in this file.
+  op->mark_osdmon_event(__func__);
+  auto m = op->get_req<MOSDPGTemp>();
+  dout(10) << "preprocess_pgtemp " << *m << dendl;
+  mempool::osdmap::vector<int> empty;
+  int from = m->get_orig_source().num();
+  size_t ignore_cnt = 0;
+
+  // check caps
+  MonSession *session = op->get_session();
+  if (!session)
+    goto ignore;
+  if (!session->is_capable("osd", MON_CAP_X)) {
+    dout(0) << "attempt to send MOSDPGTemp from entity with insufficient caps "
+            << session->caps << dendl;
+    goto ignore;
+  }
+
+  if (!osdmap.is_up(from) ||
+      !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs())) {
+    dout(7) << "ignoring pgtemp message from down "
+            << m->get_orig_source() << " " << m->get_orig_source_addrs()
+            << dendl;
+    goto ignore;
+  }
+
+  // Forced messages skip the per-pg checks below.
+  if (m->forced) {
+    return false;
+  }
+
+  for (auto p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) {
+    dout(20) << " " << p->first
+             << (osdmap.pg_temp->count(p->first) ? osdmap.pg_temp->get(p->first) : empty)
+             << " -> " << p->second << dendl;
+
+    // does the pool exist?
+    if (!osdmap.have_pg_pool(p->first.pool())) {
+      /*
+       * 1. If the osdmap does not have the pool, it means the pool has been
+       *    removed in-between the osd sending this message and us handling it.
+       * 2. If osdmap doesn't have the pool, it is safe to assume the pool does
+       *    not exist in the pending either, as the osds would not send a
+       *    message about a pool they know nothing about (yet).
+       * 3. However, if the pool does exist in the pending, then it must be a
+       *    new pool, and not relevant to this message (see 1).
+       */
+      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
+               << ": pool has been removed" << dendl;
+      ignore_cnt++;
+      continue;
+    }
+
+    int acting_primary = -1;
+    osdmap.pg_to_up_acting_osds(
+      p->first, nullptr, nullptr, nullptr, &acting_primary);
+    if (acting_primary != from) {
+      /* If the source isn't the primary based on the current osdmap, we know
+       * that the interval changed and that we can discard this message.
+       * Indeed, we must do so to avoid 16127 since we can't otherwise determine
+       * which of two pg temp mappings on the same pg is more recent.
+       */
+      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
+               << ": primary has changed" << dendl;
+      ignore_cnt++;
+      continue;
+    }
+
+    // removal?
+    if (p->second.empty() && (osdmap.pg_temp->count(p->first) ||
+                              osdmap.primary_temp->count(p->first)))
+      return false;
+    // change?
+    //  NOTE: we assume that this will clear pg_primary, so consider
+    //        an existing pg_primary field to imply a change
+    if (p->second.size() &&
+        (osdmap.pg_temp->count(p->first) == 0 ||
+         osdmap.pg_temp->get(p->first) != p->second ||
+         osdmap.primary_temp->count(p->first)))
+      return false;
+  }
+
+  // should we ignore all the pgs?
+  if (ignore_cnt == m->pg_temp.size())
+    goto ignore;
+
+  dout(7) << "preprocess_pgtemp e" << m->map_epoch << " no changes from " << m->get_orig_source_inst() << dendl;
+  _reply_map(op, m->map_epoch);
+  return true;
+
+ ignore:
+  mon.no_reply(op);
+  return true;
+}
+
+// Stage a new up_thru for osd `from`, but only if it is larger than the
+// value already known (the pending value if present, else the committed one).
+void OSDMonitor::update_up_thru(int from, epoch_t up_thru)
+{
+  epoch_t known = osdmap.get_up_thru(from);
+  if (auto pending = pending_inc.new_up_thru.find(from);
+      pending != pending_inc.new_up_thru.end()) {
+    known = pending->second;
+  }
+  if (!(up_thru > known)) {
+    return;
+  }
+  // set up_thru too, so the osd doesn't have to ask again
+  pending_inc.new_up_thru[from] = up_thru;
+}
+
+// Stage the pg_temp mappings from an MOSDPGTemp message.
+//
+// Entries for pools that are pending removal or absent from the osdmap are
+// skipped.  For every staged entry any existing or pending primary_temp is
+// cleared.  The sender's up_thru is also bumped to the message's map epoch.
+// Always returns true; the reply is sent after the proposal.
+bool OSDMonitor::prepare_pgtemp(MonOpRequestRef op)
+{
+  op->mark_osdmon_event(__func__);
+  auto m = op->get_req<MOSDPGTemp>();
+  int from = m->get_orig_source().num();
+  dout(7) << "prepare_pgtemp e" << m->map_epoch << " from " << m->get_orig_source_inst() << dendl;
+
+  for (const auto& [pgid, osds] : m->pg_temp) {
+    uint64_t pool = pgid.pool();
+    if (pending_inc.old_pools.count(pool)) {
+      dout(10) << __func__ << " ignore " << pgid << " -> " << osds
+               << ": pool pending removal" << dendl;
+      continue;
+    }
+    if (!osdmap.have_pg_pool(pool)) {
+      dout(10) << __func__ << " ignore " << pgid << " -> " << osds
+               << ": pool has been removed" << dendl;
+      continue;
+    }
+    pending_inc.new_pg_temp[pgid] =
+      mempool::osdmap::vector<int>(osds.begin(), osds.end());
+
+    // unconditionally clear pg_primary (until this message can encode
+    // a change for that, too.. at which point we need to also fix
+    // preprocess_pgtemp)
+    const bool has_primary_temp =
+      osdmap.primary_temp->count(pgid) ||
+      pending_inc.new_primary_temp.count(pgid);
+    if (has_primary_temp) {
+      pending_inc.new_primary_temp[pgid] = -1;
+    }
+  }
+
+  // set up_thru too, so the osd doesn't have to ask again
+  update_up_thru(from, m->map_epoch);
+
+  wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->map_epoch));
+  return true;
+}
+
+
+// ---
+
+// Read-only screening of an MRemoveSnaps message.
+//
+// Returns false as soon as one listed snap (on an existing pool) is above
+// the pool's snap_seq or not yet recorded as removed, so the message goes on
+// to prepare_remove_snaps().  Otherwise returns true; in that case peers
+// with the SERVER_OCTOPUS feature are sent an MRemoveSnaps echo of the
+// request.  Messages without a session or sufficient caps are dropped.
+bool OSDMonitor::preprocess_remove_snaps(MonOpRequestRef op)
+{
+  op->mark_osdmon_event(__func__);
+  auto m = op->get_req<MRemoveSnaps>();
+  dout(7) << "preprocess_remove_snaps " << *m << dendl;
+
+  // check privilege, ignore if failed
+  MonSession *session = op->get_session();
+  mon.no_reply(op);
+  if (!session) {
+    return true;
+  }
+  const bool allowed = session->caps.is_capable(
+    cct,
+    session->entity_name,
+    "osd", "osd pool rmsnap", {}, true, true, false,
+    session->get_peer_socket_addr());
+  if (!allowed) {
+    dout(0) << "got preprocess_remove_snaps from entity with insufficient caps "
+            << session->caps << dendl;
+    return true;
+  }
+
+  for (const auto& [pool, snaps] : m->snaps) {
+    if (!osdmap.have_pg_pool(pool)) {
+      dout(10) << " ignoring removed_snaps " << snaps
+               << " on non-existent pool " << pool << dendl;
+      continue;
+    }
+    const pg_pool_t *pi = osdmap.get_pg_pool(pool);
+    for (const auto& snap : snaps) {
+      if (snap > pi->get_snap_seq() ||
+          !_is_removed_snap(pool, snap)) {
+        return false;
+      }
+    }
+  }
+
+  // Nothing new to record: acknowledge directly when the peer supports it.
+  if (HAVE_FEATURE(m->get_connection()->get_features(), SERVER_OCTOPUS)) {
+    auto reply = make_message<MRemoveSnaps>();
+    reply->snaps = m->snaps;
+    mon.send_reply(op, reply.detach());
+  }
+
+  return true;
+}
+
+// Stage snapshot removals from an MRemoveSnaps message.
+//
+// For each listed snap on an existing pool that is neither already removed
+// nor already pending removal, the pending pool copy is updated (flags,
+// snap_seq, snap_epoch, and removed_snaps before octopus) and the snap is
+// added to pending_inc.new_removed_snaps.  Peers with SERVER_OCTOPUS get an
+// MRemoveSnaps reply after commit.  Always returns true.
+bool OSDMonitor::prepare_remove_snaps(MonOpRequestRef op)
+{
+  op->mark_osdmon_event(__func__);
+  auto m = op->get_req<MRemoveSnaps>();
+  dout(7) << "prepare_remove_snaps " << *m << dendl;
+
+  for (auto& [pool, snaps] : m->snaps) {
+    if (!osdmap.have_pg_pool(pool)) {
+      dout(10) << " ignoring removed_snaps " << snaps
+               << " on non-existent pool " << pool << dendl;
+      continue;
+    }
+
+    pg_pool_t& pi = osdmap.pools[pool];
+    for (auto s : snaps) {
+      // Skip snaps that are already removed or already queued.
+      if (_is_removed_snap(pool, s)) {
+        continue;
+      }
+      if (pending_inc.new_pools.count(pool) &&
+          pending_inc.new_pools[pool].removed_snaps.contains(s)) {
+        continue;
+      }
+      if (pending_inc.new_removed_snaps.count(pool) &&
+          pending_inc.new_removed_snaps[pool].contains(s)) {
+        continue;
+      }
+
+      pg_pool_t *newpi = pending_inc.get_new_pool(pool, &pi);
+      if (osdmap.require_osd_release < ceph_release_t::octopus) {
+        newpi->removed_snaps.insert(s);
+        dout(10) << " pool " << pool << " removed_snaps added " << s
+                 << " (now " << newpi->removed_snaps << ")" << dendl;
+      }
+      newpi->flags |= pg_pool_t::FLAG_SELFMANAGED_SNAPS;
+      if (s > newpi->get_snap_seq()) {
+        dout(10) << " pool " << pool << " snap_seq "
+                 << newpi->get_snap_seq() << " -> " << s << dendl;
+        newpi->set_snap_seq(s);
+      }
+      newpi->set_snap_epoch(pending_inc.epoch);
+      dout(10) << " added pool " << pool << " snap " << s
+               << " to removed_snaps queue" << dendl;
+      pending_inc.new_removed_snaps[pool].insert(s);
+    }
+  }
+
+  if (HAVE_FEATURE(m->get_connection()->get_features(), SERVER_OCTOPUS)) {
+    auto reply = make_message<MRemoveSnaps>();
+    reply->snaps = m->snaps;
+    wait_for_commit(op, new C_ReplyOp(this, op, reply));
+  }
+
+  return true;
+}
+
+// Answer an MMonGetPurgedSnaps request from the monitor store.
+//
+// Iterates the "purged_epoch_*" keys after the key for m->start, decoding
+// each value into the reply until a key's epoch exceeds m->last, the keys
+// run out, or the estimated reply size passes ~1 MiB.  The reply carries
+// m->start and the last epoch value parsed.  Always returns true.
+bool OSDMonitor::preprocess_get_purged_snaps(MonOpRequestRef op)
+{
+  op->mark_osdmon_event(__func__);
+  auto m = op->get_req<MMonGetPurgedSnaps>();
+  dout(7) << __func__ << " " << *m << dendl;
+
+  map<epoch_t,mempool::osdmap::map<int64_t,snap_interval_set_t>> r;
+
+  string k = make_purged_snap_epoch_key(m->start);
+  auto it = mon.store->get_iterator(OSD_SNAP_PREFIX);
+  it->upper_bound(k);
+  unsigned long epoch = m->last;
+  // Running size estimate for the whole reply.  It must live outside the
+  // loop: a per-iteration counter would only ever measure a single entry
+  // and the cap below would never limit the total.
+  size_t approx_bytes = 0;
+  while (it->valid()) {
+    if (it->key().find("purged_epoch_") != 0) {
+      break;
+    }
+    const string key = it->key();
+    int n = sscanf(key.c_str(), "purged_epoch_%lx", &epoch);
+    if (n != 1) {
+      derr << __func__ << " unable to parse key '" << it->key() << "'" << dendl;
+    } else if (epoch > m->last) {
+      break;
+    } else {
+      bufferlist bl = it->value();
+      auto p = bl.cbegin();
+      auto &v = r[epoch];
+      try {
+        ceph::decode(v, p);
+      } catch (ceph::buffer::error& e) {
+        derr << __func__ << " unable to parse value for key '" << it->key()
+             << "': \n";
+        bl.hexdump(*_dout);
+        *_dout << dendl;
+      }
+      approx_bytes += 4 + v.size() * 16;
+    }
+    if (approx_bytes > 1048576) {
+      // impose a semi-arbitrary limit to message size
+      break;
+    }
+    it->next();
+  }
+
+  auto reply = make_message<MMonGetPurgedSnapsReply>(m->start, epoch);
+  reply->purged_snaps.swap(r);
+  mon.send_reply(op, reply.detach());
+
+  return true;
+}
+
+// osd beacon
+// Screening of an MOSDBeacon.  The op is flagged as needing no reply; it is
+// dropped (true) without a session or without the required caps, and
+// otherwise always passed on (false).
+bool OSDMonitor::preprocess_beacon(MonOpRequestRef op)
+{
+  op->mark_osdmon_event(__func__);
+  // check caps
+  auto session = op->get_session();
+  mon.no_reply(op);
+
+  if (!session) {
+    dout(10) << __func__ << " no monitor session!" << dendl;
+    return true;
+  }
+
+  const bool allowed = session->is_capable("osd", MON_CAP_X);
+  if (!allowed) {
+    derr << __func__ << " received from entity "
+         << "with insufficient privileges " << session->caps << dendl;
+    return true;
+  }
+
+  // Always forward the beacon to the leader, even if they are the same as
+  // the old one. The leader will mark as down osds that haven't sent
+  // beacon for a few minutes.
+  return false;
+}
+
+// Process an MOSDBeacon on the leader.
+//
+// Beacons from anything other than the currently-up OSD instance are
+// ignored (a down OSD is sent newer maps first).  For a valid beacon the
+// report time/interval, the OSD's map epoch and the per-pg last-epoch-clean
+// data are recorded.  Returns true only when last_purged_snaps_scrub
+// advanced and an xinfo update was staged; false otherwise.
+bool OSDMonitor::prepare_beacon(MonOpRequestRef op)
+{
+  op->mark_osdmon_event(__func__);
+  const auto beacon = op->get_req<MOSDBeacon>();
+  const auto src = beacon->get_orig_source();
+  dout(10) << __func__ << " " << *beacon
+           << " from " << src << dendl;
+  int from = src.num();
+
+  const bool from_active_osd =
+    src.is_osd() &&
+    osdmap.is_up(from) &&
+    osdmap.get_addrs(from).legacy_equals(beacon->get_orig_source_addrs());
+  if (!from_active_osd) {
+    if (src.is_osd() && !osdmap.is_up(from)) {
+      // share some new maps with this guy in case it may not be
+      // aware of its own deadness...
+      send_latest(op, beacon->version+1);
+    }
+    dout(1) << " ignoring beacon from non-active osd." << from << dendl;
+    return false;  /* nothing to propose */
+  }
+
+  auto& report = last_osd_report[from];
+  report.first = ceph_clock_now();
+  report.second = beacon->osd_beacon_report_interval;
+  osd_epochs[from] = beacon->version;
+
+  for (const auto& pg : beacon->pgs) {
+    auto* pool = osdmap.get_pg_pool(pg.pool());
+    if (pool == nullptr) {
+      continue;
+    }
+    unsigned pg_num = pool->get_pg_num();
+    last_epoch_clean.report(pg_num, pg, beacon->min_last_epoch_clean);
+  }
+
+  const bool scrub_advanced =
+    osdmap.osd_xinfo[from].last_purged_snaps_scrub <
+    beacon->last_purged_snaps_scrub;
+  if (!scrub_advanced) {
+    return false;  /* nothing to propose */
+  }
+
+  if (pending_inc.new_xinfo.count(from) == 0) {
+    pending_inc.new_xinfo[from] = osdmap.osd_xinfo[from];
+  }
+  pending_inc.new_xinfo[from].last_purged_snaps_scrub =
+    beacon->last_purged_snaps_scrub;
+  return true;
+}
+
+// ---------------
+// map helpers
+
+// Send maps to the requester: the full latest map when start is 0,
+// otherwise incrementals beginning at epoch `start`.
+void OSDMonitor::send_latest(MonOpRequestRef op, epoch_t start)
+{
+  op->mark_osdmon_event(__func__);
+  dout(5) << "send_latest to " << op->get_req()->get_orig_source_inst()
+          << " start " << start << dendl;
+  if (start != 0) {
+    send_incremental(op, start);
+    return;
+  }
+  send_full(op);
+}
+
+
+// Allocate an MOSDMap message holding the full map for the current osdmap
+// epoch, encoded for `features`.  The caller receives the raw pointer.
+MOSDMap *OSDMonitor::build_latest_full(uint64_t features)
+{
+  MOSDMap *msg = new MOSDMap(mon.monmap->fsid, features);
+  get_version_full(osdmap.get_epoch(), features,
+                   msg->maps[osdmap.get_epoch()]);
+  msg->cluster_osdmap_trim_lower_bound = get_first_committed();
+  msg->newest_map = osdmap.get_epoch();
+  return msg;
+}
+
+// Allocate an MOSDMap message covering epochs [from..to] (never epoch 0).
+//
+// Each epoch is added as an incremental when one can be read; when the
+// incremental lookup reports -ENOENT the full map for that epoch is used
+// instead.  Aborts if neither is available.  The caller receives the raw
+// pointer.
+MOSDMap *OSDMonitor::build_incremental(epoch_t from, epoch_t to, uint64_t features)
+{
+  dout(10) << "build_incremental [" << from << ".." << to << "] with features "
+           << std::hex << features << std::dec << dendl;
+  MOSDMap *msg = new MOSDMap(mon.monmap->fsid, features);
+  msg->cluster_osdmap_trim_lower_bound = get_first_committed();
+  msg->newest_map = osdmap.get_epoch();
+
+  // Walk downwards from `to`; the e > 0 test also stops the unsigned
+  // counter from wrapping.
+  for (epoch_t e = to; e >= from && e > 0; e--) {
+    bufferlist bl;
+    int err = get_version(e, features, bl);
+    if (err == 0) {
+      ceph_assert(bl.length());
+      dout(20) << "build_incremental   inc " << e << " "
+               << bl.length() << " bytes" << dendl;
+      msg->incremental_maps[e] = bl;
+      continue;
+    }
+
+    ceph_assert(err == -ENOENT);
+    ceph_assert(!bl.length());
+    get_version_full(e, features, bl);
+    if (!(bl.length() > 0)) {
+      ceph_abort();  // we should have all maps.
+    }
+    dout(20) << "build_incremental   full " << e << " "
+             << bl.length() << " bytes" << dendl;
+    msg->maps[e] = bl;
+  }
+  return msg;
+}
+
+// Reply to the request with the latest full map, encoded for the features
+// of the requester's session connection.
+void OSDMonitor::send_full(MonOpRequestRef op)
+{
+  op->mark_osdmon_event(__func__);
+  dout(5) << "send_full to " << op->get_req()->get_orig_source_inst() << dendl;
+  MOSDMap *full = build_latest_full(op->get_session()->con_features);
+  mon.send_reply(op, full);
+}
+
+// Send incremental maps starting at `first` in response to an op.
+//
+// When the session is proxied through another monitor, that monitor is
+// asked (via MRoute with send_osdmap_first) to do the sending; otherwise
+// the maps are sent from here through the session-based overload.
+void OSDMonitor::send_incremental(MonOpRequestRef op, epoch_t first)
+{
+  op->mark_osdmon_event(__func__);
+
+  MonSession *s = op->get_session();
+  ceph_assert(s);
+
+  if (!s->proxy_con) {
+    // do it ourselves
+    send_incremental(first, s, false, op);
+    return;
+  }
+
+  // oh, we can tell the other mon to do it
+  dout(10) << __func__ << " asking proxying mon to send_incremental from "
+           << first << dendl;
+  MRoute *route = new MRoute(s->proxy_tid, NULL);
+  route->send_osdmap_first = first;
+  s->proxy_con->send_message(route);
+  op->mark_event("reply: send routed send_osdmap_first reply");
+}
+
+// Send maps from `first` up to the current epoch to a session.
+//
+// - Epochs the session is already recorded as having are skipped.
+// - If `first` is older than the first committed version, a full map at the
+//   first committed version is sent first.
+// - With a request (`req`), exactly one message is sent as the reply and
+//   the function returns; without one, messages go over the session's
+//   connection, in batches of at most osd_map_message_max epochs, stopping
+//   after one batch when `onetime` is set.
+// session->osd_epoch is updated to the last epoch sent.
+void OSDMonitor::send_incremental(epoch_t first,
+                                  MonSession *session,
+                                  bool onetime,
+                                  MonOpRequestRef req)
+{
+  dout(5) << "send_incremental [" << first << ".." << osdmap.get_epoch() << "]"
+          << " to " << session->name << dendl;
+
+  // get feature of the peer
+  // use quorum_con_features, if it's an anonymous connection.
+  uint64_t features = session->con_features ? session->con_features :
+    mon.get_quorum_con_features();
+
+  if (first <= session->osd_epoch) {
+    dout(10) << __func__ << " " << session->name << " should already have epoch "
+             << session->osd_epoch << dendl;
+    first = session->osd_epoch + 1;
+  }
+
+  if (first < get_first_committed()) {
+    MOSDMap *base = new MOSDMap(osdmap.get_fsid(), features);
+    base->cluster_osdmap_trim_lower_bound = get_first_committed();
+    base->newest_map = osdmap.get_epoch();
+
+    first = get_first_committed();
+    bufferlist full_bl;
+    int err = get_version_full(first, features, full_bl);
+    ceph_assert(err == 0);
+    ceph_assert(full_bl.length());
+    dout(20) << "send_incremental starting with base full "
+             << first << " " << full_bl.length() << " bytes" << dendl;
+    base->maps[first] = full_bl;
+
+    if (req) {
+      // A request gets a single reply; the base full map is it.
+      mon.send_reply(req, base);
+      session->osd_epoch = first;
+      return;
+    }
+    session->con->send_message(base);
+    session->osd_epoch = first;
+    first++;
+  }
+
+  while (first <= osdmap.get_epoch()) {
+    epoch_t last = std::min<epoch_t>(first + g_conf()->osd_map_message_max - 1,
+                                     osdmap.get_epoch());
+    MOSDMap *batch = build_incremental(first, last, features);
+
+    if (req) {
+      // send some maps.  it may not be all of them, but it will get them
+      // started.
+      mon.send_reply(req, batch);
+      session->osd_epoch = last;
+      break;
+    }
+
+    session->con->send_message(batch);
+    first = last + 1;
+    session->osd_epoch = last;
+    if (onetime) {
+      break;
+    }
+  }
+}
+
+// Remove the pool snapshot named `snapname` from `pp` and queue its id in
+// pending_inc.new_removed_snaps for `pool`.  Returns true if the snapshot
+// existed, false otherwise.
+bool OSDMonitor::remove_pool_snap(std::string_view snapname,
+                                  pg_pool_t &pp, int64_t pool) {
+  snapid_t snapid = pp.snap_exists(snapname);
+  if (!snapid) {
+    return false;
+  }
+  pp.remove_snap(snapid);
+  pending_inc.new_removed_snaps[pool].insert(snapid);
+  return true;
+}
+
+// Convenience overload: fetch incremental `ver` using the quorum's
+// connection features.
+int OSDMonitor::get_version(version_t ver, bufferlist& bl)
+{
+  const uint64_t quorum_features = mon.get_quorum_con_features();
+  return get_version(ver, quorum_features, bl);
+}
+
+// Decode the incremental in `bl` and re-encode it in place, restricted to
+// the intersection of `features` and the incremental's own encode_features.
+// An embedded full map and an embedded crush map, when present, are
+// re-encoded as well.
+void OSDMonitor::reencode_incremental_map(bufferlist& bl, uint64_t features)
+{
+  OSDMap::Incremental inc;
+  auto src = bl.cbegin();
+  inc.decode(src);
+  // always encode with subset of osdmap's canonical features
+  uint64_t f = features & inc.encode_features;
+  dout(20) << __func__ << " " << inc.epoch << " with features " << f
+           << dendl;
+  bl.clear();
+
+  if (inc.fullmap.length()) {
+    // embedded full map?
+    OSDMap full;
+    full.decode(inc.fullmap);
+    inc.fullmap.clear();
+    full.encode(inc.fullmap, f | CEPH_FEATURE_RESERVED);
+  }
+
+  if (inc.crush.length()) {
+    // embedded crush map
+    CrushWrapper crush;
+    auto crush_src = inc.crush.cbegin();
+    crush.decode(crush_src);
+    inc.crush.clear();
+    crush.encode(inc.crush, f);
+  }
+
+  inc.encode(bl, f | CEPH_FEATURE_RESERVED);
+}
+
+// Decode the full map in `bl` and re-encode it in place, restricted to the
+// intersection of `features` and the map's own encoding features.
+void OSDMonitor::reencode_full_map(bufferlist& bl, uint64_t features)
+{
+  OSDMap full;
+  auto src = bl.cbegin();
+  full.decode(src);
+  // always encode with subset of osdmap's canonical features
+  uint64_t f = features & full.get_encoding_features();
+  dout(20) << __func__ << " " << full.get_epoch() << " with features " << f
+           << dendl;
+  bl.clear();
+  full.encode(bl, f | CEPH_FEATURE_RESERVED);
+}
+
+// Fetch incremental `ver` encoded for `features`.
+//
+// Looks in inc_osd_cache first (keyed by version and significant features);
+// on a miss reads the stored blob, re-encodes it when the significant
+// features differ from the quorum's, and caches the result.  Returns 0 on
+// success or the negative error from the store read.
+int OSDMonitor::get_version(version_t ver, uint64_t features, bufferlist& bl)
+{
+  uint64_t significant_features = OSDMap::get_significant_features(features);
+  if (inc_osd_cache.lookup({ver, significant_features}, &bl)) {
+    return 0;
+  }
+
+  int ret = PaxosService::get_version(ver, bl);
+  if (ret < 0) {
+    return ret;
+  }
+
+  // NOTE: this check is imprecise; the OSDMap encoding features may
+  // be a subset of the latest mon quorum features, but worst case we
+  // reencode once and then cache the (identical) result under both
+  // feature masks.
+  const bool matches_quorum =
+    significant_features ==
+    OSDMap::get_significant_features(mon.get_quorum_con_features());
+  if (!matches_quorum) {
+    reencode_incremental_map(bl, features);
+  }
+  inc_osd_cache.add_bytes({ver, significant_features}, bl);
+  return 0;
+}
+
+ // Load and decode the incremental for `ver` into `inc`.  A missing or
+ // empty incremental trips an assert, so this only ever returns 0.
+ int OSDMonitor::get_inc(version_t ver, OSDMap::Incremental& inc)
+ {
+   bufferlist inc_bl;
+   const int err = get_version(ver, inc_bl);
+   ceph_assert(err == 0);
+   ceph_assert(inc_bl.length());
+
+   auto src = inc_bl.cbegin();
+   inc.decode(src);
+
+   dout(10) << __func__ << " "
+            << " epoch " << inc.epoch
+            << " inc_crc " << inc.inc_crc
+            << " full_crc " << inc.full_crc
+            << " encode_features " << inc.encode_features << dendl;
+   return 0;
+ }
+
+ // Rebuild the full osdmap for `ver` and encode it into `bl`.
+ //
+ // Starts from the closest pinned full map at or below `ver` (or from a
+ // newer full map found in full_osd_cache), then applies every incremental
+ // up to and including `ver`.
+ //
+ // Returns 0 on success, or -ENOENT when the manifest reports no pinned map
+ // for `ver`.  A pinned map or incremental that cannot be loaded or applied
+ // trips an assert instead of returning an error.
+ int OSDMonitor::get_full_from_pinned_map(version_t ver, bufferlist& bl)
+ {
+   dout(10) << __func__ << " ver " << ver << dendl;
+
+   version_t closest_pinned = osdmap_manifest.get_lower_closest_pinned(ver);
+   if (closest_pinned == 0) {
+     return -ENOENT;
+   }
+   if (closest_pinned > ver) {
+     dout(0) << __func__ << " pinned: " << osdmap_manifest.pinned << dendl;
+   }
+   ceph_assert(closest_pinned <= ver);
+
+   dout(10) << __func__ << " closest pinned ver " << closest_pinned << dendl;
+
+   // get osdmap incremental maps and apply on top of this one.
+   bufferlist osdm_bl;
+   bool has_cached_osdmap = false;
+   // full_osd_cache entries are keyed by *significant* features (see
+   // get_version_full()), so normalize the quorum features the same way;
+   // looking up with the raw feature mask would practically never hit.
+   const uint64_t cache_features =
+     OSDMap::get_significant_features(mon.get_quorum_con_features());
+   for (version_t v = ver-1; v >= closest_pinned; --v) {
+     if (full_osd_cache.lookup({v, cache_features}, &osdm_bl)) {
+       dout(10) << __func__ << " found map in cache ver " << v << dendl;
+       // start from the cached map: fewer incrementals to replay
+       closest_pinned = v;
+       has_cached_osdmap = true;
+       break;
+     }
+   }
+
+   if (!has_cached_osdmap) {
+     int err = PaxosService::get_version_full(closest_pinned, osdm_bl);
+     if (err != 0) {
+       derr << __func__ << " closest pinned map ver " << closest_pinned
+            << " not available! error: " << cpp_strerror(err) << dendl;
+     }
+     ceph_assert(err == 0);
+   }
+
+   ceph_assert(osdm_bl.length());
+
+   OSDMap osdm;
+   osdm.decode(osdm_bl);
+
+   dout(10) << __func__ << " loaded osdmap epoch " << closest_pinned
+            << " e" << osdm.epoch
+            << " crc " << osdm.get_crc()
+            << " -- applying incremental maps." << dendl;
+
+   // features recorded by the most recently applied incremental
+   uint64_t encode_features = 0;
+   for (version_t v = closest_pinned + 1; v <= ver; ++v) {
+     dout(20) << __func__ << " applying inc epoch " << v << dendl;
+
+     OSDMap::Incremental inc;
+     int err = get_inc(v, inc);
+     ceph_assert(err == 0);
+
+     encode_features = inc.encode_features;
+
+     err = osdm.apply_incremental(inc);
+     ceph_assert(err == 0);
+
+     // this block performs paranoid checks on map retrieval
+     if (g_conf().get_val<bool>("mon_debug_extra_checks") &&
+         inc.full_crc != 0) {
+
+       uint64_t f = encode_features;
+       if (!f) {
+         f = (mon.quorum_con_features ? mon.quorum_con_features : -1);
+       }
+
+       // encode osdmap to force calculating crcs
+       bufferlist tbl;
+       osdm.encode(tbl, f | CEPH_FEATURE_RESERVED);
+       // decode osdmap to compare crcs with what's expected by incremental
+       OSDMap tosdm;
+       tosdm.decode(tbl);
+
+       if (tosdm.get_crc() != inc.full_crc) {
+         derr << __func__
+              << " osdmap crc mismatch! (osdmap crc " << tosdm.get_crc()
+              << ", expected " << inc.full_crc << ")" << dendl;
+         ceph_abort_msg("osdmap crc mismatch");
+       }
+     }
+
+     // note: we cannot add the recently computed map to the cache, as is,
+     // because we have not encoded the map into a bl.
+   }
+
+   if (!encode_features) {
+     dout(10) << __func__
+              << " last incremental map didn't have features;"
+              << " defaulting to quorum's or all" << dendl;
+     encode_features =
+       (mon.quorum_con_features ? mon.quorum_con_features : -1);
+   }
+   osdm.encode(bl, encode_features | CEPH_FEATURE_RESERVED);
+
+   return 0;
+ }
+
+ // Convenience overload: fetch the full map for `ver` into `bl` using the
+ // features currently reported for the monitor quorum.
+ int OSDMonitor::get_version_full(version_t ver, bufferlist& bl)
+ {
+   const uint64_t quorum_features = mon.get_quorum_con_features();
+   return get_version_full(ver, quorum_features, bl);
+ }
+
+ // Fetch the full map for `ver` into `bl`, encoded for `features`.
+ //
+ // Results are cached in full_osd_cache keyed by (version, significant
+ // features).  On a miss the map is read through PaxosService; if that
+ // reports -ENOENT it is rebuilt via get_full_from_pinned_map().  Returns 0
+ // on success or a negative error.
+ int OSDMonitor::get_version_full(version_t ver, uint64_t features,
+                                  bufferlist& bl)
+ {
+   const uint64_t wanted = OSDMap::get_significant_features(features);
+   if (full_osd_cache.lookup({ver, wanted}, &bl)) {
+     return 0;
+   }
+
+   int r = PaxosService::get_version_full(ver, bl);
+   if (r == -ENOENT) {
+     // build map?
+     r = get_full_from_pinned_map(ver, bl);
+   }
+   if (r < 0) {
+     return r;
+   }
+
+   // NOTE: this check is imprecise; the OSDMap encoding features may
+   // be a subset of the latest mon quorum features, but worst case we
+   // reencode once and then cache the (identical) result under both
+   // feature masks.
+   const uint64_t quorum =
+     OSDMap::get_significant_features(mon.get_quorum_con_features());
+   if (wanted != quorum) {
+     reencode_full_map(bl, features);
+   }
+
+   full_osd_cache.add_bytes({ver, wanted}, bl);
+   return 0;
+ }
+
+ // Queue a blocklist entry, expiring at `until`, for every address in `av`.
+ // Each address has its type rewritten to TYPE_ANY (require_osd_release >=
+ // nautilus) or TYPE_LEGACY before being recorded.  Returns the epoch of
+ // the pending incremental.
+ epoch_t OSDMonitor::blocklist(const entity_addrvec_t& av, utime_t until)
+ {
+   dout(10) << "blocklist " << av << " until " << until << dendl;
+   const bool use_any_type =
+     osdmap.require_osd_release >= ceph_release_t::nautilus;
+   for (auto addr : av.v) {  // by value: the type is rewritten on the copy
+     if (use_any_type) {
+       addr.set_type(entity_addr_t::TYPE_ANY);
+     } else {
+       addr.set_type(entity_addr_t::TYPE_LEGACY);
+     }
+     pending_inc.new_blocklist[addr] = until;
+   }
+   return pending_inc.epoch;
+ }
+
+ // Queue a blocklist entry for the single address `a`, expiring at `until`.
+ // The address type is rewritten to TYPE_ANY (require_osd_release >=
+ // nautilus) or TYPE_LEGACY first.  Returns the epoch of the pending
+ // incremental.
+ epoch_t OSDMonitor::blocklist(entity_addr_t a, utime_t until)
+ {
+   const bool use_any_type =
+     osdmap.require_osd_release >= ceph_release_t::nautilus;
+   if (use_any_type) {
+     a.set_type(entity_addr_t::TYPE_ANY);
+   } else {
+     a.set_type(entity_addr_t::TYPE_LEGACY);
+   }
+
+   dout(10) << "blocklist " << a << " until " << until << dendl;
+   pending_inc.new_blocklist[a] = until;
+   return pending_inc.epoch;
+ }
+
+
+ // Run check_osdmap_sub() on every subscription registered under "osdmap".
+ // Does nothing while the osdmap epoch is still 0 or when no such
+ // subscriptions exist.
+ void OSDMonitor::check_osdmap_subs()
+ {
+ dout(10) << __func__ << dendl;
+ if (!osdmap.get_epoch()) {
+ return;
+ }
+ auto osdmap_subs = mon.session_map.subs.find("osdmap");
+ if (osdmap_subs == mon.session_map.subs.end()) {
+ return;
+ }
+ auto p = osdmap_subs->second->begin();
+ while (!p.end()) {
+ auto sub = *p;
+ // advance first: check_osdmap_sub() may remove a onetime subscription,
+ // presumably invalidating an iterator still pointing at it
+ ++p;
+ check_osdmap_sub(sub);
+ }
+ }
+
+ // Bring a single osdmap subscription up to date.
+ //
+ // If the subscriber wants an epoch we already have, send incrementals
+ // starting at sub->next (or the latest full map when sub->next is 0).
+ // Then either drop the subscription (onetime) or bump sub->next past the
+ // current epoch.
+ void OSDMonitor::check_osdmap_sub(Subscription *sub)
+ {
+   dout(10) << __func__ << " " << sub << " next " << sub->next
+            << (sub->onetime ? " (onetime)":" (ongoing)") << dendl;
+
+   if (sub->next > osdmap.get_epoch()) {
+     return;  // nothing new for this subscriber yet
+   }
+
+   if (sub->next >= 1) {
+     send_incremental(sub->next, sub->session, sub->incremental_onetime);
+   } else {
+     sub->session->con->send_message(
+       build_latest_full(sub->session->con_features));
+   }
+
+   if (sub->onetime) {
+     mon.session_map.remove_sub(sub);
+   } else {
+     sub->next = osdmap.get_epoch() + 1;
+   }
+ }
+
+ // Run check_pg_creates_sub() on every "osd_pg_creates" subscription.
+ // Skipped entirely while no OSD is up.
+ void OSDMonitor::check_pg_creates_subs()
+ {
+   if (osdmap.get_num_up_osds() == 0) {
+     return;
+   }
+   ceph_assert(osdmap.get_up_osd_features() & CEPH_FEATURE_MON_STATEFUL_SUB);
+
+   mon.with_session_map([this](const MonSessionMap& session_map) {
+     auto found = session_map.subs.find("osd_pg_creates");
+     if (found == session_map.subs.end()) {
+       return;
+     }
+     for (auto sub : *found->second) {
+       check_pg_creates_sub(sub);
+     }
+   });
+ }
+
+ // Send pending pg-create messages for one "osd_pg_creates" subscription
+ // and record in sub->next how far the subscriber has been served.
+ void OSDMonitor::check_pg_creates_sub(Subscription *sub)
+ {
+   dout(20) << __func__ << " .. " << sub->session->name << dendl;
+   ceph_assert(sub->type == "osd_pg_creates");
+
+   // only send these if the OSD is up.  we will check_subs() when they do
+   // come up so they will get the creates then.
+   auto& who = sub->session->name;
+   if (!who.is_osd() || !mon.osdmon()->osdmap.is_up(who.num())) {
+     return;
+   }
+
+   sub->next = send_pg_creates(who.num(),
+                               sub->session->con.get(),
+                               sub->next);
+ }
+
+ // Stage, in pending_inc, the enabling of application `app_name` on pool
+ // `pool_id`, optionally with one metadata key/value pair.
+ //
+ // With an empty `app_key` the application is added with no metadata (a
+ // no-op if it is already present).  With `force` the key is set or
+ // overwritten; without it the pair is only added when the application is
+ // not yet present on the pool.
+ void OSDMonitor::do_application_enable(int64_t pool_id,
+                                        const std::string &app_name,
+                                        const std::string &app_key,
+                                        const std::string &app_value,
+                                        bool force)
+ {
+   ceph_assert(paxos.is_plugged() && is_writeable());
+
+   dout(20) << __func__ << ": pool_id=" << pool_id << ", app_name=" << app_name
+            << dendl;
+
+   ceph_assert(osdmap.require_osd_release >= ceph_release_t::luminous);
+
+   auto pp = osdmap.get_pg_pool(pool_id);
+   ceph_assert(pp != nullptr);
+
+   // start from the committed pool, unless a pending edit already exists
+   pg_pool_t p = *pp;
+   if (auto staged = pending_inc.new_pools.find(pool_id);
+       staged != pending_inc.new_pools.end()) {
+     p = staged->second;
+   }
+
+   if (app_key.empty()) {
+     p.application_metadata.insert({app_name, {}});
+   } else if (force) {
+     p.application_metadata[app_name][app_key] = app_value;
+   } else {
+     p.application_metadata.insert({app_name, {{app_key, app_value}}});
+   }
+
+   p.last_change = pending_inc.epoch;
+   pending_inc.new_pools[pool_id] = p;
+ }
+
+ // Stage, in pending_inc, setting pool option `opt` to `val` on pool
+ // `pool_id`.  If the pool has no pending entry yet, one is seeded from the
+ // committed osdmap.
+ void OSDMonitor::do_set_pool_opt(int64_t pool_id,
+                                  pool_opts_t::key_t opt,
+                                  pool_opts_t::value_t val)
+ {
+   dout(10) << __func__ << " pool: " << pool_id << " option: " << opt
+            << " val: " << val << dendl;
+   auto emplaced = pending_inc.new_pools.try_emplace(
+     pool_id, *osdmap.get_pg_pool(pool_id));
+   auto& staged_pool = emplaced.first->second;
+   staged_pool.opts.set(opt, val);
+ }
+
+ // Queue pool creations in `creating_pgs` for every pool in `pools` that
+ // still needs one, and return how many were queued.
+ //
+ // A pool is skipped when it is already listed as created, its crush rule
+ // is negative or missing, it has not changed since the last scan epoch, or
+ // it appears in `removed_pools`.
+ unsigned OSDMonitor::scan_for_creating_pgs(
+   const mempool::osdmap::map<int64_t,pg_pool_t>& pools,
+   const mempool::osdmap::set<int64_t>& removed_pools,
+   utime_t modified,
+   creating_pgs_t* creating_pgs) const
+ {
+   unsigned queued = 0;
+   for (auto& entry : pools) {
+     const int64_t poolid = entry.first;
+     if (creating_pgs->created_pools.count(poolid)) {
+       dout(10) << __func__ << " already created " << poolid << dendl;
+       continue;
+     }
+
+     const pg_pool_t& pool = entry.second;
+     const int ruleno = pool.get_crush_rule();
+     const bool rule_usable =
+       ruleno >= 0 && osdmap.crush->rule_exists(ruleno);
+     if (!rule_usable) {
+       continue;
+     }
+
+     const auto last_scan_epoch = creating_pgs->last_scan_epoch;
+     const auto created = pool.get_last_change();
+     if (last_scan_epoch && created <= last_scan_epoch) {
+       dout(10) << __func__ << " no change in pool " << poolid
+                << " " << pool << dendl;
+       continue;
+     }
+
+     if (removed_pools.count(poolid)) {
+       dout(10) << __func__ << " pool is being removed: " << poolid
+                << " " << pool << dendl;
+       continue;
+     }
+
+     dout(10) << __func__ << " queueing pool create for " << poolid
+              << " " << pool << dendl;
+     creating_pgs->create_pool(poolid, pool.get_pg_num(),
+                               created, modified);
+     ++queued;
+   }
+   return queued;
+ }
+
+ // Rebuild creating_pgs_by_osd_epoch (osd -> epoch -> pgs to create) from
+ // creating_pgs.pgs and the current pg mapping, then record the mapping
+ // epoch in creating_pgs_epoch.  The rebuild runs under creating_pgs_lock.
+ void OSDMonitor::update_creating_pgs()
+ {
+ dout(10) << __func__ << " " << creating_pgs.pgs.size() << " pgs creating, "
+ << creating_pgs.queue.size() << " pools in queue" << dendl;
+ decltype(creating_pgs_by_osd_epoch) new_pgs_by_osd_epoch;
+ std::lock_guard<std::mutex> l(creating_pgs_lock);
+ for (const auto& pg : creating_pgs.pgs) {
+ int acting_primary = -1;
+ auto pgid = pg.first;
+ if (!osdmap.pg_exists(pgid)) {
+ dout(20) << __func__ << " ignoring " << pgid << " which should not exist"
+ << dendl;
+ continue;
+ }
+ // default to the pg's create epoch; may be overridden below
+ auto mapped = pg.second.create_epoch;
+ dout(20) << __func__ << " looking up " << pgid << "@" << mapped << dendl;
+ spg_t spgid(pgid);
+ mapping.get_primary_and_shard(pgid, &acting_primary, &spgid);
+ // check the previous creating_pgs, look for the target to whom the pg was
+ // previously mapped
+ for (const auto& pgs_by_epoch : creating_pgs_by_osd_epoch) {
+ const auto last_acting_primary = pgs_by_epoch.first;
+ for (auto& pgs: pgs_by_epoch.second) {
+ if (pgs.second.count(spgid)) {
+ if (last_acting_primary == acting_primary) {
+ // same target as before: keep the epoch it was filed under
+ mapped = pgs.first;
+ } else {
+ dout(20) << __func__ << " " << pgid << " "
+ << " acting_primary:" << last_acting_primary
+ << " -> " << acting_primary << dendl;
+ // note epoch if the target of the create message changed.
+ mapped = mapping.get_epoch();
+ }
+ // NOTE(review): this only leaves the inner (per-epoch) loop; the outer
+ // per-osd loop keeps going and a later osd's entries may overwrite
+ // `mapped` via the else branch below -- confirm that is intended.
+ break;
+ } else {
+ // newly creating
+ mapped = mapping.get_epoch();
+ }
+ }
+ }
+ dout(10) << __func__ << " will instruct osd." << acting_primary
+ << " to create " << pgid << "@" << mapped << dendl;
+ new_pgs_by_osd_epoch[acting_primary][mapped].insert(spgid);
+ }
+ creating_pgs_by_osd_epoch = std::move(new_pgs_by_osd_epoch);
+ creating_pgs_epoch = mapping.get_epoch();
+ }
+
+ // Send `osd` an MOSDPGCreate2 over `con` listing the pgs it should create,
+ // taken from every epoch bucket >= `next` recorded for that osd in
+ // creating_pgs_by_osd_epoch.
+ //
+ // Returns the epoch the subscriber is now current through (last epoch sent
+ // + 1).  Returns `next` unchanged when nothing was sent: the creating-pgs
+ // state is stale, the osd has no entry, or no bucket at or after `next`
+ // holds pgs.
+ epoch_t OSDMonitor::send_pg_creates(int osd, Connection *con, epoch_t next) const
+ {
+   // Take the lock before touching creating_pgs_by_osd_epoch at all, even
+   // just to log it: update_creating_pgs() replaces that map while holding
+   // this same lock, so streaming it unlocked would race with the swap.
+   std::lock_guard<std::mutex> l(creating_pgs_lock);
+   dout(30) << __func__ << " osd." << osd << " next=" << next
+            << " " << creating_pgs_by_osd_epoch << dendl;
+   if (creating_pgs_epoch <= creating_pgs.last_scan_epoch) {
+     dout(20) << __func__
+              << " not using stale creating_pgs@" << creating_pgs_epoch << dendl;
+     // the subscribers will be updated when the mapping is completed anyway
+     return next;
+   }
+   auto creating_pgs_by_epoch = creating_pgs_by_osd_epoch.find(osd);
+   if (creating_pgs_by_epoch == creating_pgs_by_osd_epoch.end())
+     return next;
+   ceph_assert(!creating_pgs_by_epoch->second.empty());
+
+   auto m = make_message<MOSDPGCreate2>(creating_pgs_epoch);
+
+   epoch_t last = 0;
+   for (auto epoch_pgs = creating_pgs_by_epoch->second.lower_bound(next);
+        epoch_pgs != creating_pgs_by_epoch->second.end(); ++epoch_pgs) {
+     auto epoch = epoch_pgs->first;
+     auto& pgs = epoch_pgs->second;
+     dout(20) << __func__ << " osd." << osd << " from " << next
+              << " : epoch " << epoch << " " << pgs.size() << " pgs" << dendl;
+     last = epoch;
+     for (auto& pg : pgs) {
+       // Need the create time from the monitor using its clock to set
+       // last_scrub_stamp upon pg creation.
+       auto create = creating_pgs.pgs.find(pg.pgid);
+       ceph_assert(create != creating_pgs.pgs.end());
+       m->pgs.emplace(pg, make_pair(create->second.create_epoch,
+                                    create->second.create_stamp));
+       // history/past_intervals are only attached when a history is recorded
+       if (create->second.history.epoch_created) {
+         dout(20) << __func__ << " " << pg << " " << create->second.history
+                  << " " << create->second.past_intervals << dendl;
+         m->pg_extra.emplace(pg, make_pair(create->second.history,
+                                           create->second.past_intervals));
+       }
+       dout(20) << __func__ << " will create " << pg
+                << " at " << create->second.create_epoch << dendl;
+     }
+   }
+   if (!m->pgs.empty()) {
+     con->send_message2(std::move(m));
+   } else {
+     dout(20) << __func__ << " osd." << osd << " from " << next
+              << " has nothing to send" << dendl;
+     return next;
+   }
+
+   // sub is current through last + 1
+   return last + 1;
+ }
+
+// TICK
+
+
+ // Periodic housekeeping for the OSD monitor.
+ //
+ // On every active monitor: reload the osdmap manifest and, when memory
+ // autotuning is enabled under tcmalloc, rebalance the cache sizes.
+ // On the leader only: check osd report timeouts and failures, decide
+ // whether pruning is due, mark long-down osds out, expire blocklist
+ // entries, prune purged snaps and refresh pool status -- then propose the
+ // pending incremental if any of that (or a pg_temp change) requires it.
+ void OSDMonitor::tick()
+ {
+ if (!is_active()) return;
+
+ dout(10) << osdmap << dendl;
+
+ // always update osdmap manifest, regardless of being the leader.
+ load_osdmap_manifest();
+
+ // always tune priority cache manager memory on leader and peons
+ if (ceph_using_tcmalloc() && mon_memory_autotune) {
+ std::lock_guard l(balancer_lock);
+ if (pcm != nullptr) {
+ pcm->tune_memory();
+ pcm->balance();
+ _set_new_cache_sizes();
+ dout(10) << "tick balancer "
+ << " inc cache_bytes: " << inc_cache->get_cache_bytes()
+ << " inc comtd_bytes: " << inc_cache->get_committed_size()
+ << " inc used_bytes: " << inc_cache->_get_used_bytes()
+ << " inc num_osdmaps: " << inc_cache->_get_num_osdmaps()
+ << dendl;
+ dout(10) << "tick balancer "
+ << " full cache_bytes: " << full_cache->get_cache_bytes()
+ << " full comtd_bytes: " << full_cache->get_committed_size()
+ << " full used_bytes: " << full_cache->_get_used_bytes()
+ << " full num_osdmaps: " << full_cache->_get_num_osdmaps()
+ << dendl;
+ }
+ }
+
+ // everything below stages changes in pending_inc: leader only
+ if (!mon.is_leader()) return;
+
+ bool do_propose = false;
+ utime_t now = ceph_clock_now();
+
+ if (handle_osd_timeouts(now, last_osd_report)) {
+ do_propose = true;
+ }
+
+ // mark osds down?
+ if (check_failures(now)) {
+ do_propose = true;
+ }
+
+ // Force a proposal if we need to prune; pruning is performed on
+ // ``encode_pending()``, hence why we need to regularly trigger a proposal
+ // even if there's nothing going on.
+ if (is_prune_enabled() && should_prune()) {
+ do_propose = true;
+ }
+
+ // mark down osds out?
+
+ /* can_mark_out() checks if we can mark osds as being out. The -1 has no
+ * influence at all. The decision is made based on the ratio of "in" osds,
+ * and the function returns false if this ratio is lower than the minimum
+ * ratio set by g_conf()->mon_osd_min_in_ratio. So it's not really up to us.
+ */
+ if (can_mark_out(-1)) {
+ string down_out_subtree_limit = g_conf().get_val<string>(
+ "mon_osd_down_out_subtree_limit");
+ set<int> down_cache; // quick cache of down subtrees
+
+ map<int,utime_t>::iterator i = down_pending_out.begin();
+ while (i != down_pending_out.end()) {
+ int o = i->first;
+ // how long this osd has been down
+ utime_t down = now;
+ down -= i->second;
+ // advance now: the entry for `o` may be erased at the bottom of the loop
+ ++i;
+
+ if (osdmap.is_down(o) &&
+ osdmap.is_in(o) &&
+ can_mark_out(o)) {
+ utime_t orig_grace(g_conf()->mon_osd_down_out_interval, 0);
+ utime_t grace = orig_grace;
+ double my_grace = 0.0;
+
+ if (g_conf()->mon_osd_adjust_down_out_interval) {
+ // scale grace period the same way we do the heartbeat grace.
+ const osd_xinfo_t& xi = osdmap.get_xinfo(o);
+ double halflife = (double)g_conf()->mon_osd_laggy_halflife;
+ double decay_k = ::log(.5) / halflife;
+ double decay = exp((double)down * decay_k);
+ dout(20) << "osd." << o << " laggy halflife " << halflife << " decay_k " << decay_k
+ << " down for " << down << " decay " << decay << dendl;
+ my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability;
+ grace += my_grace;
+ }
+
+ // is this an entire large subtree down?
+ if (down_out_subtree_limit.length()) {
+ int type = osdmap.crush->get_type_id(down_out_subtree_limit);
+ if (type > 0) {
+ if (osdmap.containing_subtree_is_down(cct, o, type, &down_cache)) {
+ dout(10) << "tick entire containing " << down_out_subtree_limit
+ << " subtree for osd." << o
+ << " is down; resetting timer" << dendl;
+ // reset timer, too.
+ down_pending_out[o] = now;
+ continue;
+ }
+ }
+ }
+
+ bool down_out = !osdmap.is_destroyed(o) &&
+ g_conf()->mon_osd_down_out_interval > 0 && down.sec() >= grace;
+ bool destroyed_out = osdmap.is_destroyed(o) &&
+ g_conf()->mon_osd_destroyed_out_interval > 0 &&
+ // this is not precise enough as we did not make a note when this osd
+ // was marked as destroyed, but let's not bother with that
+ // complexity for now.
+ down.sec() >= g_conf()->mon_osd_destroyed_out_interval;
+ if (down_out || destroyed_out) {
+ dout(10) << "tick marking osd." << o << " OUT after " << down
+ << " sec (target " << grace << " = " << orig_grace << " + " << my_grace << ")" << dendl;
+ pending_inc.new_weight[o] = CEPH_OSD_OUT;
+
+ // set the AUTOOUT bit.
+ if (pending_inc.new_state.count(o) == 0)
+ pending_inc.new_state[o] = 0;
+ pending_inc.new_state[o] |= CEPH_OSD_AUTOOUT;
+
+ // remember previous weight
+ if (pending_inc.new_xinfo.count(o) == 0)
+ pending_inc.new_xinfo[o] = osdmap.osd_xinfo[o];
+ pending_inc.new_xinfo[o].old_weight = osdmap.osd_weight[o];
+
+ do_propose = true;
+
+ mon.clog->info() << "Marking osd." << o << " out (has been down for "
+ << int(down.sec()) << " seconds)";
+ } else
+ // grace not yet elapsed: keep the entry and look again next tick
+ continue;
+ }
+
+ // reached when the osd was just marked out, or is no longer a
+ // down+in candidate that may be marked out: stop tracking it
+ down_pending_out.erase(o);
+ }
+ } else {
+ dout(10) << "tick NOOUT flag set, not checking down osds" << dendl;
+ }
+
+ // expire blocklisted items?
+ for (ceph::unordered_map<entity_addr_t,utime_t>::iterator p = osdmap.blocklist.begin();
+ p != osdmap.blocklist.end();
+ ++p) {
+ if (p->second < now) {
+ dout(10) << "expiring blocklist item " << p->first << " expired " << p->second << " < now " << now << dendl;
+ pending_inc.old_blocklist.push_back(p->first);
+ do_propose = true;
+ }
+ }
+ for (auto p = osdmap.range_blocklist.begin();
+ p != osdmap.range_blocklist.end();
+ ++p) {
+ if (p->second < now) {
+ dout(10) << "expiring range_blocklist item " << p->first
+ << " expired " << p->second << " < now " << now << dendl;
+ pending_inc.old_range_blocklist.push_back(p->first);
+ do_propose = true;
+ }
+ }
+
+ if (try_prune_purged_snaps()) {
+ do_propose = true;
+ }
+
+ if (update_pools_status())
+ do_propose = true;
+
+ if (do_propose ||
+ !pending_inc.new_pg_temp.empty()) // also propose if we adjusted pg_temp
+ propose_pending();
+ }
+
+ // Apply the committed sizes reported by the priority caches to the
+ // incremental and full osdmap caches, and log the resulting figures.
+ // When either pcm or the rocksdb kv cache is unset, both osdmap caches are
+ // set to 0 bytes.
+ void OSDMonitor::_set_new_cache_sizes()
+ {
+   uint64_t cache_size = 0;
+   int64_t inc_alloc = 0;
+   int64_t full_alloc = 0;
+   int64_t kv_alloc = 0;
+
+   const bool have_caches =
+     (pcm != nullptr) && (rocksdb_binned_kv_cache != nullptr);
+   if (have_caches) {
+     cache_size = pcm->get_tuned_mem();
+     inc_alloc = inc_cache->get_committed_size();
+     full_alloc = full_cache->get_committed_size();
+     kv_alloc = rocksdb_binned_kv_cache->get_committed_size();
+   }
+
+   inc_osd_cache.set_bytes(inc_alloc);
+   full_osd_cache.set_bytes(full_alloc);
+
+   dout(1) << __func__ << " cache_size:" << cache_size
+           << " inc_alloc: " << inc_alloc
+           << " full_alloc: " << full_alloc
+           << " kv_alloc: " << kv_alloc
+           << dendl;
+ }
+
+ // Mark down, in pending_inc, every up osd whose last report is older than
+ // the report timeout.
+ //
+ // `last_osd_report` maps osd id -> (time of last report, an interval in
+ // seconds that is doubled and compared with mon_osd_report_timeout).
+ // Entries for osds that no longer exist are erased, and up osds without an
+ // entry get one stamped `now`.
+ //
+ // Returns true when at least one osd was newly marked down.  Always
+ // returns false while this monitor has been leader for less than
+ // mon_osd_report_timeout.
+ bool OSDMonitor::handle_osd_timeouts(const utime_t &now,
+ std::map<int, std::pair<utime_t, int>> &last_osd_report)
+ {
+ utime_t timeo(g_conf()->mon_osd_report_timeout, 0);
+ if (now - mon.get_leader_since() < timeo) {
+ // We haven't been the leader for long enough to consider OSD timeouts
+ return false;
+ }
+
+ int max_osd = osdmap.get_max_osd();
+ bool new_down = false;
+
+ for (int i=0; i < max_osd; ++i) {
+ dout(30) << __func__ << ": checking up on osd " << i << dendl;
+ if (!osdmap.exists(i)) {
+ last_osd_report.erase(i); // if any
+ continue;
+ }
+ if (!osdmap.is_up(i))
+ continue;
+ const std::map<int, std::pair<utime_t, int>>::const_iterator t = last_osd_report.find(i);
+ if (t == last_osd_report.end()) {
+ // it wasn't in the map; start the timer.
+ last_osd_report[i].first = now;
+ last_osd_report[i].second = 0;
+ } else if (can_mark_down(i)) {
+ utime_t diff = now - t->second.first;
+ // we use the max(mon_osd_report_timeout, 2*osd_beacon_report_interval) as timeout
+ // to allow for the osd to miss a beacon.
+ int mon_osd_report_timeout = g_conf()->mon_osd_report_timeout;
+ utime_t max_timeout(std::max(mon_osd_report_timeout, 2 * t->second.second), 0);
+ if (diff > max_timeout) {
+ mon.clog->info() << "osd." << i << " marked down after no beacon for "
+ << diff << " seconds";
+ derr << "no beacon from osd." << i << " since " << t->second.first
+ << ", " << diff << " seconds ago. marking down" << dendl;
+ // NOTE(review): presumably new_state is applied as a toggle mask, so
+ // setting the UP bit here flips the osd from up to down -- confirm.
+ pending_inc.new_state[i] = CEPH_OSD_UP;
+ new_down = true;
+ }
+ }
+ }
+ return new_down;
+ }
+
+ // Parse the cpu list string `strlist` and dump it to `f` as an array
+ // section called `name`, one "cpu" integer per entry.  Nothing is emitted
+ // when the string fails to parse.
+ static void dump_cpu_list(Formatter *f, const char *name,
+                           const string& strlist)
+ {
+   cpu_set_t cpu_set;
+   size_t cpu_set_size;
+   const bool parsed =
+     parse_cpu_set_list(strlist.c_str(), &cpu_set_size, &cpu_set) >= 0;
+   if (!parsed) {
+     return;
+   }
+
+   const set<int> cpus = cpu_set_to_set(cpu_set_size, &cpu_set);
+   f->open_array_section(name);
+   for (const int cpu : cpus) {
+     f->dump_int("cpu", cpu);
+   }
+   f->close_section();
+ }
+
+ // Dump the monitor's osd-related state to `f`: the osdmap, per-osd
+ // metadata, clean-epoch bookkeeping, the first/last committed versions,
+ // the crush map and, when one is loaded, the osdmap manifest.
+ void OSDMonitor::dump_info(Formatter *f)
+ {
+ f->open_object_section("osdmap");
+ osdmap.dump(f, cct);
+ f->close_section();
+
+ f->open_array_section("osd_metadata");
+ for (int i=0; i<osdmap.get_max_osd(); ++i) {
+ if (osdmap.exists(i)) {
+ f->open_object_section("osd");
+ f->dump_unsigned("id", i);
+ // return value ignored: a failed metadata lookup does not stop the dump
+ dump_osd_metadata(i, f, NULL);
+ f->close_section();
+ }
+ }
+ f->close_section();
+
+ f->open_object_section("osdmap_clean_epochs");
+ f->dump_unsigned("min_last_epoch_clean", get_min_last_epoch_clean());
+
+ f->open_object_section("last_epoch_clean");
+ last_epoch_clean.dump(f);
+ f->close_section();
+
+ f->open_array_section("osd_epochs");
+ for (auto& osd_epoch : osd_epochs) {
+ f->open_object_section("osd");
+ f->dump_unsigned("id", osd_epoch.first);
+ f->dump_unsigned("epoch", osd_epoch.second);
+ f->close_section();
+ }
+ f->close_section(); // osd_epochs
+
+ f->close_section(); // osdmap_clean_epochs
+
+ f->dump_unsigned("osdmap_first_committed", get_first_committed());
+ f->dump_unsigned("osdmap_last_committed", get_last_committed());
+
+ f->open_object_section("crushmap");
+ osdmap.crush->dump(f);
+ f->close_section();
+
+ if (has_osdmap_manifest) {
+ f->open_object_section("osdmap_manifest");
+ osdmap_manifest.dump(f);
+ f->close_section();
+ }
+ }
+
+ namespace {
+   // Identifiers for pool properties.  Enumerator order defines the values,
+   // so it must not be changed.
+   enum osd_pool_get_choices {
+     SIZE, MIN_SIZE,
+     PG_NUM, PGP_NUM, CRUSH_RULE, HASHPSPOOL, EC_OVERWRITES,
+     NODELETE, NOPGCHANGE, NOSIZECHANGE,
+     WRITE_FADVISE_DONTNEED, NOSCRUB, NODEEP_SCRUB,
+     HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
+     USE_GMT_HITSET, TARGET_MAX_OBJECTS, TARGET_MAX_BYTES,
+     CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
+     CACHE_TARGET_FULL_RATIO,
+     CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
+     ERASURE_CODE_PROFILE, MIN_READ_RECENCY_FOR_PROMOTE,
+     MIN_WRITE_RECENCY_FOR_PROMOTE, FAST_READ,
+     HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N,
+     SCRUB_MIN_INTERVAL, SCRUB_MAX_INTERVAL, DEEP_SCRUB_INTERVAL,
+     RECOVERY_PRIORITY, RECOVERY_OP_PRIORITY, SCRUB_PRIORITY,
+     COMPRESSION_MODE, COMPRESSION_ALGORITHM, COMPRESSION_REQUIRED_RATIO,
+     COMPRESSION_MAX_BLOB_SIZE, COMPRESSION_MIN_BLOB_SIZE,
+     CSUM_TYPE, CSUM_MAX_BLOCK, CSUM_MIN_BLOCK, FINGERPRINT_ALGORITHM,
+     PG_AUTOSCALE_MODE, PG_NUM_MIN, TARGET_SIZE_BYTES, TARGET_SIZE_RATIO,
+     PG_AUTOSCALE_BIAS, DEDUP_TIER, DEDUP_CHUNK_ALGORITHM,
+     DEDUP_CDC_CHUNK_SIZE, POOL_EIO, BULK, PG_NUM_MAX, READ_RATIO };
+
+   // Return the choices that are in `first` but not in `second`.
+   std::set<osd_pool_get_choices>
+   subtract_second_from_first(const std::set<osd_pool_get_choices>& first,
+                              const std::set<osd_pool_get_choices>& second)
+   {
+     std::set<osd_pool_get_choices> remaining;
+     std::copy_if(first.begin(), first.end(),
+                  std::inserter(remaining, remaining.end()),
+                  [&second](osd_pool_get_choices choice) {
+                    return second.count(choice) == 0;
+                  });
+     return remaining;
+   }
+ }
+
+
+bool OSDMonitor::preprocess_command(MonOpRequestRef op)
+{
+ op->mark_osdmon_event(__func__);
+ auto m = op->get_req<MMonCommand>();
+ int r = 0;
+ bufferlist rdata;
+ stringstream ss, ds;
+
+ cmdmap_t cmdmap;
+ if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
+ string rs = ss.str();
+ mon.reply_command(op, -EINVAL, rs, get_last_committed());
+ return true;
+ }
+
+ MonSession *session = op->get_session();
+ if (!session) {
+ derr << __func__ << " no session" << dendl;
+ mon.reply_command(op, -EACCES, "access denied", get_last_committed());
+ return true;
+ }
+
+ string prefix;
+ cmd_getval(cmdmap, "prefix", prefix);
+
+ string format = cmd_getval_or<string>(cmdmap, "format", "plain");
+ boost::scoped_ptr<Formatter> f(Formatter::create(format));
+
+ if (prefix == "osd stat") {
+ if (f) {
+ f->open_object_section("osdmap");
+ osdmap.print_summary(f.get(), ds, "", true);
+ f->close_section();
+ f->flush(rdata);
+ } else {
+ osdmap.print_summary(nullptr, ds, "", true);
+ rdata.append(ds);
+ }
+ }
+ else if (prefix == "osd dump" ||
+ prefix == "osd tree" ||
+ prefix == "osd tree-from" ||
+ prefix == "osd ls" ||
+ prefix == "osd getmap" ||
+ prefix == "osd getcrushmap" ||
+ prefix == "osd ls-tree" ||
+ prefix == "osd info") {
+
+ epoch_t epoch = cmd_getval_or<int64_t>(cmdmap, "epoch", osdmap.get_epoch());
+ bufferlist osdmap_bl;
+ int err = get_version_full(epoch, osdmap_bl);
+ if (err == -ENOENT) {
+ r = -ENOENT;
+ ss << "there is no map for epoch " << epoch;
+ goto reply;
+ }
+ ceph_assert(err == 0);
+ ceph_assert(osdmap_bl.length());
+
+ OSDMap *p;
+ if (epoch == osdmap.get_epoch()) {
+ p = &osdmap;
+ } else {
+ p = new OSDMap;
+ p->decode(osdmap_bl);
+ }
+
+ auto sg = make_scope_guard([&] {
+ if (p != &osdmap) {
+ delete p;
+ }
+ });
+
+ if (prefix == "osd dump") {
+ stringstream ds;
+ if (f) {
+ f->open_object_section("osdmap");
+ p->dump(f.get(), cct);
+ f->close_section();
+ f->flush(ds);
+ } else {
+ p->print(cct, ds);
+ }
+ rdata.append(ds);
+ if (!f)
+ ds << " ";
+ } else if (prefix == "osd ls") {
+ if (f) {
+ f->open_array_section("osds");
+ for (int i = 0; i < osdmap.get_max_osd(); i++) {
+ if (osdmap.exists(i)) {
+ f->dump_int("osd", i);
+ }
+ }
+ f->close_section();
+ f->flush(ds);
+ } else {
+ bool first = true;
+ for (int i = 0; i < osdmap.get_max_osd(); i++) {
+ if (osdmap.exists(i)) {
+ if (!first)
+ ds << "\n";
+ first = false;
+ ds << i;
+ }
+ }
+ }
+ rdata.append(ds);
+ } else if (prefix == "osd info") {
+ int64_t osd_id;
+ bool do_single_osd = true;
+ if (!cmd_getval(cmdmap, "id", osd_id)) {
+ do_single_osd = false;
+ }
+
+ if (do_single_osd && !osdmap.exists(osd_id)) {
+ ss << "osd." << osd_id << " does not exist";
+ r = -EINVAL;
+ goto reply;
+ }
+
+ if (f) {
+ if (do_single_osd) {
+ osdmap.dump_osd(osd_id, f.get());
+ } else {
+ osdmap.dump_osds(f.get());
+ }
+ f->flush(ds);
+ } else {
+ if (do_single_osd) {
+ osdmap.print_osd(osd_id, ds);
+ } else {
+ osdmap.print_osds(ds);
+ }
+ }
+ rdata.append(ds);
+ } else if (prefix == "osd tree" || prefix == "osd tree-from") {
+ string bucket;
+ if (prefix == "osd tree-from") {
+ cmd_getval(cmdmap, "bucket", bucket);
+ if (!osdmap.crush->name_exists(bucket)) {
+ ss << "bucket '" << bucket << "' does not exist";
+ r = -ENOENT;
+ goto reply;
+ }
+ int id = osdmap.crush->get_item_id(bucket);
+ if (id >= 0) {
+ ss << "\"" << bucket << "\" is not a bucket";
+ r = -EINVAL;
+ goto reply;
+ }
+ }
+
+ vector<string> states;
+ cmd_getval(cmdmap, "states", states);
+ unsigned filter = 0;
+ for (auto& s : states) {
+ if (s == "up") {
+ filter |= OSDMap::DUMP_UP;
+ } else if (s == "down") {
+ filter |= OSDMap::DUMP_DOWN;
+ } else if (s == "in") {
+ filter |= OSDMap::DUMP_IN;
+ } else if (s == "out") {
+ filter |= OSDMap::DUMP_OUT;
+ } else if (s == "destroyed") {
+ filter |= OSDMap::DUMP_DESTROYED;
+ } else {
+ ss << "unrecognized state '" << s << "'";
+ r = -EINVAL;
+ goto reply;
+ }
+ }
+ if ((filter & (OSDMap::DUMP_IN|OSDMap::DUMP_OUT)) ==
+ (OSDMap::DUMP_IN|OSDMap::DUMP_OUT)) {
+ ss << "cannot specify both 'in' and 'out'";
+ r = -EINVAL;
+ goto reply;
+ }
+ if (((filter & (OSDMap::DUMP_UP|OSDMap::DUMP_DOWN)) ==
+ (OSDMap::DUMP_UP|OSDMap::DUMP_DOWN)) ||
+ ((filter & (OSDMap::DUMP_UP|OSDMap::DUMP_DESTROYED)) ==
+ (OSDMap::DUMP_UP|OSDMap::DUMP_DESTROYED)) ||
+ ((filter & (OSDMap::DUMP_DOWN|OSDMap::DUMP_DESTROYED)) ==
+ (OSDMap::DUMP_DOWN|OSDMap::DUMP_DESTROYED))) {
+ ss << "can specify only one of 'up', 'down' and 'destroyed'";
+ r = -EINVAL;
+ goto reply;
+ }
+ if (f) {
+ f->open_object_section("tree");
+ p->print_tree(f.get(), NULL, filter, bucket);
+ f->close_section();
+ f->flush(ds);
+ } else {
+ p->print_tree(NULL, &ds, filter, bucket);
+ }
+ rdata.append(ds);
+ } else if (prefix == "osd getmap") {
+ rdata.append(osdmap_bl);
+ ss << "got osdmap epoch " << p->get_epoch();
+ } else if (prefix == "osd getcrushmap") {
+ p->crush->encode(rdata, mon.get_quorum_con_features());
+ ss << p->get_crush_version();
+ } else if (prefix == "osd ls-tree") {
+ string bucket_name;
+ cmd_getval(cmdmap, "name", bucket_name);
+ set<int> osds;
+ r = p->get_osds_by_bucket_name(bucket_name, &osds);
+ if (r == -ENOENT) {
+ ss << "\"" << bucket_name << "\" does not exist";
+ goto reply;
+ } else if (r < 0) {
+ ss << "can not parse bucket name:\"" << bucket_name << "\"";
+ goto reply;
+ }
+
+ if (f) {
+ f->open_array_section("osds");
+ for (auto &i : osds) {
+ if (osdmap.exists(i)) {
+ f->dump_int("osd", i);
+ }
+ }
+ f->close_section();
+ f->flush(ds);
+ } else {
+ bool first = true;
+ for (auto &i : osds) {
+ if (osdmap.exists(i)) {
+ if (!first)
+ ds << "\n";
+ first = false;
+ ds << i;
+ }
+ }
+ }
+
+ rdata.append(ds);
+ }
+ } else if (prefix == "osd getmaxosd") {
+ if (f) {
+ f->open_object_section("getmaxosd");
+ f->dump_unsigned("epoch", osdmap.get_epoch());
+ f->dump_int("max_osd", osdmap.get_max_osd());
+ f->close_section();
+ f->flush(rdata);
+ } else {
+ ds << "max_osd = " << osdmap.get_max_osd() << " in epoch " << osdmap.get_epoch();
+ rdata.append(ds);
+ }
+ } else if (prefix == "osd utilization") {
+ string out;
+ osdmap.summarize_mapping_stats(NULL, NULL, &out, f.get());
+ if (f)
+ f->flush(rdata);
+ else
+ rdata.append(out);
+ r = 0;
+ goto reply;
+ } else if (prefix == "osd find") {
+ int64_t osd;
+ if (!cmd_getval(cmdmap, "id", osd)) {
+ ss << "unable to parse osd id value '"
+ << cmd_vartype_stringify(cmdmap["id"]) << "'";
+ r = -EINVAL;
+ goto reply;
+ }
+ if (!osdmap.exists(osd)) {
+ ss << "osd." << osd << " does not exist";
+ r = -ENOENT;
+ goto reply;
+ }
+ string format;
+ cmd_getval(cmdmap, "format", format);
+ boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
+ f->open_object_section("osd_location");
+ f->dump_int("osd", osd);
+ f->dump_object("addrs", osdmap.get_addrs(osd));
+ f->dump_stream("osd_fsid") << osdmap.get_uuid(osd);
+
+ // try to identify host, pod/container name, etc.
+ map<string,string> m;
+ load_metadata(osd, m, nullptr);
+ if (auto p = m.find("hostname"); p != m.end()) {
+ f->dump_string("host", p->second);
+ }
+ for (auto& k : {
+ "pod_name", "pod_namespace", // set by rook
+ "container_name" // set by cephadm, ceph-ansible
+ }) {
+ if (auto p = m.find(k); p != m.end()) {
+ f->dump_string(k, p->second);
+ }
+ }
+
+ // crush is helpful too
+ f->open_object_section("crush_location");
+ map<string,string> loc = osdmap.crush->get_full_location(osd);
+ for (map<string,string>::iterator p = loc.begin(); p != loc.end(); ++p)
+ f->dump_string(p->first.c_str(), p->second);
+ f->close_section();
+ f->close_section();
+ f->flush(rdata);
+ } else if (prefix == "osd metadata") {
+ int64_t osd = -1;
+ if (cmd_vartype_stringify(cmdmap["id"]).size() &&
+ !cmd_getval(cmdmap, "id", osd)) {
+ ss << "unable to parse osd id value '"
+ << cmd_vartype_stringify(cmdmap["id"]) << "'";
+ r = -EINVAL;
+ goto reply;
+ }
+ if (osd >= 0 && !osdmap.exists(osd)) {
+ ss << "osd." << osd << " does not exist";
+ r = -ENOENT;
+ goto reply;
+ }
+ string format;
+ cmd_getval(cmdmap, "format", format);
+ boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
+ if (osd >= 0) {
+ f->open_object_section("osd_metadata");
+ f->dump_unsigned("id", osd);
+ r = dump_osd_metadata(osd, f.get(), &ss);
+ if (r < 0)
+ goto reply;
+ f->close_section();
+ } else {
+ r = 0;
+ f->open_array_section("osd_metadata");
+ for (int i=0; i<osdmap.get_max_osd(); ++i) {
+ if (osdmap.exists(i)) {
+ f->open_object_section("osd");
+ f->dump_unsigned("id", i);
+ r = dump_osd_metadata(i, f.get(), NULL);
+ if (r == -EINVAL || r == -ENOENT) {
+ // Drop error, continue to get other daemons' metadata
+ dout(4) << "No metadata for osd." << i << dendl;
+ r = 0;
+ } else if (r < 0) {
+ // Unexpected error
+ goto reply;
+ }
+ f->close_section();
+ }
+ }
+ f->close_section();
+ }
+ f->flush(rdata);
+ } else if (prefix == "osd versions") {
+ if (!f)
+ f.reset(Formatter::create("json-pretty"));
+ count_metadata("ceph_version", f.get());
+ f->flush(rdata);
+ r = 0;
+ } else if (prefix == "osd count-metadata") {
+ if (!f)
+ f.reset(Formatter::create("json-pretty"));
+ string field;
+ cmd_getval(cmdmap, "property", field);
+ count_metadata(field, f.get());
+ f->flush(rdata);
+ r = 0;
+ } else if (prefix == "osd numa-status") {
+ TextTable tbl;
+ if (f) {
+ f->open_array_section("osds");
+ } else {
+ tbl.define_column("OSD", TextTable::LEFT, TextTable::RIGHT);
+ tbl.define_column("HOST", TextTable::LEFT, TextTable::LEFT);
+ tbl.define_column("NETWORK", TextTable::RIGHT, TextTable::RIGHT);
+ tbl.define_column("STORAGE", TextTable::RIGHT, TextTable::RIGHT);
+ tbl.define_column("AFFINITY", TextTable::RIGHT, TextTable::RIGHT);
+ tbl.define_column("CPUS", TextTable::LEFT, TextTable::LEFT);
+ }
+ for (int i=0; i<osdmap.get_max_osd(); ++i) {
+ if (osdmap.exists(i)) {
+ map<string,string> m;
+ ostringstream err;
+ if (load_metadata(i, m, &err) < 0) {
+ continue;
+ }
+ string host;
+ auto p = m.find("hostname");
+ if (p != m.end()) {
+ host = p->second;
+ }
+ if (f) {
+ f->open_object_section("osd");
+ f->dump_int("osd", i);
+ f->dump_string("host", host);
+ for (auto n : { "network_numa_node", "objectstore_numa_node",
+ "numa_node" }) {
+ p = m.find(n);
+ if (p != m.end()) {
+ f->dump_int(n, atoi(p->second.c_str()));
+ }
+ }
+ for (auto n : { "network_numa_nodes", "objectstore_numa_nodes" }) {
+ p = m.find(n);
+ if (p != m.end()) {
+ list<string> ls = get_str_list(p->second, ",");
+ f->open_array_section(n);
+ for (auto node : ls) {
+ f->dump_int("node", atoi(node.c_str()));
+ }
+ f->close_section();
+ }
+ }
+ for (auto n : { "numa_node_cpus" }) {
+ p = m.find(n);
+ if (p != m.end()) {
+ dump_cpu_list(f.get(), n, p->second);
+ }
+ }
+ f->close_section();
+ } else {
+ tbl << i;
+ tbl << host;
+ p = m.find("network_numa_nodes");
+ if (p != m.end()) {
+ tbl << p->second;
+ } else {
+ tbl << "-";
+ }
+ p = m.find("objectstore_numa_nodes");
+ if (p != m.end()) {
+ tbl << p->second;
+ } else {
+ tbl << "-";
+ }
+ p = m.find("numa_node");
+ auto q = m.find("numa_node_cpus");
+ if (p != m.end() && q != m.end()) {
+ tbl << p->second;
+ tbl << q->second;
+ } else {
+ tbl << "-";
+ tbl << "-";
+ }
+ tbl << TextTable::endrow;
+ }
+ }
+ }
+ if (f) {
+ f->close_section();
+ f->flush(rdata);
+ } else {
+ rdata.append(stringify(tbl));
+ }
+ } else if (prefix == "osd map") {
+ string poolstr, objstr, namespacestr;
+ cmd_getval(cmdmap, "pool", poolstr);
+ cmd_getval(cmdmap, "object", objstr);
+ cmd_getval(cmdmap, "nspace", namespacestr);
+
+ int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
+ if (pool < 0) {
+ ss << "pool " << poolstr << " does not exist";
+ r = -ENOENT;
+ goto reply;
+ }
+ object_locator_t oloc(pool, namespacestr);
+ object_t oid(objstr);
+ pg_t pgid = osdmap.object_locator_to_pg(oid, oloc);
+ pg_t mpgid = osdmap.raw_pg_to_pg(pgid);
+ vector<int> up, acting;
+ int up_p, acting_p;
+ osdmap.pg_to_up_acting_osds(mpgid, &up, &up_p, &acting, &acting_p);
+
+ string fullobjname;
+ if (!namespacestr.empty())
+ fullobjname = namespacestr + string("/") + oid.name;
+ else
+ fullobjname = oid.name;
+ if (f) {
+ f->open_object_section("osd_map");
+ f->dump_unsigned("epoch", osdmap.get_epoch());
+ f->dump_string("pool", poolstr);
+ f->dump_int("pool_id", pool);
+ f->dump_stream("objname") << fullobjname;
+ f->dump_stream("raw_pgid") << pgid;
+ f->dump_stream("pgid") << mpgid;
+ f->open_array_section("up");
+ for (vector<int>::iterator p = up.begin(); p != up.end(); ++p)
+ f->dump_int("osd", *p);
+ f->close_section();
+ f->dump_int("up_primary", up_p);
+ f->open_array_section("acting");
+ for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p)
+ f->dump_int("osd", *p);
+ f->close_section();
+ f->dump_int("acting_primary", acting_p);
+ f->close_section(); // osd_map
+ f->flush(rdata);
+ } else {
+ ds << "osdmap e" << osdmap.get_epoch()
+ << " pool '" << poolstr << "' (" << pool << ")"
+ << " object '" << fullobjname << "' ->"
+ << " pg " << pgid << " (" << mpgid << ")"
+ << " -> up (" << pg_vector_string(up) << ", p" << up_p << ") acting ("
+ << pg_vector_string(acting) << ", p" << acting_p << ")";
+ rdata.append(ds);
+ }
+
+ } else if (prefix == "pg map") {
+ pg_t pgid;
+ vector<int> up, acting;
+ r = parse_pgid(cmdmap, ss, pgid);
+ if (r < 0)
+ goto reply;
+ pg_t mpgid = osdmap.raw_pg_to_pg(pgid);
+ osdmap.pg_to_up_acting_osds(pgid, up, acting);
+ if (f) {
+ f->open_object_section("pg_map");
+ f->dump_unsigned("epoch", osdmap.get_epoch());
+ f->dump_stream("raw_pgid") << pgid;
+ f->dump_stream("pgid") << mpgid;
+ f->open_array_section("up");
+ for (auto osd : up) {
+ f->dump_int("up_osd", osd);
+ }
+ f->close_section();
+ f->open_array_section("acting");
+ for (auto osd : acting) {
+ f->dump_int("acting_osd", osd);
+ }
+ f->close_section();
+ f->close_section();
+ f->flush(rdata);
+ } else {
+ ds << "osdmap e" << osdmap.get_epoch()
+ << " pg " << pgid << " (" << mpgid << ")"
+ << " -> up " << up << " acting " << acting;
+ rdata.append(ds);
+ }
+ goto reply;
+
+ } else if (prefix == "osd lspools") {
+ if (f)
+ f->open_array_section("pools");
+ for (map<int64_t, pg_pool_t>::iterator p = osdmap.pools.begin();
+ p != osdmap.pools.end();
+ ++p) {
+ if (f) {
+ f->open_object_section("pool");
+ f->dump_int("poolnum", p->first);
+ f->dump_string("poolname", osdmap.pool_name[p->first]);
+ f->close_section();
+ } else {
+ ds << p->first << ' ' << osdmap.pool_name[p->first];
+ if (next(p) != osdmap.pools.end()) {
+ ds << '\n';
+ }
+ }
+ }
+ if (f) {
+ f->close_section();
+ f->flush(ds);
+ }
+ rdata.append(ds);
+ } else if (prefix == "osd blocklist ls" ||
+ prefix == "osd blacklist ls") {
+ if (f)
+ f->open_array_section("blocklist");
+
+ for (ceph::unordered_map<entity_addr_t,utime_t>::iterator p = osdmap.blocklist.begin();
+ p != osdmap.blocklist.end();
+ ++p) {
+ if (f) {
+ f->open_object_section("entry");
+ f->dump_string("addr", p->first.get_legacy_str());
+ f->dump_stream("until") << p->second;
+ f->close_section();
+ } else {
+ stringstream ss;
+ string s;
+ ss << p->first << " " << p->second;
+ getline(ss, s);
+ s += "\n";
+ rdata.append(s);
+ }
+ }
+ if (f) {
+ f->close_section();
+ f->flush(rdata);
+ }
+ if (f)
+ f->open_array_section("range_blocklist");
+
+ for (auto p = osdmap.range_blocklist.begin();
+ p != osdmap.range_blocklist.end();
+ ++p) {
+ if (f) {
+ f->open_object_section("entry");
+ f->dump_string("range", p->first.get_legacy_str());
+ f->dump_stream("until") << p->second;
+ f->close_section();
+ } else {
+ stringstream ss;
+ string s;
+ ss << p->first << " " << p->second;
+ getline(ss, s);
+ s += "\n";
+ rdata.append(s);
+ }
+ }
+ if (f) {
+ f->close_section();
+ f->flush(rdata);
+ }
+ ss << "listed " << osdmap.blocklist.size() + osdmap.range_blocklist.size() << " entries";
+
+ } else if (prefix == "osd pool ls") {
+ string detail;
+ cmd_getval(cmdmap, "detail", detail);
+ if (!f && detail == "detail") {
+ ostringstream ss;
+ osdmap.print_pools(cct, ss);
+ rdata.append(ss.str());
+ } else {
+ if (f)
+ f->open_array_section("pools");
+ for (auto &[pid, pdata] : osdmap.get_pools()) {
+ if (f) {
+ if (detail == "detail") {
+ f->open_object_section("pool");
+ f->dump_int("pool_id", pid);
+ f->dump_string("pool_name", osdmap.get_pool_name(pid));
+ pdata.dump(f.get());
+ osdmap.dump_read_balance_score(cct, pid, pdata, f.get());
+ f->close_section();
+ } else {
+ f->dump_string("pool_name", osdmap.get_pool_name(pid));
+ }
+ } else {
+ rdata.append(osdmap.get_pool_name(pid) + "\n");
+ }
+ }
+ if (f) {
+ f->close_section();
+ f->flush(rdata);
+ }
+ }
+
+ } else if (prefix == "osd crush get-tunable") {
+ string tunable;
+ cmd_getval(cmdmap, "tunable", tunable);
+ ostringstream rss;
+ if (f)
+ f->open_object_section("tunable");
+ if (tunable == "straw_calc_version") {
+ if (f)
+ f->dump_int(tunable.c_str(), osdmap.crush->get_straw_calc_version());
+ else
+ rss << osdmap.crush->get_straw_calc_version() << "\n";
+ } else {
+ r = -EINVAL;
+ goto reply;
+ }
+ if (f) {
+ f->close_section();
+ f->flush(rdata);
+ } else {
+ rdata.append(rss.str());
+ }
+ r = 0;
+
+ } else if (prefix == "osd pool get") {
+ string poolstr;
+ cmd_getval(cmdmap, "pool", poolstr);
+ int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
+ if (pool < 0) {
+ ss << "unrecognized pool '" << poolstr << "'";
+ r = -ENOENT;
+ goto reply;
+ }
+
+ const pg_pool_t *p = osdmap.get_pg_pool(pool);
+ string var;
+ cmd_getval(cmdmap, "var", var);
+
+ typedef std::map<std::string, osd_pool_get_choices> choices_map_t;
+ const choices_map_t ALL_CHOICES = {
+ {"size", SIZE},
+ {"min_size", MIN_SIZE},
+ {"pg_num", PG_NUM}, {"pgp_num", PGP_NUM},
+ {"crush_rule", CRUSH_RULE},
+ {"hashpspool", HASHPSPOOL},
+ {"eio", POOL_EIO},
+ {"allow_ec_overwrites", EC_OVERWRITES}, {"nodelete", NODELETE},
+ {"nopgchange", NOPGCHANGE}, {"nosizechange", NOSIZECHANGE},
+ {"noscrub", NOSCRUB}, {"nodeep-scrub", NODEEP_SCRUB},
+ {"write_fadvise_dontneed", WRITE_FADVISE_DONTNEED},
+ {"hit_set_type", HIT_SET_TYPE}, {"hit_set_period", HIT_SET_PERIOD},
+ {"hit_set_count", HIT_SET_COUNT}, {"hit_set_fpp", HIT_SET_FPP},
+ {"use_gmt_hitset", USE_GMT_HITSET},
+ {"target_max_objects", TARGET_MAX_OBJECTS},
+ {"target_max_bytes", TARGET_MAX_BYTES},
+ {"cache_target_dirty_ratio", CACHE_TARGET_DIRTY_RATIO},
+ {"cache_target_dirty_high_ratio", CACHE_TARGET_DIRTY_HIGH_RATIO},
+ {"cache_target_full_ratio", CACHE_TARGET_FULL_RATIO},
+ {"cache_min_flush_age", CACHE_MIN_FLUSH_AGE},
+ {"cache_min_evict_age", CACHE_MIN_EVICT_AGE},
+ {"erasure_code_profile", ERASURE_CODE_PROFILE},
+ {"min_read_recency_for_promote", MIN_READ_RECENCY_FOR_PROMOTE},
+ {"min_write_recency_for_promote", MIN_WRITE_RECENCY_FOR_PROMOTE},
+ {"fast_read", FAST_READ},
+ {"hit_set_grade_decay_rate", HIT_SET_GRADE_DECAY_RATE},
+ {"hit_set_search_last_n", HIT_SET_SEARCH_LAST_N},
+ {"scrub_min_interval", SCRUB_MIN_INTERVAL},
+ {"scrub_max_interval", SCRUB_MAX_INTERVAL},
+ {"deep_scrub_interval", DEEP_SCRUB_INTERVAL},
+ {"recovery_priority", RECOVERY_PRIORITY},
+ {"recovery_op_priority", RECOVERY_OP_PRIORITY},
+ {"scrub_priority", SCRUB_PRIORITY},
+ {"compression_mode", COMPRESSION_MODE},
+ {"compression_algorithm", COMPRESSION_ALGORITHM},
+ {"compression_required_ratio", COMPRESSION_REQUIRED_RATIO},
+ {"compression_max_blob_size", COMPRESSION_MAX_BLOB_SIZE},
+ {"compression_min_blob_size", COMPRESSION_MIN_BLOB_SIZE},
+ {"csum_type", CSUM_TYPE},
+ {"csum_max_block", CSUM_MAX_BLOCK},
+ {"csum_min_block", CSUM_MIN_BLOCK},
+ {"fingerprint_algorithm", FINGERPRINT_ALGORITHM},
+ {"pg_autoscale_mode", PG_AUTOSCALE_MODE},
+ {"pg_num_min", PG_NUM_MIN},
+ {"pg_num_max", PG_NUM_MAX},
+ {"target_size_bytes", TARGET_SIZE_BYTES},
+ {"target_size_ratio", TARGET_SIZE_RATIO},
+ {"pg_autoscale_bias", PG_AUTOSCALE_BIAS},
+ {"dedup_tier", DEDUP_TIER},
+ {"dedup_chunk_algorithm", DEDUP_CHUNK_ALGORITHM},
+ {"dedup_cdc_chunk_size", DEDUP_CDC_CHUNK_SIZE},
+ {"bulk", BULK},
+ {"read_ratio", READ_RATIO}
+ };
+
+ typedef std::set<osd_pool_get_choices> choices_set_t;
+
+ const choices_set_t ONLY_TIER_CHOICES = {
+ HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
+ TARGET_MAX_OBJECTS, TARGET_MAX_BYTES, CACHE_TARGET_FULL_RATIO,
+ CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
+ CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
+ MIN_READ_RECENCY_FOR_PROMOTE,
+ MIN_WRITE_RECENCY_FOR_PROMOTE,
+ HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N
+ };
+ const choices_set_t ONLY_ERASURE_CHOICES = {
+ EC_OVERWRITES, ERASURE_CODE_PROFILE
+ };
+ const choices_set_t ONLY_REPLICA_CHOICES = {
+ READ_RATIO
+ };
+
+ choices_set_t selected_choices;
+ if (var == "all") {
+ for(choices_map_t::const_iterator it = ALL_CHOICES.begin();
+ it != ALL_CHOICES.end(); ++it) {
+ selected_choices.insert(it->second);
+ }
+
+ if(!p->is_tier()) {
+ selected_choices = subtract_second_from_first(selected_choices,
+ ONLY_TIER_CHOICES);
+ }
+
+ if(!p->is_erasure()) {
+ selected_choices = subtract_second_from_first(selected_choices,
+ ONLY_ERASURE_CHOICES);
+ }
+ if(!p->is_replicated()) {
+ selected_choices = subtract_second_from_first(selected_choices,
+ ONLY_REPLICA_CHOICES);
+ }
+ } else /* var != "all" */ {
+ choices_map_t::const_iterator found = ALL_CHOICES.find(var);
+ if (found == ALL_CHOICES.end()) {
+ ss << "pool '" << poolstr
+ << "': invalid variable: '" << var << "'";
+ r = -EINVAL;
+ goto reply;
+ }
+
+ osd_pool_get_choices selected = found->second;
+
+ if (!p->is_tier() &&
+ ONLY_TIER_CHOICES.find(selected) != ONLY_TIER_CHOICES.end()) {
+ ss << "pool '" << poolstr
+ << "' is not a tier pool: variable not applicable";
+ r = -EACCES;
+ goto reply;
+ }
+
+ if (!p->is_erasure() &&
+ ONLY_ERASURE_CHOICES.find(selected)
+ != ONLY_ERASURE_CHOICES.end()) {
+ ss << "pool '" << poolstr
+ << "' is not a erasure pool: variable not applicable";
+ r = -EACCES;
+ goto reply;
+ }
+
+ if (!p->is_replicated() &&
+ ONLY_REPLICA_CHOICES.find(selected)
+ != ONLY_REPLICA_CHOICES.end()) {
+ ss << "pool '" << poolstr
+ << "' is not a replicated pool: variable not applicable";
+ r = -EACCES;
+ goto reply;
+ }
+
+ if (pool_opts_t::is_opt_name(var) &&
+ !p->opts.is_set(pool_opts_t::get_opt_desc(var).key)) {
+ ss << "option '" << var << "' is not set on pool '" << poolstr << "'";
+ r = -ENOENT;
+ goto reply;
+ }
+
+ selected_choices.insert(selected);
+ }
+
+ if (f) {
+ f->open_object_section("pool");
+ f->dump_string("pool", poolstr);
+ f->dump_int("pool_id", pool);
+ for(choices_set_t::const_iterator it = selected_choices.begin();
+ it != selected_choices.end(); ++it) {
+ choices_map_t::const_iterator i;
+ for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
+ if (i->second == *it) {
+ break;
+ }
+ }
+ ceph_assert(i != ALL_CHOICES.end());
+ switch(*it) {
+ case PG_NUM:
+ f->dump_int("pg_num", p->get_pg_num());
+ break;
+ case PGP_NUM:
+ f->dump_int("pgp_num", p->get_pgp_num());
+ break;
+ case SIZE:
+ f->dump_int("size", p->get_size());
+ break;
+ case MIN_SIZE:
+ f->dump_int("min_size", p->get_min_size());
+ break;
+ case CRUSH_RULE:
+ if (osdmap.crush->rule_exists(p->get_crush_rule())) {
+ f->dump_string("crush_rule", osdmap.crush->get_rule_name(
+ p->get_crush_rule()));
+ } else {
+ f->dump_string("crush_rule", stringify(p->get_crush_rule()));
+ }
+ break;
+ case EC_OVERWRITES:
+ f->dump_bool("allow_ec_overwrites",
+ p->has_flag(pg_pool_t::FLAG_EC_OVERWRITES));
+ break;
+ case PG_AUTOSCALE_MODE:
+ f->dump_string("pg_autoscale_mode",
+ pg_pool_t::get_pg_autoscale_mode_name(
+ p->pg_autoscale_mode));
+ break;
+ case HASHPSPOOL:
+ case POOL_EIO:
+ case NODELETE:
+ case BULK:
+ case NOPGCHANGE:
+ case NOSIZECHANGE:
+ case WRITE_FADVISE_DONTNEED:
+ case NOSCRUB:
+ case NODEEP_SCRUB:
+ f->dump_bool(i->first.c_str(),
+ p->has_flag(pg_pool_t::get_flag_by_name(i->first)));
+ break;
+ case HIT_SET_PERIOD:
+ f->dump_int("hit_set_period", p->hit_set_period);
+ break;
+ case HIT_SET_COUNT:
+ f->dump_int("hit_set_count", p->hit_set_count);
+ break;
+ case HIT_SET_TYPE:
+ f->dump_string("hit_set_type",
+ HitSet::get_type_name(p->hit_set_params.get_type()));
+ break;
+ case HIT_SET_FPP:
+ {
+ if (p->hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
+ BloomHitSet::Params *bloomp =
+ static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
+ f->dump_float("hit_set_fpp", bloomp->get_fpp());
+ } else if(var != "all") {
+ f->close_section();
+ ss << "hit set is not of type Bloom; " <<
+ "invalid to get a false positive rate!";
+ r = -EINVAL;
+ goto reply;
+ }
+ }
+ break;
+ case USE_GMT_HITSET:
+ f->dump_bool("use_gmt_hitset", p->use_gmt_hitset);
+ break;
+ case TARGET_MAX_OBJECTS:
+ f->dump_unsigned("target_max_objects", p->target_max_objects);
+ break;
+ case TARGET_MAX_BYTES:
+ f->dump_unsigned("target_max_bytes", p->target_max_bytes);
+ break;
+ case CACHE_TARGET_DIRTY_RATIO:
+ f->dump_unsigned("cache_target_dirty_ratio_micro",
+ p->cache_target_dirty_ratio_micro);
+ f->dump_float("cache_target_dirty_ratio",
+ ((float)p->cache_target_dirty_ratio_micro/1000000));
+ break;
+ case CACHE_TARGET_DIRTY_HIGH_RATIO:
+ f->dump_unsigned("cache_target_dirty_high_ratio_micro",
+ p->cache_target_dirty_high_ratio_micro);
+ f->dump_float("cache_target_dirty_high_ratio",
+ ((float)p->cache_target_dirty_high_ratio_micro/1000000));
+ break;
+ case CACHE_TARGET_FULL_RATIO:
+ f->dump_unsigned("cache_target_full_ratio_micro",
+ p->cache_target_full_ratio_micro);
+ f->dump_float("cache_target_full_ratio",
+ ((float)p->cache_target_full_ratio_micro/1000000));
+ break;
+ case CACHE_MIN_FLUSH_AGE:
+ f->dump_unsigned("cache_min_flush_age", p->cache_min_flush_age);
+ break;
+ case CACHE_MIN_EVICT_AGE:
+ f->dump_unsigned("cache_min_evict_age", p->cache_min_evict_age);
+ break;
+ case ERASURE_CODE_PROFILE:
+ f->dump_string("erasure_code_profile", p->erasure_code_profile);
+ break;
+ case MIN_READ_RECENCY_FOR_PROMOTE:
+ f->dump_int("min_read_recency_for_promote",
+ p->min_read_recency_for_promote);
+ break;
+ case MIN_WRITE_RECENCY_FOR_PROMOTE:
+ f->dump_int("min_write_recency_for_promote",
+ p->min_write_recency_for_promote);
+ break;
+ case FAST_READ:
+ f->dump_int("fast_read", p->fast_read);
+ break;
+ case HIT_SET_GRADE_DECAY_RATE:
+ f->dump_int("hit_set_grade_decay_rate",
+ p->hit_set_grade_decay_rate);
+ break;
+ case HIT_SET_SEARCH_LAST_N:
+ f->dump_int("hit_set_search_last_n",
+ p->hit_set_search_last_n);
+ break;
+ case SCRUB_MIN_INTERVAL:
+ case SCRUB_MAX_INTERVAL:
+ case DEEP_SCRUB_INTERVAL:
+ case RECOVERY_PRIORITY:
+ case RECOVERY_OP_PRIORITY:
+ case SCRUB_PRIORITY:
+ case COMPRESSION_MODE:
+ case COMPRESSION_ALGORITHM:
+ case COMPRESSION_REQUIRED_RATIO:
+ case COMPRESSION_MAX_BLOB_SIZE:
+ case COMPRESSION_MIN_BLOB_SIZE:
+ case CSUM_TYPE:
+ case CSUM_MAX_BLOCK:
+ case CSUM_MIN_BLOCK:
+ case FINGERPRINT_ALGORITHM:
+ case PG_NUM_MIN:
+ case PG_NUM_MAX:
+ case TARGET_SIZE_BYTES:
+ case TARGET_SIZE_RATIO:
+ case PG_AUTOSCALE_BIAS:
+ case DEDUP_TIER:
+ case DEDUP_CHUNK_ALGORITHM:
+ case DEDUP_CDC_CHUNK_SIZE:
+ case READ_RATIO:
+ pool_opts_t::key_t key = pool_opts_t::get_opt_desc(i->first).key;
+ if (p->opts.is_set(key)) {
+ if(*it == CSUM_TYPE) {
+ int64_t val;
+ p->opts.get(pool_opts_t::CSUM_TYPE, &val);
+ f->dump_string(i->first.c_str(), Checksummer::get_csum_type_string(val));
+ } else {
+ p->opts.dump(i->first, f.get());
+ }
+ }
+ break;
+ }
+ }
+ f->close_section();
+ f->flush(rdata);
+ } else /* !f */ {
+ for(choices_set_t::const_iterator it = selected_choices.begin();
+ it != selected_choices.end(); ++it) {
+ choices_map_t::const_iterator i;
+ switch(*it) {
+ case PG_NUM:
+ ss << "pg_num: " << p->get_pg_num() << "\n";
+ break;
+ case PGP_NUM:
+ ss << "pgp_num: " << p->get_pgp_num() << "\n";
+ break;
+ case SIZE:
+ ss << "size: " << p->get_size() << "\n";
+ break;
+ case MIN_SIZE:
+ ss << "min_size: " << p->get_min_size() << "\n";
+ break;
+ case CRUSH_RULE:
+ if (osdmap.crush->rule_exists(p->get_crush_rule())) {
+ ss << "crush_rule: " << osdmap.crush->get_rule_name(
+ p->get_crush_rule()) << "\n";
+ } else {
+ ss << "crush_rule: " << p->get_crush_rule() << "\n";
+ }
+ break;
+ case PG_AUTOSCALE_MODE:
+ ss << "pg_autoscale_mode: " << pg_pool_t::get_pg_autoscale_mode_name(
+ p->pg_autoscale_mode) <<"\n";
+ break;
+ case HIT_SET_PERIOD:
+ ss << "hit_set_period: " << p->hit_set_period << "\n";
+ break;
+ case HIT_SET_COUNT:
+ ss << "hit_set_count: " << p->hit_set_count << "\n";
+ break;
+ case HIT_SET_TYPE:
+ ss << "hit_set_type: " <<
+ HitSet::get_type_name(p->hit_set_params.get_type()) << "\n";
+ break;
+ case HIT_SET_FPP:
+ {
+ if (p->hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
+ BloomHitSet::Params *bloomp =
+ static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
+ ss << "hit_set_fpp: " << bloomp->get_fpp() << "\n";
+ } else if(var != "all") {
+ ss << "hit set is not of type Bloom; " <<
+ "invalid to get a false positive rate!";
+ r = -EINVAL;
+ goto reply;
+ }
+ }
+ break;
+ case USE_GMT_HITSET:
+ ss << "use_gmt_hitset: " << p->use_gmt_hitset << "\n";
+ break;
+ case TARGET_MAX_OBJECTS:
+ ss << "target_max_objects: " << p->target_max_objects << "\n";
+ break;
+ case TARGET_MAX_BYTES:
+ ss << "target_max_bytes: " << p->target_max_bytes << "\n";
+ break;
+ case CACHE_TARGET_DIRTY_RATIO:
+ ss << "cache_target_dirty_ratio: "
+ << ((float)p->cache_target_dirty_ratio_micro/1000000) << "\n";
+ break;
+ case CACHE_TARGET_DIRTY_HIGH_RATIO:
+ ss << "cache_target_dirty_high_ratio: "
+ << ((float)p->cache_target_dirty_high_ratio_micro/1000000) << "\n";
+ break;
+ case CACHE_TARGET_FULL_RATIO:
+ ss << "cache_target_full_ratio: "
+ << ((float)p->cache_target_full_ratio_micro/1000000) << "\n";
+ break;
+ case CACHE_MIN_FLUSH_AGE:
+ ss << "cache_min_flush_age: " << p->cache_min_flush_age << "\n";
+ break;
+ case CACHE_MIN_EVICT_AGE:
+ ss << "cache_min_evict_age: " << p->cache_min_evict_age << "\n";
+ break;
+ case ERASURE_CODE_PROFILE:
+ ss << "erasure_code_profile: " << p->erasure_code_profile << "\n";
+ break;
+ case MIN_READ_RECENCY_FOR_PROMOTE:
+ ss << "min_read_recency_for_promote: " <<
+ p->min_read_recency_for_promote << "\n";
+ break;
+ case HIT_SET_GRADE_DECAY_RATE:
+ ss << "hit_set_grade_decay_rate: " <<
+ p->hit_set_grade_decay_rate << "\n";
+ break;
+ case HIT_SET_SEARCH_LAST_N:
+ ss << "hit_set_search_last_n: " <<
+ p->hit_set_search_last_n << "\n";
+ break;
+ case EC_OVERWRITES:
+ ss << "allow_ec_overwrites: " <<
+ (p->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) ? "true" : "false") <<
+ "\n";
+ break;
+ case HASHPSPOOL:
+ case POOL_EIO:
+ case NODELETE:
+ case BULK:
+ case NOPGCHANGE:
+ case NOSIZECHANGE:
+ case WRITE_FADVISE_DONTNEED:
+ case NOSCRUB:
+ case NODEEP_SCRUB:
+ for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
+ if (i->second == *it)
+ break;
+ }
+ ceph_assert(i != ALL_CHOICES.end());
+ ss << i->first << ": " <<
+ (p->has_flag(pg_pool_t::get_flag_by_name(i->first)) ?
+ "true" : "false") << "\n";
+ break;
+ case MIN_WRITE_RECENCY_FOR_PROMOTE:
+ ss << "min_write_recency_for_promote: " <<
+ p->min_write_recency_for_promote << "\n";
+ break;
+ case FAST_READ:
+ ss << "fast_read: " << p->fast_read << "\n";
+ break;
+ case SCRUB_MIN_INTERVAL:
+ case SCRUB_MAX_INTERVAL:
+ case DEEP_SCRUB_INTERVAL:
+ case RECOVERY_PRIORITY:
+ case RECOVERY_OP_PRIORITY:
+ case SCRUB_PRIORITY:
+ case COMPRESSION_MODE:
+ case COMPRESSION_ALGORITHM:
+ case COMPRESSION_REQUIRED_RATIO:
+ case COMPRESSION_MAX_BLOB_SIZE:
+ case COMPRESSION_MIN_BLOB_SIZE:
+ case CSUM_TYPE:
+ case CSUM_MAX_BLOCK:
+ case CSUM_MIN_BLOCK:
+ case FINGERPRINT_ALGORITHM:
+ case PG_NUM_MIN:
+ case PG_NUM_MAX:
+ case TARGET_SIZE_BYTES:
+ case TARGET_SIZE_RATIO:
+ case PG_AUTOSCALE_BIAS:
+ case DEDUP_TIER:
+ case DEDUP_CHUNK_ALGORITHM:
+ case DEDUP_CDC_CHUNK_SIZE:
+ case READ_RATIO:
+ for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
+ if (i->second == *it)
+ break;
+ }
+ ceph_assert(i != ALL_CHOICES.end());
+ {
+ pool_opts_t::key_t key = pool_opts_t::get_opt_desc(i->first).key;
+ if (p->opts.is_set(key)) {
+ if(key == pool_opts_t::CSUM_TYPE) {
+ int64_t val;
+ p->opts.get(key, &val);
+ ss << i->first << ": " << Checksummer::get_csum_type_string(val) << "\n";
+ } else {
+ ss << i->first << ": " << p->opts.get(key) << "\n";
+ }
+ }
+ }
+ break;
+ }
+ rdata.append(ss.str());
+ ss.str("");
+ }
+ }
+ r = 0;
+ } else if (prefix == "osd pool get-quota") {
+ string pool_name;
+ cmd_getval(cmdmap, "pool", pool_name);
+
+ int64_t poolid = osdmap.lookup_pg_pool_name(pool_name);
+ if (poolid < 0) {
+ ceph_assert(poolid == -ENOENT);
+ ss << "unrecognized pool '" << pool_name << "'";
+ r = -ENOENT;
+ goto reply;
+ }
+ const pg_pool_t *p = osdmap.get_pg_pool(poolid);
+ const pool_stat_t* pstat = mon.mgrstatmon()->get_pool_stat(poolid);
+ if (!pstat) {
+ ss << "no stats for pool '" << pool_name << "'";
+ r = -ENOENT;
+ goto reply;
+ }
+ const object_stat_sum_t& sum = pstat->stats.sum;
+ if (f) {
+ f->open_object_section("pool_quotas");
+ f->dump_string("pool_name", pool_name);
+ f->dump_unsigned("pool_id", poolid);
+ f->dump_unsigned("quota_max_objects", p->quota_max_objects);
+ f->dump_int("current_num_objects", sum.num_objects);
+ f->dump_unsigned("quota_max_bytes", p->quota_max_bytes);
+ f->dump_int("current_num_bytes", sum.num_bytes);
+ f->close_section();
+ f->flush(rdata);
+ } else {
+ stringstream rs;
+ rs << "quotas for pool '" << pool_name << "':\n"
+ << " max objects: ";
+ if (p->quota_max_objects == 0)
+ rs << "N/A";
+ else {
+ rs << si_u_t(p->quota_max_objects) << " objects";
+ rs << " (current num objects: " << sum.num_objects << " objects)";
+ }
+ rs << "\n"
+ << " max bytes : ";
+ if (p->quota_max_bytes == 0)
+ rs << "N/A";
+ else {
+ rs << byte_u_t(p->quota_max_bytes);
+ rs << " (current num bytes: " << sum.num_bytes << " bytes)";
+ }
+ rdata.append(rs.str());
+ }
+ rdata.append("\n");
+ r = 0;
+ } else if (prefix == "osd crush rule list" ||
+ prefix == "osd crush rule ls") {
+ if (f) {
+ f->open_array_section("rules");
+ osdmap.crush->list_rules(f.get());
+ f->close_section();
+ f->flush(rdata);
+ } else {
+ ostringstream ss;
+ osdmap.crush->list_rules(&ss);
+ rdata.append(ss.str());
+ }
+ } else if (prefix == "osd crush rule ls-by-class") {
+ string class_name;
+ cmd_getval(cmdmap, "class", class_name);
+ if (class_name.empty()) {
+ ss << "no class specified";
+ r = -EINVAL;
+ goto reply;
+ }
+ set<int> rules;
+ r = osdmap.crush->get_rules_by_class(class_name, &rules);
+ if (r < 0) {
+ ss << "failed to get rules by class '" << class_name << "'";
+ goto reply;
+ }
+ if (f) {
+ f->open_array_section("rules");
+ for (auto &rule: rules) {
+ f->dump_string("name", osdmap.crush->get_rule_name(rule));
+ }
+ f->close_section();
+ f->flush(rdata);
+ } else {
+ ostringstream rs;
+ for (auto &rule: rules) {
+ rs << osdmap.crush->get_rule_name(rule) << "\n";
+ }
+ rdata.append(rs.str());
+ }
+ } else if (prefix == "osd crush rule dump") {
+ string name;
+ cmd_getval(cmdmap, "name", name);
+ string format;
+ cmd_getval(cmdmap, "format", format);
+ boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
+ if (name == "") {
+ f->open_array_section("rules");
+ osdmap.crush->dump_rules(f.get());
+ f->close_section();
+ } else {
+ int ruleno = osdmap.crush->get_rule_id(name);
+ if (ruleno < 0) {
+ ss << "unknown crush rule '" << name << "'";
+ r = ruleno;
+ goto reply;
+ }
+ osdmap.crush->dump_rule(ruleno, f.get());
+ }
+ ostringstream rs;
+ f->flush(rs);
+ rs << "\n";
+ rdata.append(rs.str());
+ } else if (prefix == "osd crush dump") {
+ string format;
+ cmd_getval(cmdmap, "format", format);
+ boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
+ f->open_object_section("crush_map");
+ osdmap.crush->dump(f.get());
+ f->close_section();
+ ostringstream rs;
+ f->flush(rs);
+ rs << "\n";
+ rdata.append(rs.str());
+ } else if (prefix == "osd crush show-tunables") {
+ string format;
+ cmd_getval(cmdmap, "format", format);
+ boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
+ f->open_object_section("crush_map_tunables");
+ osdmap.crush->dump_tunables(f.get());
+ f->close_section();
+ ostringstream rs;
+ f->flush(rs);
+ rs << "\n";
+ rdata.append(rs.str());
+ } else if (prefix == "osd crush tree") {
+ bool show_shadow = false;
+ if (!cmd_getval_compat_cephbool(cmdmap, "show_shadow", show_shadow)) {
+ std::string shadow;
+ if (cmd_getval(cmdmap, "shadow", shadow) &&
+ shadow == "--show-shadow") {
+ show_shadow = true;
+ }
+ }
+ boost::scoped_ptr<Formatter> f(Formatter::create(format));
+ if (f) {
+ f->open_object_section("crush_tree");
+ osdmap.crush->dump_tree(nullptr,
+ f.get(),
+ osdmap.get_pool_names(),
+ show_shadow);
+ f->close_section();
+ f->flush(rdata);
+ } else {
+ ostringstream ss;
+ osdmap.crush->dump_tree(&ss,
+ nullptr,
+ osdmap.get_pool_names(),
+ show_shadow);
+ rdata.append(ss.str());
+ }
+ } else if (prefix == "osd crush ls") {
+ string name;
+ if (!cmd_getval(cmdmap, "node", name)) {
+ ss << "no node specified";
+ r = -EINVAL;
+ goto reply;
+ }
+ if (!osdmap.crush->name_exists(name)) {
+ ss << "node '" << name << "' does not exist";
+ r = -ENOENT;
+ goto reply;
+ }
+ int id = osdmap.crush->get_item_id(name);
+ list<int> result;
+ if (id >= 0) {
+ result.push_back(id);
+ } else {
+ int num = osdmap.crush->get_bucket_size(id);
+ for (int i = 0; i < num; ++i) {
+ result.push_back(osdmap.crush->get_bucket_item(id, i));
+ }
+ }
+ if (f) {
+ f->open_array_section("items");
+ for (auto i : result) {
+ f->dump_string("item", osdmap.crush->get_item_name(i));
+ }
+ f->close_section();
+ f->flush(rdata);
+ } else {
+ ostringstream ss;
+ for (auto i : result) {
+ ss << osdmap.crush->get_item_name(i) << "\n";
+ }
+ rdata.append(ss.str());
+ }
+ r = 0;
+ } else if (prefix == "osd crush class ls") {
+ boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
+ f->open_array_section("crush_classes");
+ for (auto i : osdmap.crush->class_name)
+ f->dump_string("class", i.second);
+ f->close_section();
+ f->flush(rdata);
+ } else if (prefix == "osd crush class ls-osd") {
+ string name;
+ cmd_getval(cmdmap, "class", name);
+ set<int> osds;
+ osdmap.crush->get_devices_by_class(name, &osds);
+ if (f) {
+ f->open_array_section("osds");
+ for (auto &osd: osds)
+ f->dump_int("osd", osd);
+ f->close_section();
+ f->flush(rdata);
+ } else {
+ bool first = true;
+ for (auto &osd : osds) {
+ if (!first)
+ ds << "\n";
+ first = false;
+ ds << osd;
+ }
+ rdata.append(ds);
+ }
+ } else if (prefix == "osd crush get-device-class") {
+ vector<string> idvec;
+ cmd_getval(cmdmap, "ids", idvec);
+ map<int, string> class_by_osd;
+ for (auto& id : idvec) {
+ ostringstream ts;
+ long osd = parse_osd_id(id.c_str(), &ts);
+ if (osd < 0) {
+ ss << "unable to parse osd id:'" << id << "'";
+ r = -EINVAL;
+ goto reply;
+ }
+ auto device_class = osdmap.crush->get_item_class(osd);
+ if (device_class)
+ class_by_osd[osd] = device_class;
+ else
+ class_by_osd[osd] = ""; // no class
+ }
+ if (f) {
+ f->open_array_section("osd_device_classes");
+ for (auto& i : class_by_osd) {
+ f->open_object_section("osd_device_class");
+ f->dump_int("osd", i.first);
+ f->dump_string("device_class", i.second);
+ f->close_section();
+ }
+ f->close_section();
+ f->flush(rdata);
+ } else {
+ if (class_by_osd.size() == 1) {
+ // for single input, make a clean output
+ ds << class_by_osd.begin()->second;
+ } else {
+ // note that we do not group osds by class here
+ for (auto it = class_by_osd.begin();
+ it != class_by_osd.end();
+ it++) {
+ ds << "osd." << it->first << ' ' << it->second;
+ if (next(it) != class_by_osd.end())
+ ds << '\n';
+ }
+ }
+ rdata.append(ds);
+ }
+ } else if (prefix == "osd erasure-code-profile ls") {
+ const auto &profiles = osdmap.get_erasure_code_profiles();
+ if (f)
+ f->open_array_section("erasure-code-profiles");
+ for (auto i = profiles.begin(); i != profiles.end(); ++i) {
+ if (f)
+ f->dump_string("profile", i->first.c_str());
+ else
+ rdata.append(i->first + "\n");
+ }
+ if (f) {
+ f->close_section();
+ ostringstream rs;
+ f->flush(rs);
+ rs << "\n";
+ rdata.append(rs.str());
+ }
+ } else if (prefix == "osd crush weight-set ls") {
+ boost::scoped_ptr<Formatter> f(Formatter::create(format));
+ if (f) {
+ f->open_array_section("weight_sets");
+ if (osdmap.crush->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS)) {
+ f->dump_string("pool", "(compat)");
+ }
+ for (auto& i : osdmap.crush->choose_args) {
+ if (i.first >= 0) {
+ f->dump_string("pool", osdmap.get_pool_name(i.first));
+ }
+ }
+ f->close_section();
+ f->flush(rdata);
+ } else {
+ ostringstream rs;
+ if (osdmap.crush->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS)) {
+ rs << "(compat)\n";
+ }
+ for (auto& i : osdmap.crush->choose_args) {
+ if (i.first >= 0) {
+ rs << osdmap.get_pool_name(i.first) << "\n";
+ }
+ }
+ rdata.append(rs.str());
+ }
+ } else if (prefix == "osd crush weight-set dump") {
+ boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty",
+ "json-pretty"));
+ osdmap.crush->dump_choose_args(f.get());
+ f->flush(rdata);
+ } else if (prefix == "osd erasure-code-profile get") {
+ string name;
+ cmd_getval(cmdmap, "name", name);
+ if (!osdmap.has_erasure_code_profile(name)) {
+ ss << "unknown erasure code profile '" << name << "'";
+ r = -ENOENT;
+ goto reply;
+ }
+ const map<string,string> &profile = osdmap.get_erasure_code_profile(name);
+ if (f)
+ f->open_object_section("profile");
+ for (map<string,string>::const_iterator i = profile.begin();
+ i != profile.end();
+ ++i) {
+ if (f)
+ f->dump_string(i->first.c_str(), i->second.c_str());
+ else
+ rdata.append(i->first + "=" + i->second + "\n");
+ }
+ if (f) {
+ f->close_section();
+ ostringstream rs;
+ f->flush(rs);
+ rs << "\n";
+ rdata.append(rs.str());
+ }
+ } else if (prefix == "osd pool application get") {
+ boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty",
+ "json-pretty"));
+ string pool_name;
+ cmd_getval(cmdmap, "pool", pool_name);
+ string app;
+ cmd_getval(cmdmap, "app", app);
+ string key;
+ cmd_getval(cmdmap, "key", key);
+
+ if (pool_name.empty()) {
+ // all
+ f->open_object_section("pools");
+ for (const auto &pool : osdmap.pools) {
+ std::string name("<unknown>");
+ const auto &pni = osdmap.pool_name.find(pool.first);
+ if (pni != osdmap.pool_name.end())
+ name = pni->second;
+ f->open_object_section(name.c_str());
+ for (auto &app_pair : pool.second.application_metadata) {
+ f->open_object_section(app_pair.first.c_str());
+ for (auto &kv_pair : app_pair.second) {
+ f->dump_string(kv_pair.first.c_str(), kv_pair.second);
+ }
+ f->close_section();
+ }
+ f->close_section(); // name
+ }
+ f->close_section(); // pools
+ f->flush(rdata);
+ } else {
+ int64_t pool = osdmap.lookup_pg_pool_name(pool_name.c_str());
+ if (pool < 0) {
+ ss << "unrecognized pool '" << pool_name << "'";
+ r = -ENOENT;
+ goto reply;
+ }
+ auto p = osdmap.get_pg_pool(pool);
+ // filter by pool
+ if (app.empty()) {
+ f->open_object_section(pool_name.c_str());
+ for (auto &app_pair : p->application_metadata) {
+ f->open_object_section(app_pair.first.c_str());
+ for (auto &kv_pair : app_pair.second) {
+ f->dump_string(kv_pair.first.c_str(), kv_pair.second);
+ }
+ f->close_section(); // application
+ }
+ f->close_section(); // pool_name
+ f->flush(rdata);
+ goto reply;
+ }
+
+ auto app_it = p->application_metadata.find(app);
+ if (app_it == p->application_metadata.end()) {
+ ss << "pool '" << pool_name << "' has no application '" << app << "'";
+ r = -ENOENT;
+ goto reply;
+ }
+ // filter by pool + app
+ if (key.empty()) {
+ f->open_object_section(app_it->first.c_str());
+ for (auto &kv_pair : app_it->second) {
+ f->dump_string(kv_pair.first.c_str(), kv_pair.second);
+ }
+ f->close_section(); // application
+ f->flush(rdata);
+ goto reply;
+ }
+ // filter by pool + app + key
+ auto key_it = app_it->second.find(key);
+ if (key_it == app_it->second.end()) {
+ ss << "application '" << app << "' on pool '" << pool_name
+ << "' does not have key '" << key << "'";
+ r = -ENOENT;
+ goto reply;
+ }
+ ss << key_it->second << "\n";
+ rdata.append(ss.str());
+ ss.str("");
+ }
+ } else if (prefix == "osd get-require-min-compat-client") {
+ ss << osdmap.require_min_compat_client << std::endl;
+ rdata.append(ss.str());
+ ss.str("");
+ goto reply;
+ } else if (prefix == "osd pool application enable" ||
+ prefix == "osd pool application disable" ||
+ prefix == "osd pool application set" ||
+ prefix == "osd pool application rm") {
+ bool changed = false;
+ r = preprocess_command_pool_application(prefix, cmdmap, ss, &changed);
+ if (r != 0) {
+ // Error, reply.
+ goto reply;
+ } else if (changed) {
+ // Valid mutation, proceed to prepare phase
+ return false;
+ } else {
+ // Idempotent case, reply
+ goto reply;
+ }
+ } else {
+ // try prepare update
+ return false;
+ }
+
+ reply:
+ string rs;
+ getline(ss, rs);
+ mon.reply_command(op, r, rs, rdata, get_last_committed());
+ return true;
+}
+
+/**
+ * Set flag bits on a pool in the pending incremental.
+ *
+ * The pending entry is obtained from pending_inc.get_new_pool(), which is
+ * handed the committed pool definition as its second argument.
+ *
+ * @param pool_id id of the pool to modify
+ * @param flags   flag bits passed to pg_pool_t::set_flag()
+ */
+void OSDMonitor::set_pool_flags(int64_t pool_id, uint64_t flags)
+{
+  auto committed = osdmap.get_pg_pool(pool_id);
+  pg_pool_t *pending_pool = pending_inc.get_new_pool(pool_id, committed);
+  // a missing pending entry is treated as a programming error
+  ceph_assert(pending_pool);
+  pending_pool->set_flag(flags);
+}
+
+/**
+ * Clear flag bits on a pool in the pending incremental.
+ *
+ * The pending entry is obtained from pending_inc.get_new_pool(), which is
+ * handed the committed pool definition as its second argument.
+ *
+ * @param pool_id id of the pool to modify
+ * @param flags   flag bits passed to pg_pool_t::unset_flag()
+ */
+void OSDMonitor::clear_pool_flags(int64_t pool_id, uint64_t flags)
+{
+  auto committed = osdmap.get_pg_pool(pool_id);
+  pg_pool_t *pending_pool = pending_inc.get_new_pool(pool_id, committed);
+  // a missing pending entry is treated as a programming error
+  ceph_assert(pending_pool);
+  pending_pool->unset_flag(flags);
+}
+
+/**
+ * Build the store key for an epoch's purged-snap record.
+ *
+ * @param epoch epoch to encode
+ * @return "purged_epoch_" followed by the epoch as lower-case hex,
+ *         zero-padded to at least 8 digits
+ */
+string OSDMonitor::make_purged_snap_epoch_key(epoch_t epoch)
+{
+  char buf[80];
+  const unsigned long epoch_value = static_cast<unsigned long>(epoch);
+  snprintf(buf, sizeof(buf), "purged_epoch_%08lx", epoch_value);
+  return string(buf);
+}
+
+/**
+ * Build the store key for a purged-snap record.
+ *
+ * The snap id is rendered as 16 zero-padded hex digits, the pool id as
+ * unpadded decimal.
+ *
+ * @param pool pool id
+ * @param snap snap id to encode in the key
+ * @return "purged_snap_<pool>_<snap>"
+ */
+string OSDMonitor::make_purged_snap_key(int64_t pool, snapid_t snap)
+{
+  char buf[80];
+  const unsigned long long pool_value = static_cast<unsigned long long>(pool);
+  const unsigned long long snap_value = static_cast<unsigned long long>(snap);
+  snprintf(buf, sizeof(buf), "purged_snap_%llu_%016llx",
+           pool_value, snap_value);
+  return string(buf);
+}
+
+/**
+ * Build both the key and the encoded value for a purged-snap interval.
+ *
+ * The value appended to @p v is, in order: the first snap id, the exclusive
+ * end (snap + num) and the epoch.  The key carries the *last* snap id of the
+ * interval (snap + num - 1) so that a forward lower_bound() search is enough
+ * to find the interval containing a given snap.
+ *
+ * @param pool  pool id
+ * @param snap  first snap id of the interval
+ * @param num   number of snaps in the interval
+ * @param epoch epoch to record in the value
+ * @param v     out: buffer the value is appended to
+ * @return the store key for the interval
+ */
+string OSDMonitor::make_purged_snap_key_value(
+  int64_t pool, snapid_t snap, snapid_t num,
+  epoch_t epoch, bufferlist *v)
+{
+  const auto interval_end = snap + num;  // exclusive
+  encode(snap, *v);
+  encode(interval_end, *v);
+  encode(epoch, *v);
+  return make_purged_snap_key(pool, interval_end - 1);
+}
+
+
+/**
+ * Find the stored purged-snap interval that contains @p snap.
+ *
+ * Positions a store iterator with lower_bound() on the key built for
+ * (pool, snap), checks that the record found belongs to the same pool, then
+ * decodes the interval bounds from its value.
+ *
+ * @param pool  pool id to search
+ * @param snap  snap id to look up
+ * @param begin out: start of the interval decoded from the record
+ * @param end   out: exclusive end of the interval decoded from the record
+ * @return 0 if @p snap lies in [*begin, *end), -ENOENT otherwise.  *begin and
+ *         *end are also written when a record of the right pool was found
+ *         but does not cover @p snap.
+ */
+int OSDMonitor::lookup_purged_snap(
+  int64_t pool, snapid_t snap,
+  snapid_t *begin, snapid_t *end)
+{
+  const string k = make_purged_snap_key(pool, snap);
+  auto it = mon.store->get_iterator(OSD_SNAP_PREFIX);
+  it->lower_bound(k);
+  if (!it->valid()) {
+    dout(20) << __func__
+             << " pool " << pool << " snap " << snap
+             << " - key '" << k << "' not found" << dendl;
+    return -ENOENT;
+  }
+
+  // fetch the key once; it is needed for the prefix check, the pool parse
+  // and the log messages
+  const string gotk = it->key();
+  if (gotk.find("purged_snap_") != 0) {
+    dout(20) << __func__
+             << " pool " << pool << " snap " << snap
+             << " - key '" << k << "' got '" << gotk
+             << "', wrong prefix" << dendl;
+    return -ENOENT;
+  }
+
+  // %llu requires an unsigned long long destination
+  unsigned long long keypool = 0;
+  const int n = sscanf(gotk.c_str(), "purged_snap_%llu_", &keypool);
+  if (n != 1) {
+    derr << __func__ << " invalid k '" << gotk << "'" << dendl;
+    return -ENOENT;
+  }
+  if (static_cast<unsigned long long>(pool) != keypool) {
+    dout(20) << __func__
+             << " pool " << pool << " snap " << snap
+             << " - key '" << k << "' got '" << gotk
+             << "', wrong pool " << keypool
+             << dendl;
+    return -ENOENT;
+  }
+
+  bufferlist v = it->value();
+  auto p = v.cbegin();
+  decode(*begin, p);
+  decode(*end, p);
+  if (snap < *begin || snap >= *end) {
+    dout(20) << __func__
+             << " pool " << pool << " snap " << snap
+             << " - found [" << *begin << "," << *end << "), no overlap"
+             << dendl;
+    return -ENOENT;
+  }
+  return 0;
+}
+
+/**
+ * Queue the store updates that record [start, end) as purged for a pool,
+ * merging with an adjacent earlier and/or later interval when one exists.
+ *
+ * Adjacency is probed with lookup_purged_snap() at start - 1 and at end.
+ * When an earlier interval is absorbed its record is erased; a later
+ * interval's record shares the key of the merged record and is simply
+ * overwritten.
+ *
+ * NOTE(review): the @p epoch argument is not read here; records are stamped
+ * with pending_inc.epoch.
+ *
+ * @param pool  pool id
+ * @param start first purged snap id
+ * @param end   exclusive end of the purged range
+ * @param epoch unused (see note)
+ * @param t     transaction the erase/put operations are added to
+ */
+void OSDMonitor::insert_purged_snap_update(
+  int64_t pool,
+  snapid_t start, snapid_t end,
+  epoch_t epoch,
+  MonitorDBStore::TransactionRef t)
+{
+  snapid_t before_begin, before_end;
+  snapid_t after_begin, after_end;
+  const bool joins_earlier =
+    lookup_purged_snap(pool, start - 1, &before_begin, &before_end) == 0;
+  const bool joins_later =
+    lookup_purged_snap(pool, end, &after_begin, &after_end) == 0;
+
+  if (joins_earlier && joins_later) {
+    dout(10) << __func__
+             << " [" << start << "," << end << ") - joins ["
+             << before_begin << "," << before_end << ") and ["
+             << after_begin << "," << after_end << ")" << dendl;
+  } else if (joins_earlier) {
+    dout(10) << __func__
+             << " [" << start << "," << end << ") - join with earlier ["
+             << before_begin << "," << before_end << ")" << dendl;
+  } else if (joins_later) {
+    dout(10) << __func__
+             << " [" << start << "," << end << ") - join with later ["
+             << after_begin << "," << after_end << ")" << dendl;
+  } else {
+    dout(10) << __func__
+             << " [" << start << "," << end << ") - new"
+             << dendl;
+  }
+
+  // bounds of the record that will be written
+  snapid_t merged_begin = start;
+  snapid_t merged_end = end;
+  if (joins_earlier) {
+    // the earlier record is keyed by its own last snap, so it must go
+    t->erase(OSD_SNAP_PREFIX, make_purged_snap_key(pool, before_end - 1));
+    merged_begin = before_begin;
+  }
+  if (joins_later) {
+    // the later record has the same key as the merged one; it is overwritten
+    merged_end = after_end;
+  }
+
+  bufferlist v;
+  const string k = make_purged_snap_key_value(pool,
+                                              merged_begin,
+                                              merged_end - merged_begin,
+                                              pending_inc.epoch, &v);
+  t->put(OSD_SNAP_PREFIX, k, v);
+}
+
+/**
+ * Stage purged snaps reported in the mgr digest into
+ * pending_inc.new_purged_snaps.
+ *
+ * Nothing is done when the mgrstat service is not readable or when the
+ * pending incremental already carries purged snaps.  The amount staged is
+ * bounded by the mon_max_snap_prune_per_epoch option (100000 when that
+ * option is 0).
+ *
+ * @return true if at least one snap was staged
+ */
+bool OSDMonitor::try_prune_purged_snaps()
+{
+  if (!mon.mgrstatmon()->is_readable()) {
+    return false;
+  }
+  if (!pending_inc.new_purged_snaps.empty()) {
+    return false;  // we already pruned for this epoch
+  }
+
+  unsigned max_prune = cct->_conf.get_val<uint64_t>(
+    "mon_max_snap_prune_per_epoch");
+  if (max_prune == 0) {
+    max_prune = 100000;
+  }
+  dout(10) << __func__ << " max_prune " << max_prune << dendl;
+
+  unsigned actually_pruned = 0;
+  auto& reported_by_pool = mon.mgrstatmon()->get_digest().purged_snaps;
+  for (auto& pool_entry : osdmap.get_pools()) {
+    const auto pool_id = pool_entry.first;
+    auto reported = reported_by_pool.find(pool_id);
+    if (reported == reported_by_pool.end()) {
+      continue;
+    }
+    auto& purged = reported->second;
+    if (purged.empty()) {
+      dout(20) << __func__ << " " << pool_id << " nothing purged" << dendl;
+      continue;
+    }
+    dout(20) << __func__ << " pool " << pool_id << " purged " << purged << dendl;
+
+    snap_interval_set_t to_prune;
+    unsigned maybe_pruned = actually_pruned;
+    for (auto i = purged.begin(); i != purged.end(); ++i) {
+      snapid_t begin = i.get_start();
+      auto end = i.get_start() + i.get_len();
+      snapid_t pbegin = 0, pend = 0;
+      const int r = lookup_purged_snap(pool_id, begin, &pbegin, &pend);
+      if (r == 0) {
+        // already purged.
+        // be a bit aggressive about backing off here, because the mon may
+        // do a lot of work going through this set, and if we know the
+        // purged set from the OSDs is at least *partly* stale we may as
+        // well wait for it to be fresh.
+        dout(20) << __func__ << " we've already purged " << pbegin
+                 << "~" << (pend - pbegin) << dendl;
+        break;  // next pool
+      }
+      // lookup_purged_snap() can fill pbegin even when it reports a miss
+      if (pbegin && pbegin > begin && pbegin < end) {
+        // the tail of [begin,end) is purged; shorten the range
+        end = pbegin;
+      }
+      to_prune.insert(begin, end - begin);
+      maybe_pruned += end - begin;
+      if (maybe_pruned >= max_prune) {
+        break;
+      }
+    }
+
+    if (!to_prune.empty()) {
+      // PGs may still be reporting things as purged that we have already
+      // pruned from removed_snaps_queue.
+      snap_interval_set_t actual;
+      auto queued = osdmap.removed_snaps_queue.find(pool_id);
+      if (queued != osdmap.removed_snaps_queue.end()) {
+        actual.intersection_of(to_prune, queued->second);
+      }
+      actually_pruned += actual.size();
+      dout(10) << __func__ << " pool " << pool_id << " reports pruned " << to_prune
+               << ", actual pruned " << actual << dendl;
+      if (!actual.empty()) {
+        pending_inc.new_purged_snaps[pool_id].swap(actual);
+      }
+    }
+    if (actually_pruned >= max_prune) {
+      break;
+    }
+  }
+  dout(10) << __func__ << " actually pruned " << actually_pruned << dendl;
+  return actually_pruned != 0;
+}
+
+/**
+ * Reconcile each pool's FULL_QUOTA/FULL flags with its quota usage.
+ *
+ * For every pool with statistics available from the mgrstat service:
+ *  - flagged FULL_QUOTA but no longer over quota: clear FULL_QUOTA and FULL;
+ *  - not flagged but over its byte or object quota: warn in the cluster log,
+ *    set FULL_QUOTA and FULL, and clear NEARFULL and BACKFILLFULL.
+ *
+ * @return true if any pool flag change was staged; false if nothing changed
+ *         or the mgrstat service is not readable
+ */
+bool OSDMonitor::update_pools_status()
+{
+  if (!mon.mgrstatmon()->is_readable())
+    return false;
+
+  bool changed = false;
+
+  for (const auto& [pool_id, pool] : osdmap.get_pools()) {
+    const pool_stat_t *pstat = mon.mgrstatmon()->get_pool_stat(pool_id);
+    if (!pstat)
+      continue;
+    const object_stat_sum_t& sum = pstat->stats.sum;
+    const string& pool_name = osdmap.get_pool_name(pool_id);
+
+    // a quota of 0 means the corresponding limit is not enforced
+    const bool over_bytes =
+      pool.quota_max_bytes > 0 &&
+      (uint64_t)sum.num_bytes >= pool.quota_max_bytes;
+    const bool over_objects =
+      pool.quota_max_objects > 0 &&
+      (uint64_t)sum.num_objects >= pool.quota_max_objects;
+    const bool pool_is_full = over_bytes || over_objects;
+
+    if (pool.has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
+      if (pool_is_full)
+        continue;
+
+      mon.clog->info() << "pool '" << pool_name
+                       << "' no longer out of quota; removing NO_QUOTA flag";
+      // below we cancel FLAG_FULL too, we'll set it again in
+      // OSDMonitor::encode_pending if it still fails the osd-full checking.
+      clear_pool_flags(pool_id,
+                       pg_pool_t::FLAG_FULL_QUOTA | pg_pool_t::FLAG_FULL);
+      changed = true;
+      continue;
+    }
+
+    if (!pool_is_full)
+      continue;
+
+    if (over_bytes) {
+      mon.clog->warn() << "pool '" << pool_name << "' is full"
+                       << " (reached quota's max_bytes: "
+                       << byte_u_t(pool.quota_max_bytes) << ")";
+    }
+    if (over_objects) {
+      mon.clog->warn() << "pool '" << pool_name << "' is full"
+                       << " (reached quota's max_objects: "
+                       << pool.quota_max_objects << ")";
+    }
+    // set both FLAG_FULL_QUOTA and FLAG_FULL
+    // note that below we try to cancel FLAG_BACKFILLFULL/NEARFULL too
+    // since FLAG_FULL should always take precedence
+    set_pool_flags(pool_id,
+                   pg_pool_t::FLAG_FULL_QUOTA | pg_pool_t::FLAG_FULL);
+    clear_pool_flags(pool_id,
+                     pg_pool_t::FLAG_NEARFULL |
+                     pg_pool_t::FLAG_BACKFILLFULL);
+    changed = true;
+  }
+  return changed;
+}
+
+/**
+ * Handle an MPoolOp pool-creation request.
+ *
+ * Forwards to the full prepare_new_pool() overload for a replicated pool
+ * named m->name using m->crush_rule, with the remaining sizing arguments
+ * passed as 0 / empty and crimson taken from osd_pool_default_crimson.
+ *
+ * @param op the request being processed
+ * @return -EPERM if the op has no session, otherwise the result of the full
+ *         overload
+ */
+int OSDMonitor::prepare_new_pool(MonOpRequestRef op)
+{
+  op->mark_osdmon_event(__func__);
+  auto m = op->get_req<MPoolOp>();
+  dout(10) << "prepare_new_pool from " << m->get_connection() << dendl;
+  if (!op->get_session())
+    return -EPERM;
+
+  const string erasure_code_profile;  // empty
+  const string rule_name;             // empty
+  const bool bulk = false;
+  stringstream ss;
+  const int ret = prepare_new_pool(m->name, m->crush_rule, rule_name,
+                                   0, 0, 0, 0, 0, 0, 0.0,
+                                   erasure_code_profile,
+                                   pg_pool_t::TYPE_REPLICATED, 0, FAST_READ_OFF, {}, bulk,
+                                   cct->_conf.get_val<bool>("osd_pool_default_crimson"),
+                                   &ss);
+
+  if (ret < 0) {
+    dout(10) << __func__ << " got " << ret << " " << ss.str() << dendl;
+  }
+  return ret;
+}
+
+/**
+ * Rename a crush bucket in the pending crush map.
+ *
+ * If no pending crush map exists yet, the rename is first validated against
+ * the stable map so that a pending map is not created for a rename that
+ * would fail anyway.
+ *
+ * @param srcname current bucket name
+ * @param dstname new bucket name
+ * @param ss      receives a success message, and is passed to the crush
+ *                helpers that validate and perform the rename
+ * @return 0 on success, otherwise the non-zero code from can_rename_bucket()
+ *         or rename_bucket()
+ */
+int OSDMonitor::crush_rename_bucket(const string& srcname,
+                                    const string& dstname,
+                                    ostream *ss)
+{
+  if (!_have_pending_crush()) {
+    const int precheck =
+      _get_stable_crush().can_rename_bucket(srcname, dstname, ss);
+    if (precheck)
+      return precheck;
+  }
+
+  CrushWrapper newcrush = _get_pending_crush();
+  const int renamed = newcrush.rename_bucket(srcname, dstname, ss);
+  if (renamed)
+    return renamed;
+
+  // replace whatever crush encoding was pending with the renamed map
+  pending_inc.crush.clear();
+  newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+  *ss << "renamed bucket " << srcname << " into " << dstname;
+  return 0;
+}
+
+/**
+ * Log a warning when an erasure code profile names a deprecated,
+ * CPU-specific plugin variant.
+ *
+ * The variants recognised are <family>_generic, <family>_sse3,
+ * <family>_sse4 and <family>_neon for the families "jerasure" and "shec";
+ * the suggested replacement is the bare family name.
+ *
+ * @param plugin  plugin name taken from the profile
+ * @param profile profile name, used only in the log message
+ */
+void OSDMonitor::check_legacy_ec_plugin(const string& plugin, const string& profile) const
+{
+  static const char* const legacy_suffixes[] = {
+    "_generic", "_sse3", "_sse4", "_neon"
+  };
+  auto is_legacy_variant_of = [&plugin](const string& family) {
+    for (const char* suffix : legacy_suffixes) {
+      if (plugin == family + suffix)
+        return true;
+    }
+    return false;
+  };
+
+  string replacement;
+  if (is_legacy_variant_of("jerasure")) {
+    replacement = "jerasure";
+  } else if (is_legacy_variant_of("shec")) {
+    replacement = "shec";
+  }
+
+  if (!replacement.empty()) {
+    dout(0) << "WARNING: erasure coding profile " << profile << " uses plugin "
+            << plugin << " that has been deprecated. Please use "
+            << replacement << " instead." << dendl;
+  }
+}
+
+/**
+ * Validate an erasure code profile by instantiating and initialising its
+ * plugin, then check the optional stripe_unit entry.
+ *
+ * @param profilename name of the profile (used for the legacy-plugin warning)
+ * @param profile     the profile; passed by non-const reference to the
+ *                    plugin factory and to init()
+ * @param force       skip the "multiple of 4096" stripe_unit check
+ * @param ss          receives human readable error text
+ * @return 0 on success; -EINVAL if the profile has no 'plugin' entry, if
+ *         stripe_unit cannot be parsed, does not match the plugin's chunk
+ *         alignment, or is not a multiple of 4096 without @p force;
+ *         otherwise the error returned by the plugin factory or init()
+ */
+int OSDMonitor::normalize_profile(const string& profilename,
+                                  ErasureCodeProfile &profile,
+                                  bool force,
+                                  ostream *ss)
+{
+  ErasureCodeInterfaceRef erasure_code;
+  ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
+  ErasureCodeProfile::const_iterator plugin = profile.find("plugin");
+  if (plugin == profile.end()) {
+    // same diagnosis as get_erasure_code(); never dereference end()
+    *ss << "cannot determine the erasure code plugin"
+        << " because there is no 'plugin' entry in the erasure_code_profile "
+        << profile << std::endl;
+    return -EINVAL;
+  }
+  check_legacy_ec_plugin(plugin->second, profilename);
+  int err = instance.factory(plugin->second,
+                             g_conf().get_val<std::string>("erasure_code_dir"),
+                             profile, &erasure_code, ss);
+  if (err) {
+    return err;
+  }
+
+  err = erasure_code->init(profile, ss);
+  if (err) {
+    return err;
+  }
+
+  auto it = profile.find("stripe_unit");
+  if (it != profile.end()) {
+    string err_str;
+    uint32_t stripe_unit = strict_iecstrtoll(it->second, &err_str);
+    if (!err_str.empty()) {
+      *ss << "could not parse stripe_unit '" << it->second
+          << "': " << err_str << std::endl;
+      return -EINVAL;
+    }
+    // the plugin may pad chunks; a stripe_unit that would be padded is
+    // rejected rather than silently changed
+    uint32_t data_chunks = erasure_code->get_data_chunk_count();
+    uint32_t chunk_size = erasure_code->get_chunk_size(stripe_unit * data_chunks);
+    if (chunk_size != stripe_unit) {
+      *ss << "stripe_unit " << stripe_unit << " does not match ec profile "
+          << "alignment. Would be padded to " << chunk_size
+          << std::endl;
+      return -EINVAL;
+    }
+    if ((stripe_unit % 4096) != 0 && !force) {
+      *ss << "stripe_unit should be a multiple of 4096 bytes for best performance."
+          << "use --force to override this check" << std::endl;
+      return -EINVAL;
+    }
+  }
+  return 0;
+}
+
+/**
+ * Create an erasure-coded crush rule in the pending crush map unless a rule
+ * of that name already exists.
+ *
+ * @param name    rule name
+ * @param profile erasure code profile used to build the rule
+ * @param rule    out: id of the existing or newly created rule
+ * @param ss      receives human readable error text
+ * @return -EEXIST if the committed crush map already has the rule,
+ *         -EALREADY if only the pending crush map has it, 0 if a new rule
+ *         was staged, -EINVAL if the resulting map fails
+ *         validate_crush_against_features(), or the error from
+ *         get_erasure_code() / create_rule()
+ */
+int OSDMonitor::crush_rule_create_erasure(const string &name,
+                                          const string &profile,
+                                          int *rule,
+                                          ostream *ss)
+{
+  const int committed_id = osdmap.crush->get_rule_id(name);
+  if (committed_id != -ENOENT) {
+    *rule = committed_id;
+    return -EEXIST;
+  }
+
+  CrushWrapper newcrush = _get_pending_crush();
+
+  const int pending_id = newcrush.get_rule_id(name);
+  if (pending_id != -ENOENT) {
+    *rule = pending_id;
+    return -EALREADY;
+  }
+
+  ErasureCodeInterfaceRef erasure_code;
+  const int load_err = get_erasure_code(profile, &erasure_code, ss);
+  if (load_err) {
+    *ss << "failed to load plugin using profile " << profile << std::endl;
+    return load_err;
+  }
+
+  // create_rule() returns the new rule id, or a negative error
+  const int new_rule = erasure_code->create_rule(name, newcrush, ss);
+  erasure_code.reset();
+  if (new_rule < 0)
+    return new_rule;
+
+  if (!validate_crush_against_features(&newcrush, *ss)) {
+    return -EINVAL;
+  }
+
+  *rule = new_rule;
+  pending_inc.crush.clear();
+  newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+  return 0;
+}
+
+/**
+ * Instantiate the erasure code plugin described by a committed profile.
+ *
+ * @param erasure_code_profile name of the profile in the osdmap
+ * @param erasure_code         out: the plugin instance
+ * @param ss                   receives human readable error text
+ * @return -EAGAIN if the pending incremental has a profile of that name,
+ *         -EINVAL if the profile has no 'plugin' entry, otherwise the result
+ *         of the plugin registry's factory()
+ */
+int OSDMonitor::get_erasure_code(const string &erasure_code_profile,
+                                 ErasureCodeInterfaceRef *erasure_code,
+                                 ostream *ss) const
+{
+  if (pending_inc.has_erasure_code_profile(erasure_code_profile))
+    return -EAGAIN;
+
+  // work on a copy of the committed profile
+  ErasureCodeProfile profile =
+    osdmap.get_erasure_code_profile(erasure_code_profile);
+  const ErasureCodeProfile::const_iterator plugin_entry =
+    profile.find("plugin");
+  if (plugin_entry == profile.end()) {
+    *ss << "cannot determine the erasure code plugin"
+        << " because there is no 'plugin' entry in the erasure_code_profile "
+        << profile << std::endl;
+    return -EINVAL;
+  }
+
+  check_legacy_ec_plugin(plugin_entry->second, erasure_code_profile);
+  auto& registry = ErasureCodePluginRegistry::instance();
+  return registry.factory(plugin_entry->second,
+                          g_conf().get_val<std::string>("erasure_code_dir"),
+                          profile, erasure_code, ss);
+}
+
+/**
+ * Check that the monitor quorum and every up OSD support a feature set.
+ *
+ * @param features required feature bits
+ * @param ss       receives the list of entities lacking the features
+ * @return 0 if all support them; -ENOTSUP if the quorum or any up OSD does
+ *         not; -EAGAIN if an OSD entry in pending_inc.new_xinfo lacks them
+ */
+int OSDMonitor::check_cluster_features(uint64_t features,
+                                       stringstream &ss)
+{
+  auto lacks_features = [features](uint64_t have) {
+    return (have & features) != features;
+  };
+
+  stringstream unsupported_ss;
+  int unsupported_count = 0;
+  if (lacks_features(mon.get_quorum_con_features())) {
+    unsupported_ss << "the monitor cluster";
+    ++unsupported_count;
+  }
+
+  set<int32_t> up_osds;
+  osdmap.get_up_osds(up_osds);
+  for (const int32_t osd : up_osds) {
+    const osd_xinfo_t &xi = osdmap.get_xinfo(osd);
+    if (!lacks_features(xi.features))
+      continue;
+    if (unsupported_count > 0)
+      unsupported_ss << ", ";
+    unsupported_ss << "osd." << osd;
+    ++unsupported_count;
+  }
+
+  if (unsupported_count > 0) {
+    ss << "features " << features << " unsupported by: "
+       << unsupported_ss.str();
+    return -ENOTSUP;
+  }
+
+  // check pending osd state, too!
+  for (const auto& pending : pending_inc.new_xinfo) {
+    if (lacks_features(pending.second.features)) {
+      dout(10) << __func__ << " pending osd." << pending.first
+               << " features are insufficient; retry" << dendl;
+      return -EAGAIN;
+    }
+  }
+
+  return 0;
+}
+
+/**
+ * Check whether a candidate crush map can be adopted by the cluster.
+ *
+ * A scratch OSDMap is built from the committed map plus a copy of the
+ * pending incremental carrying @p newcrush.  The map is rejected if it needs
+ * a newer client than require_min_compat_client allows, or if
+ * check_cluster_features() fails for the MON and OSD features it requires.
+ *
+ * @param newcrush candidate crush map
+ * @param ss       receives the reason when the map is rejected
+ * @return true if the candidate is acceptable
+ */
+bool OSDMonitor::validate_crush_against_features(const CrushWrapper *newcrush,
+                                                 ostream &ss)
+{
+  OSDMap::Incremental new_pending = pending_inc;
+  encode(*newcrush, new_pending.crush, mon.get_quorum_con_features());
+  OSDMap newmap;
+  newmap.deepish_copy_from(osdmap);
+  newmap.apply_incremental(new_pending);
+
+  // client compat
+  const auto allowed_client = newmap.require_min_compat_client;
+  if (allowed_client != ceph_release_t::unknown) {
+    const auto needed_client = newmap.get_min_compat_client();
+    if (needed_client > allowed_client) {
+      ss << "new crush map requires client version " << needed_client
+         << " but require_min_compat_client is "
+         << allowed_client;
+      return false;
+    }
+  }
+
+  // osd compat
+  const uint64_t mon_features =
+    newmap.get_features(CEPH_ENTITY_TYPE_MON, nullptr);
+  const uint64_t osd_features =
+    newmap.get_features(CEPH_ENTITY_TYPE_OSD, nullptr);
+  stringstream features_ss;
+  if (check_cluster_features(mon_features | osd_features, features_ss)) {
+    ss << "Could not change CRUSH: " << features_ss.str();
+    return false;
+  }
+
+  return true;
+}
+
+/**
+ * Report whether any erasure pool in @p pools uses the given profile.
+ *
+ * The name of every matching pool, followed by a space, is written to
+ * @p ss, then a trailing explanation if at least one was found.  Names are
+ * looked up with find() so that an id missing from osdmap.pool_name does not
+ * get a default-inserted empty entry; such a pool contributes an empty name.
+ *
+ * @param pools   pools to inspect
+ * @param profile erasure code profile name
+ * @param ss      receives the list of pools using the profile
+ * @return true if at least one erasure pool uses @p profile
+ */
+bool OSDMonitor::erasure_code_profile_in_use(
+  const mempool::osdmap::map<int64_t, pg_pool_t> &pools,
+  const string &profile,
+  ostream *ss)
+{
+  bool found = false;
+  for (const auto& [pool_id, pool] : pools) {
+    if (pool.erasure_code_profile != profile || !pool.is_erasure())
+      continue;
+    const auto name_it = osdmap.pool_name.find(pool_id);
+    if (name_it != osdmap.pool_name.end())
+      *ss << name_it->second;
+    *ss << " ";
+    found = true;
+  }
+  if (found) {
+    *ss << "pool(s) are using the erasure code profile '" << profile << "'";
+  }
+  return found;
+}
+
+/**
+ * Turn a list of "key=value" strings into an erasure code profile map.
+ *
+ * The map is first seeded from the osd_pool_default_erasure_code_profile
+ * option, then overlaid with the user's entries (an entry without '=' maps
+ * the whole string to an empty value).  If the user names a plugin that
+ * differs from the default one, the defaults are dropped and only the
+ * user's entries are kept.
+ *
+ * @param erasure_code_profile     user supplied entries
+ * @param erasure_code_profile_map out: resulting profile
+ * @param ss                       receives human readable error text
+ * @return 0 on success; the error from parsing the default profile; -EINVAL
+ *         if a key starts with the no longer supported "ruleset-" prefix
+ */
+int OSDMonitor::parse_erasure_code_profile(const vector<string> &erasure_code_profile,
+                                           map<string,string> *erasure_code_profile_map,
+                                           ostream *ss)
+{
+  int r = g_conf().with_val<string>("osd_pool_default_erasure_code_profile",
+                                    get_json_str_map,
+                                    *ss,
+                                    erasure_code_profile_map,
+                                    true);
+  if (r)
+    return r;
+
+  auto& merged = *erasure_code_profile_map;
+  ceph_assert(merged.count("plugin"));
+  const string default_plugin = merged["plugin"];
+
+  map<string,string> user_map;
+  for (const string& entry : erasure_code_profile) {
+    const size_t eq_pos = entry.find('=');
+    if (eq_pos == string::npos) {
+      user_map[entry] = string();
+      merged[entry] = string();
+      continue;
+    }
+    const string key = entry.substr(0, eq_pos);
+    const string value = entry.substr(eq_pos + 1);
+    if (key.find("ruleset-") == 0) {
+      // key.substr(8) strips the "ruleset-" prefix
+      *ss << "property '" << key << "' is no longer supported; try "
+          << "'crush-" << key.substr(8) << "' instead";
+      return -EINVAL;
+    }
+    user_map[key] = value;
+    merged[key] = value;
+  }
+
+  // defaults of one plugin make no sense for another: keep only user input
+  const auto user_plugin = user_map.find("plugin");
+  if (user_plugin != user_map.end() && user_plugin->second != default_plugin)
+    merged = user_map;
+
+  return 0;
+}
+
+/**
+ * Work out size and min_size for a new pool.
+ *
+ * Replicated pools: a @p repl_size of 0 selects osd_pool_default_size.  In
+ * stretch mode the size must equal mon_stretch_pool_size (0 selects it) and
+ * min_size comes from mon_stretch_pool_min_size; otherwise min_size comes
+ * from get_osd_pool_default_min_size().
+ *
+ * Erasure pools: size is the plugin's chunk count and min_size is the data
+ * chunk count plus min(1, coding chunk count - 1).  Erasure pools are
+ * refused in stretch mode.
+ *
+ * @param pool_type            pg_pool_t::TYPE_REPLICATED or TYPE_ERASURE
+ * @param erasure_code_profile profile name (erasure pools only)
+ * @param repl_size            requested replica count, or 0 for the default
+ * @param size                 out: pool size
+ * @param min_size             out: pool min_size
+ * @param ss                   receives human readable error text
+ * @return 0 on success, -EINVAL for an unknown pool type or a stretch mode
+ *         violation, or the error from get_erasure_code()
+ */
+int OSDMonitor::prepare_pool_size(const unsigned pool_type,
+                                  const string &erasure_code_profile,
+                                  uint8_t repl_size,
+                                  unsigned *size, unsigned *min_size,
+                                  ostream *ss)
+{
+  int err = 0;
+  bool set_min_size = false;
+  switch (pool_type) {
+  case pg_pool_t::TYPE_REPLICATED:
+    if (osdmap.stretch_mode_enabled) {
+      const auto stretch_size =
+        g_conf().get_val<uint64_t>("mon_stretch_pool_size");
+      if (repl_size == 0)
+        repl_size = stretch_size;
+      if (repl_size != stretch_size) {
+        // repl_size is a uint8_t: cast it so the stream prints a number
+        // rather than a raw character
+        *ss << "prepare_pool_size: we are in stretch mode but size "
+            << static_cast<unsigned>(repl_size) << " does not match!";
+        return -EINVAL;
+      }
+      *min_size = g_conf().get_val<uint64_t>("mon_stretch_pool_min_size");
+      set_min_size = true;
+    }
+    if (repl_size == 0) {
+      repl_size = g_conf().get_val<uint64_t>("osd_pool_default_size");
+    }
+    *size = repl_size;
+    if (!set_min_size)
+      *min_size = g_conf().get_osd_pool_default_min_size(repl_size);
+    break;
+  case pg_pool_t::TYPE_ERASURE:
+    {
+      if (osdmap.stretch_mode_enabled) {
+        *ss << "prepare_pool_size: we are in stretch mode; cannot create EC pools!";
+        return -EINVAL;
+      }
+      ErasureCodeInterfaceRef erasure_code;
+      err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
+      if (err == 0) {
+        *size = erasure_code->get_chunk_count();
+        *min_size =
+          erasure_code->get_data_chunk_count() +
+          std::min<int>(1, erasure_code->get_coding_chunk_count() - 1);
+        assert(*min_size <= *size);
+        assert(*min_size >= erasure_code->get_data_chunk_count());
+      }
+    }
+    break;
+  default:
+    *ss << "prepare_pool_size: " << pool_type << " is not a known pool type";
+    err = -EINVAL;
+    break;
+  }
+  return err;
+}
+
+/**
+ * Compute the stripe width of a new pool.
+ *
+ * Replicated pools leave @p stripe_width untouched.  For erasure pools the
+ * stripe unit is the profile's stripe_unit entry if present, otherwise the
+ * osd_pool_erasure_code_stripe_unit option, and the width is
+ * data_chunks * get_chunk_size(stripe_unit * data_chunks).
+ *
+ * @param pool_type            pg_pool_t::TYPE_REPLICATED or TYPE_ERASURE
+ * @param erasure_code_profile profile name (erasure pools only)
+ * @param stripe_width         out: stripe width (erasure pools only)
+ * @param ss                   receives human readable error text
+ * @return 0 on success, -EINVAL for an unknown pool type, or the error from
+ *         get_erasure_code()
+ */
+int OSDMonitor::prepare_pool_stripe_width(const unsigned pool_type,
+                                          const string &erasure_code_profile,
+                                          uint32_t *stripe_width,
+                                          ostream *ss)
+{
+  switch (pool_type) {
+  case pg_pool_t::TYPE_REPLICATED:
+    // ignored
+    return 0;
+  case pg_pool_t::TYPE_ERASURE:
+    break;
+  default:
+    *ss << "prepare_pool_stripe_width: "
+        << pool_type << " is not a known pool type";
+    return -EINVAL;
+  }
+
+  ErasureCodeProfile profile =
+    osdmap.get_erasure_code_profile(erasure_code_profile);
+  ErasureCodeInterfaceRef erasure_code;
+  const int err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
+  if (err)
+    return err;
+
+  const uint32_t data_chunks = erasure_code->get_data_chunk_count();
+  uint32_t stripe_unit = g_conf().get_val<Option::size_t>("osd_pool_erasure_code_stripe_unit");
+  const auto unit_entry = profile.find("stripe_unit");
+  if (unit_entry != profile.end()) {
+    string err_str;
+    stripe_unit = strict_iecstrtoll(unit_entry->second, &err_str);
+    // a stored stripe_unit that fails to parse is treated as a bug
+    ceph_assert(err_str.empty());
+  }
+  *stripe_width = data_chunks *
+    erasure_code->get_chunk_size(stripe_unit * data_chunks);
+  return 0;
+}
+
+/**
+ * Guess which crush rule is "the" stretch rule.
+ *
+ * The stretch rule is not recorded anywhere, so the rule used by the largest
+ * number of replicated stretch pools is returned.  On a tie the rule with
+ * the lowest id wins, because candidates are visited in ascending id order
+ * and only a strictly larger count replaces the current best.
+ *
+ * @return the most used rule id, or -ENOENT if there is no replicated
+ *         stretch pool
+ */
+int OSDMonitor::get_replicated_stretch_crush_rule()
+{
+  map<int,int> users_by_rule;
+  for (const auto& pooli : osdmap.pools) {
+    const pg_pool_t& pool = pooli.second;
+    if (pool.is_replicated() && pool.is_stretch_pool()) {
+      // operator[] value-initialises a new counter to 0
+      ++users_by_rule[pool.crush_rule];
+    }
+  }
+
+  if (users_by_rule.empty()) {
+    return -ENOENT;
+  }
+
+  int best_rule = -1;
+  int best_count = 0;
+  for (const auto& [rule, users] : users_by_rule) {
+    if (users > best_count) {
+      best_rule = rule;
+      best_count = users;
+    }
+  }
+  ceph_assert(best_count > 0);
+  ceph_assert(best_rule >= 0);
+  return best_rule;
+}
+
+/**
+ * Resolve the crush rule for a new pool.
+ *
+ * If *crush_rule is already >= 0 it only has to exist in the committed crush
+ * map.  Otherwise:
+ *  - replicated pools use @p rule_name if given, else the stretch rule (in
+ *    stretch mode) or the default replicated rule;
+ *  - erasure pools go through crush_rule_create_erasure().
+ *
+ * @param pool_type            pg_pool_t::TYPE_REPLICATED or TYPE_ERASURE
+ * @param erasure_code_profile profile name (erasure pools only)
+ * @param rule_name            requested rule name, may be empty
+ * @param crush_rule           in/out: rule id; < 0 on input means "pick one"
+ * @param ss                   receives human readable error text
+ * @return 0 on success; -ENOENT if no suitable rule exists; -EAGAIN if an
+ *         erasure rule was just staged or is still pending; -EINVAL for an
+ *         unknown pool type; otherwise the error from the helper called
+ */
+int OSDMonitor::prepare_pool_crush_rule(const unsigned pool_type,
+                                        const string &erasure_code_profile,
+                                        const string &rule_name,
+                                        int *crush_rule,
+                                        ostream *ss)
+{
+  // an explicit rule id only needs to exist
+  if (*crush_rule >= 0) {
+    if (!osdmap.crush->rule_exists(*crush_rule)) {
+      *ss << "CRUSH rule " << *crush_rule << " not found";
+      return -ENOENT;
+    }
+    return 0;
+  }
+
+  switch (pool_type) {
+  case pg_pool_t::TYPE_REPLICATED:
+    {
+      if (!rule_name.empty()) {
+        return get_crush_rule(rule_name, crush_rule, ss);
+      }
+      if (osdmap.stretch_mode_enabled) {
+        *crush_rule = get_replicated_stretch_crush_rule();
+      } else {
+        // Use default rule
+        *crush_rule = osdmap.crush->get_osd_pool_default_crush_replicated_rule(cct);
+      }
+      if (*crush_rule < 0) {
+        // Errors may happen e.g. if no valid rule is available
+        *ss << "No suitable CRUSH rule exists, check "
+            << "'osd pool default crush *' config options";
+        return -ENOENT;
+      }
+      return 0;
+    }
+  case pg_pool_t::TYPE_ERASURE:
+    {
+      const int err = crush_rule_create_erasure(rule_name,
+                                                erasure_code_profile,
+                                                crush_rule, ss);
+      if (err == -EALREADY) {
+        dout(20) << "prepare_pool_crush_rule: rule "
+                 << rule_name << " try again" << dendl;
+        return -EAGAIN;
+      }
+      if (err == 0) {
+        // need to wait for the crush rule to be proposed before proceeding
+        return -EAGAIN;
+      }
+      if (err == -EEXIST) {
+        // the rule is already committed and *crush_rule was filled in
+        return 0;
+      }
+      return err;
+    }
+  default:
+    *ss << "prepare_pool_crush_rule: " << pool_type
+        << " is not a known pool type";
+    return -EINVAL;
+  }
+}
+
+/**
+ * Look up a crush rule id by name.
+ *
+ * @param rule_name  name of the rule
+ * @param crush_rule out: rule id, written only when the committed crush map
+ *                   has the rule
+ * @param ss         receives human readable error text
+ * @return 0 if the rule is in the committed crush map; -EAGAIN if it exists
+ *         only in the pending crush map; -ENOENT if it exists in neither
+ */
+int OSDMonitor::get_crush_rule(const string &rule_name,
+                               int *crush_rule,
+                               ostream *ss)
+{
+  const int committed_id = osdmap.crush->get_rule_id(rule_name);
+  if (committed_id != -ENOENT) {
+    // found it, use it
+    *crush_rule = committed_id;
+    return 0;
+  }
+
+  CrushWrapper newcrush = _get_pending_crush();
+  const int pending_id = newcrush.get_rule_id(rule_name);
+  if (pending_id != -ENOENT) {
+    // found it, wait for it to be proposed
+    dout(20) << __func__ << ": rule " << rule_name
+             << " try again" << dendl;
+    return -EAGAIN;
+  }
+
+  // Cannot find it , return error
+  *ss << "specified rule " << rule_name << " doesn't exist";
+  return pending_id;
+}
+
+/**
+ * Count the distinct leaves found beneath every root that
+ * find_takes_by_rule() reports for the given rule, using the pending crush
+ * map.
+ *
+ * NOTE(review): the set subtracted from each root's leaves is never
+ * populated in this function, so nothing is actually filtered out here --
+ * confirm whether "out" OSDs were meant to be excluded.
+ *
+ * @param crush_rule id of the crush rule to inspect
+ * @return number of distinct leaves collected across the rule's roots
+ */
+uint32_t OSDMonitor::get_osd_num_by_crush(int crush_rule)
+{
+  CrushWrapper newcrush = _get_pending_crush();
+
+  set<int> roots;
+  newcrush.find_takes_by_rule(crush_rule, &roots);
+
+  const set<int> excluded_osds;  // always empty, see note above
+  set<int> counted_osds;
+  for (const int root : roots) {
+    set<int> leaves;
+    newcrush.get_leaves(newcrush.get_item_name(root), &leaves);
+    std::set_difference(leaves.begin(), leaves.end(),
+                        excluded_osds.begin(), excluded_osds.end(),
+                        std::inserter(counted_osds, counted_osds.end()));
+  }
+  return counted_osds.size();
+}
+
+/**
+ * Check that a pool's pg_num/size would not push the projected number of PG
+ * replicas per OSD above mon_max_pg_per_osd.
+ *
+ * The projection sums pg_num * size over the pools reported by
+ * get_pool_ids_by_rule() for @p crush_rule (using the given values for
+ * @p pool and pg_num_target * size for the others), plus the new pool when
+ * @p pool is negative, and divides by the OSD count from
+ * get_osd_num_by_crush() (never less than 3).
+ *
+ * @param pool       pool id, or < 0 for a pool that does not exist yet
+ * @param pg_num     pg_num to evaluate for that pool
+ * @param size       pool size to evaluate for that pool
+ * @param crush_rule crush rule of the pool
+ * @param ss         receives human readable error text
+ * @return 0 if within the limit, -ERANGE otherwise
+ */
+int OSDMonitor::check_pg_num(int64_t pool,
+                             int pg_num,
+                             int size,
+                             int crush_rule,
+                             ostream *ss)
+{
+  const auto max_pgs_per_osd = g_conf().get_val<uint64_t>("mon_max_pg_per_osd");
+
+  uint64_t projected = 0;
+  if (pool < 0) {
+    // a new pool
+    projected += pg_num * size;
+  }
+
+  uint32_t osd_num_by_crush = get_osd_num_by_crush(crush_rule);
+  set<int64_t> crush_pool_ids;
+  osdmap.get_pool_ids_by_rule(crush_rule, &crush_pool_ids);
+
+  for (const auto& [pool_id, pool_info] : osdmap.get_pools()) {
+    // Check only for pools affected by crush rule
+    if (!crush_pool_ids.contains(pool_id))
+      continue;
+    if (pool_id == pool) {
+      // Specified pool, use given pg_num and size values.
+      projected += pg_num * size;
+    } else {
+      // Use pg_num_target for evaluating the projected pg num
+      projected += pool_info.get_pg_num_target() * pool_info.get_size();
+    }
+  }
+
+  // assume min cluster size 3
+  osd_num_by_crush = std::max(osd_num_by_crush, 3u);
+  const auto projected_pgs_per_osd = projected / osd_num_by_crush;
+  if (projected_pgs_per_osd <= max_pgs_per_osd)
+    return 0;
+
+  if (pool >= 0) {
+    *ss << "pool id " << pool;
+  }
+  *ss << " pg_num " << pg_num
+      << " size " << size
+      << " for this pool would result in "
+      << projected_pgs_per_osd
+      << " cumulative PGs per OSD (" << projected
+      << " total PG replicas on " << osd_num_by_crush
+      << " 'in' root OSDs by crush rule) "
+      << "which exceeds the mon_max_pg_per_osd "
+      << "value of " << max_pgs_per_osd;
+  return -ERANGE;
+}
+
+/**
+ * @param name The name of the new pool
+ * @param crush_rule The crush rule to use. If <0, will use the system default
+ * @param crush_rule_name The name of the crush rule to use, if crush_rule <0
+ * @param pg_num The pg_num to use. If set to 0, will use the system default
+ * @param pgp_num The pgp_num to use. If set to 0, will use the system default
+ * @param pg_num_min min pg_num
+ * @param pg_num_max max pg_num
+ * @param repl_size Replication factor, or 0 for default
+ * @param erasure_code_profile The profile name in OSDMap to be used for erasure code
+ * @param pool_type TYPE_ERASURE, or TYPE_REPLICATED
+ * @param expected_num_objects expected number of objects on the pool
+ * @param fast_read fast read type.
+ * @param pg_autoscale_mode autoscale mode, one of on, off, warn
+ * @param bool bulk indicates whether pool should be a bulk pool
+ * @param bool crimson indicates whether pool is a crimson pool
+ * @param ss human readable error message, if any.
+ *
+ * @return 0 on success, negative errno on failure.
+ */
+int OSDMonitor::prepare_new_pool(string& name,
+ int crush_rule,
+ const string &crush_rule_name,
+ unsigned pg_num, unsigned pgp_num,
+ unsigned pg_num_min,
+ unsigned pg_num_max,
+ const uint64_t repl_size,
+ const uint64_t target_size_bytes,
+ const float target_size_ratio,
+ const string &erasure_code_profile,
+ const unsigned pool_type,
+ const uint64_t expected_num_objects,
+ FastReadType fast_read,
+ string pg_autoscale_mode,
+ bool bulk,
+ bool crimson,
+ ostream *ss)
+{
+ if (crimson && pg_autoscale_mode.empty()) {
+ // default pg_autoscale_mode to off for crimson, we'll error out below if
+ // the user tried to actually set pg_autoscale_mode to something other than
+ // "off"
+ pg_autoscale_mode = "off";
+ }
+
+ if (name.length() == 0)
+ return -EINVAL;
+
+ if (pg_num == 0) {
+ auto pg_num_from_mode =
+ [pg_num=g_conf().get_val<uint64_t>("osd_pool_default_pg_num")]
+ (const string& mode) {
+ return mode == "on" ? 1 : pg_num;
+ };
+ pg_num = pg_num_from_mode(
+ pg_autoscale_mode.empty() ?
+ g_conf().get_val<string>("osd_pool_default_pg_autoscale_mode") :
+ pg_autoscale_mode);
+ }
+ if (pgp_num == 0)
+ pgp_num = g_conf().get_val<uint64_t>("osd_pool_default_pgp_num");
+ if (!pgp_num)
+ pgp_num = pg_num;
+ if (pg_num > g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
+ *ss << "'pg_num' must be greater than 0 and less than or equal to "
+ << g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
+ << " (you may adjust 'mon max pool pg num' for higher values)";
+ return -ERANGE;
+ }
+ if (pgp_num > pg_num) {
+ *ss << "'pgp_num' must be greater than 0 and lower or equal than 'pg_num'"
+ << ", which in this case is " << pg_num;
+ return -ERANGE;
+ }
+
+ if (crimson) {
+ /* crimson-osd requires that the pool be replicated and that pg_num/pgp_num
+ * be static. User must also have specified set-allow-crimson */
+ const auto *suffix = " (--crimson specified or osd_pool_default_crimson set)";
+ if (pg_autoscale_mode != "off") {
+ *ss << "crimson-osd does not support changing pg_num or pgp_num, "
+ << "pg_autoscale_mode must be set to 'off'" << suffix;
+ return -EINVAL;
+ } else if (!osdmap.get_allow_crimson()) {
+ *ss << "set-allow-crimson must be set to create a pool with the "
+ << "crimson flag" << suffix;
+ return -EINVAL;
+ }
+ }
+
+ if (pool_type == pg_pool_t::TYPE_REPLICATED && fast_read == FAST_READ_ON) {
+ *ss << "'fast_read' can only apply to erasure coding pool";
+ return -EINVAL;
+ }
+ int r;
+ r = prepare_pool_crush_rule(pool_type, erasure_code_profile,
+ crush_rule_name, &crush_rule, ss);
+ if (r) {
+ dout(10) << "prepare_pool_crush_rule returns " << r << dendl;
+ return r;
+ }
+ unsigned size, min_size;
+ r = prepare_pool_size(pool_type, erasure_code_profile, repl_size,
+ &size, &min_size, ss);
+ if (r) {
+ dout(10) << "prepare_pool_size returns " << r << dendl;
+ return r;
+ }
+ if (g_conf()->mon_osd_crush_smoke_test) {
+ CrushWrapper newcrush = _get_pending_crush();
+ ostringstream err;
+ CrushTester tester(newcrush, err);
+ tester.set_min_x(0);
+ tester.set_max_x(50);
+ tester.set_rule(crush_rule);
+ tester.set_num_rep(size);
+ auto start = ceph::coarse_mono_clock::now();
+ r = tester.test_with_fork(cct, g_conf()->mon_lease);
+ dout(10) << __func__ << " crush test_with_fork tester created " << dendl;
+ auto duration = ceph::coarse_mono_clock::now() - start;
+ if (r < 0) {
+ dout(10) << "tester.test_with_fork returns " << r
+ << ": " << err.str() << dendl;
+ *ss << "crush test failed with " << r << ": " << err.str();
+ return r;
+ }
+ dout(10) << __func__ << " crush smoke test duration: "
+ << duration << dendl;
+ }
+ r = check_pg_num(-1, pg_num, size, crush_rule, ss);
+ if (r) {
+ dout(10) << "check_pg_num returns " << r << dendl;
+ return r;
+ }
+
+ if (!osdmap.crush->rule_valid_for_pool_type(crush_rule, pool_type)) {
+ *ss << "crush rule " << crush_rule << " type does not match pool";
+ return -EINVAL;
+ }
+
+ uint32_t stripe_width = 0;
+ r = prepare_pool_stripe_width(pool_type, erasure_code_profile, &stripe_width, ss);
+ if (r) {
+ dout(10) << "prepare_pool_stripe_width returns " << r << dendl;
+ return r;
+ }
+
+ bool fread = false;
+ if (pool_type == pg_pool_t::TYPE_ERASURE) {
+ switch (fast_read) {
+ case FAST_READ_OFF:
+ fread = false;
+ break;
+ case FAST_READ_ON:
+ fread = true;
+ break;
+ case FAST_READ_DEFAULT:
+ fread = g_conf()->osd_pool_default_ec_fast_read;
+ break;
+ default:
+ *ss << "invalid fast_read setting: " << fast_read;
+ return -EINVAL;
+ }
+ }
+
+ for (map<int64_t,string>::iterator p = pending_inc.new_pool_names.begin();
+ p != pending_inc.new_pool_names.end();
+ ++p) {
+ if (p->second == name)
+ return 0;
+ }
+
+ if (-1 == pending_inc.new_pool_max)
+ pending_inc.new_pool_max = osdmap.pool_max;
+ int64_t pool = ++pending_inc.new_pool_max;
+ pg_pool_t empty;
+ pg_pool_t *pi = pending_inc.get_new_pool(pool, &empty);
+ pi->create_time = ceph_clock_now();
+ pi->type = pool_type;
+ pi->fast_read = fread;
+ pi->flags = g_conf()->osd_pool_default_flags;
+ if (bulk) {
+ pi->set_flag(pg_pool_t::FLAG_BULK);
+ } else if (g_conf()->osd_pool_default_flag_bulk) {
+ pi->set_flag(pg_pool_t::FLAG_BULK);
+ }
+ if (g_conf()->osd_pool_default_flag_hashpspool)
+ pi->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
+ if (g_conf()->osd_pool_default_flag_nodelete)
+ pi->set_flag(pg_pool_t::FLAG_NODELETE);
+ if (g_conf()->osd_pool_default_flag_nopgchange)
+ pi->set_flag(pg_pool_t::FLAG_NOPGCHANGE);
+ if (g_conf()->osd_pool_default_flag_nosizechange)
+ pi->set_flag(pg_pool_t::FLAG_NOSIZECHANGE);
+ pi->set_flag(pg_pool_t::FLAG_CREATING);
+ if (g_conf()->osd_pool_use_gmt_hitset)
+ pi->use_gmt_hitset = true;
+ else
+ pi->use_gmt_hitset = false;
+ if (crimson) {
+ pi->set_flag(pg_pool_t::FLAG_CRIMSON);
+ pi->set_flag(pg_pool_t::FLAG_NOPGCHANGE);
+ }
+
+ pi->size = size;
+ pi->min_size = min_size;
+ pi->crush_rule = crush_rule;
+ pi->expected_num_objects = expected_num_objects;
+ pi->object_hash = CEPH_STR_HASH_RJENKINS;
+ if (osdmap.stretch_mode_enabled) {
+ pi->peering_crush_bucket_count = osdmap.stretch_bucket_count;
+ pi->peering_crush_bucket_target = osdmap.stretch_bucket_count;
+ pi->peering_crush_bucket_barrier = osdmap.stretch_mode_bucket;
+ pi->peering_crush_mandatory_member = CRUSH_ITEM_NONE;
+ if (osdmap.degraded_stretch_mode) {
+ pi->peering_crush_bucket_count = osdmap.degraded_stretch_mode;
+ pi->peering_crush_bucket_target = osdmap.degraded_stretch_mode;
+ // pi->peering_crush_bucket_mandatory_member = CRUSH_ITEM_NONE;
+ // TODO: drat, we don't record this ^ anywhere, though given that it
+ // necessarily won't exist elsewhere it likely doesn't matter
+ pi->min_size = pi->min_size / 2;
+ pi->size = pi->size / 2; // only support 2 zones now
+ }
+ }
+
+ if (auto m = pg_pool_t::get_pg_autoscale_mode_by_name(
+ g_conf().get_val<string>("osd_pool_default_pg_autoscale_mode"));
+ m != pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
+ pi->pg_autoscale_mode = m;
+ } else {
+ pi->pg_autoscale_mode = pg_pool_t::pg_autoscale_mode_t::OFF;
+ }
+ auto max = g_conf().get_val<int64_t>("mon_osd_max_initial_pgs");
+ pi->set_pg_num(
+ max > 0 ? std::min<uint64_t>(pg_num, std::max<int64_t>(1, max))
+ : pg_num);
+ pi->set_pg_num_pending(pi->get_pg_num());
+ pi->set_pg_num_target(pg_num);
+ pi->set_pgp_num(pi->get_pg_num());
+ pi->set_pgp_num_target(pgp_num);
+ if (osdmap.require_osd_release >= ceph_release_t::nautilus &&
+ pg_num_min) {
+ pi->opts.set(pool_opts_t::PG_NUM_MIN, static_cast<int64_t>(pg_num_min));
+ }
+ if (osdmap.require_osd_release >= ceph_release_t::quincy &&
+ pg_num_max) {
+ pi->opts.set(pool_opts_t::PG_NUM_MAX, static_cast<int64_t>(pg_num_max));
+ }
+ if (auto m = pg_pool_t::get_pg_autoscale_mode_by_name(
+ pg_autoscale_mode); m != pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
+ pi->pg_autoscale_mode = m;
+ }
+
+ pi->last_change = pending_inc.epoch;
+ pi->auid = 0;
+
+ if (pool_type == pg_pool_t::TYPE_ERASURE) {
+ pi->erasure_code_profile = erasure_code_profile;
+ } else {
+ pi->erasure_code_profile = "";
+ }
+ pi->stripe_width = stripe_width;
+
+ if (osdmap.require_osd_release >= ceph_release_t::nautilus &&
+ target_size_bytes) {
+ // only store for nautilus+ because TARGET_SIZE_BYTES may be
+ // larger than int32_t max.
+ pi->opts.set(pool_opts_t::TARGET_SIZE_BYTES, static_cast<int64_t>(target_size_bytes));
+ }
+ if (target_size_ratio > 0.0 &&
+ osdmap.require_osd_release >= ceph_release_t::nautilus) {
+ // only store for nautilus+, just to be consistent and tidy.
+ pi->opts.set(pool_opts_t::TARGET_SIZE_RATIO, target_size_ratio);
+ }
+
+ pi->cache_target_dirty_ratio_micro =
+ g_conf()->osd_pool_default_cache_target_dirty_ratio * 1000000;
+ pi->cache_target_dirty_high_ratio_micro =
+ g_conf()->osd_pool_default_cache_target_dirty_high_ratio * 1000000;
+ pi->cache_target_full_ratio_micro =
+ g_conf()->osd_pool_default_cache_target_full_ratio * 1000000;
+ pi->cache_min_flush_age = g_conf()->osd_pool_default_cache_min_flush_age;
+ pi->cache_min_evict_age = g_conf()->osd_pool_default_cache_min_evict_age;
+
+ pending_inc.new_pool_names[pool] = name;
+ return 0;
+}
+
+/**
+ * Stage setting an OSDMap flag in the pending incremental and arrange for
+ * the command reply to be sent once the change commits.
+ *
+ * @param op   the request being handled
+ * @param flag flag bit(s) to OR into the pending flag word
+ * @return always true
+ */
+bool OSDMonitor::prepare_set_flag(MonOpRequestRef op, int flag)
+{
+  op->mark_osdmon_event(__func__);
+
+  // A negative new_flags is replaced by the current map's flags before the
+  // bit is applied (presumably it marks "no flag change staged yet").
+  if (pending_inc.new_flags < 0) {
+    pending_inc.new_flags = osdmap.get_flags();
+  }
+  pending_inc.new_flags |= flag;
+
+  ostringstream reply;
+  reply << OSDMap::get_flag_string(flag) << " is set";
+  auto *on_commit = new Monitor::C_Command(mon, op, 0, reply.str(),
+                                           get_last_committed() + 1);
+  wait_for_commit(op, on_commit);
+  return true;
+}
+
+/**
+ * Stage clearing an OSDMap flag in the pending incremental and arrange for
+ * the command reply to be sent once the change commits.
+ *
+ * @param op   the request being handled
+ * @param flag flag bit(s) to mask out of the pending flag word
+ * @return always true
+ */
+bool OSDMonitor::prepare_unset_flag(MonOpRequestRef op, int flag)
+{
+  op->mark_osdmon_event(__func__);
+
+  // A negative new_flags is replaced by the current map's flags before the
+  // bit is cleared (presumably it marks "no flag change staged yet").
+  if (pending_inc.new_flags < 0) {
+    pending_inc.new_flags = osdmap.get_flags();
+  }
+  pending_inc.new_flags &= ~flag;
+
+  ostringstream reply;
+  reply << OSDMap::get_flag_string(flag) << " is unset";
+  auto *on_commit = new Monitor::C_Command(mon, op, 0, reply.str(),
+                                           get_last_committed() + 1);
+  wait_for_commit(op, on_commit);
+  return true;
+}
+
+int OSDMonitor::prepare_command_pool_set(const cmdmap_t& cmdmap,
+ stringstream& ss)
+{
+ string poolstr;
+ cmd_getval(cmdmap, "pool", poolstr);
+ int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
+ if (pool < 0) {
+ ss << "unrecognized pool '" << poolstr << "'";
+ return -ENOENT;
+ }
+ string var;
+ cmd_getval(cmdmap, "var", var);
+
+ pg_pool_t p = *osdmap.get_pg_pool(pool);
+ if (pending_inc.new_pools.count(pool))
+ p = pending_inc.new_pools[pool];
+
+ // accept val as a json string in the normal case (current
+ // generation monitor). parse out int or float values from the
+ // string as needed. however, if it is not a string, try to pull
+ // out an int, in case an older monitor with an older json schema is
+ // forwarding a request.
+ string val;
+ string interr, floaterr;
+ int64_t n = 0;
+ double f = 0;
+ int64_t uf = 0; // micro-f
+ cmd_getval(cmdmap, "val", val);
+
+ auto si_options = {
+ "target_max_objects"
+ };
+ auto iec_options = {
+ "target_max_bytes",
+ "target_size_bytes",
+ "compression_max_blob_size",
+ "compression_min_blob_size",
+ "csum_max_block",
+ "csum_min_block",
+ };
+ if (count(begin(si_options), end(si_options), var)) {
+ n = strict_si_cast<int64_t>(val, &interr);
+ } else if (count(begin(iec_options), end(iec_options), var)) {
+ n = strict_iec_cast<int64_t>(val, &interr);
+ } else {
+ // parse string as both int and float; different fields use different types.
+ n = strict_strtoll(val.c_str(), 10, &interr);
+ f = strict_strtod(val.c_str(), &floaterr);
+ uf = llrintl(f * (double)1000000.0);
+ }
+
+ if (!p.is_tier() &&
+ (var == "hit_set_type" || var == "hit_set_period" ||
+ var == "hit_set_count" || var == "hit_set_fpp" ||
+ var == "target_max_objects" || var == "target_max_bytes" ||
+ var == "cache_target_full_ratio" || var == "cache_target_dirty_ratio" ||
+ var == "cache_target_dirty_high_ratio" || var == "use_gmt_hitset" ||
+ var == "cache_min_flush_age" || var == "cache_min_evict_age" ||
+ var == "hit_set_grade_decay_rate" || var == "hit_set_search_last_n" ||
+ var == "min_read_recency_for_promote" || var == "min_write_recency_for_promote")) {
+ return -EACCES;
+ }
+
+ if (!p.is_replicated() &&
+ (var == "read_ratio")) {
+ return -EACCES;
+ }
+
+ if (var == "size") {
+ if (p.has_flag(pg_pool_t::FLAG_NOSIZECHANGE)) {
+ ss << "pool size change is disabled; you must unset nosizechange flag for the pool first";
+ return -EPERM;
+ }
+ if (p.type == pg_pool_t::TYPE_ERASURE) {
+ ss << "can not change the size of an erasure-coded pool";
+ return -ENOTSUP;
+ }
+ if (interr.length()) {
+ ss << "error parsing integer value '" << val << "': " << interr;
+ return -EINVAL;
+ }
+ if (n <= 0 || n > 10) {
+ ss << "pool size must be between 1 and 10";
+ return -EINVAL;
+ }
+ if (n == 1) {
+ if (!g_conf().get_val<bool>("mon_allow_pool_size_one")) {
+ ss << "configuring pool size as 1 is disabled by default.";
+ return -EPERM;
+ }
+ bool sure = false;
+ cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
+ if (!sure) { ss << "WARNING: setting pool size 1 could lead to data loss "
+ "without recovery. If you are *ABSOLUTELY CERTAIN* that is what you want, "
+ "pass the flag --yes-i-really-mean-it.";
+ return -EPERM;
+ }
+ }
+ if (!osdmap.crush->rule_valid_for_pool_type(p.get_crush_rule(), p.type)) {
+ ss << "crush rule " << p.get_crush_rule() << " type does not match pool";
+ return -EINVAL;
+ }
+ if (n > p.size) {
+ // only when increasing pool size
+ int r = check_pg_num(pool, p.get_pg_num(), n, p.get_crush_rule(), &ss);
+ if (r < 0) {
+ return r;
+ }
+ }
+ p.size = n;
+ p.min_size = g_conf().get_osd_pool_default_min_size(p.size);
+ } else if (var == "min_size") {
+ if (p.has_flag(pg_pool_t::FLAG_NOSIZECHANGE)) {
+ ss << "pool min size change is disabled; you must unset nosizechange flag for the pool first";
+ return -EPERM;
+ }
+ if (interr.length()) {
+ ss << "error parsing integer value '" << val << "': " << interr;
+ return -EINVAL;
+ }
+
+ if (p.type != pg_pool_t::TYPE_ERASURE) {
+ if (n < 1 || n > p.size) {
+ ss << "pool min_size must be between 1 and size, which is set to " << (int)p.size;
+ return -EINVAL;
+ }
+ } else {
+ ErasureCodeInterfaceRef erasure_code;
+ int k;
+ stringstream tmp;
+ int err = get_erasure_code(p.erasure_code_profile, &erasure_code, &tmp);
+ if (err == 0) {
+ k = erasure_code->get_data_chunk_count();
+ } else {
+ ss << __func__ << " get_erasure_code failed: " << tmp.str();
+ return err;
+ }
+
+ if (n < k || n > p.size) {
+ ss << "pool min_size must be between " << k << " and size, which is set to " << (int)p.size;
+ return -EINVAL;
+ }
+ }
+ p.min_size = n;
+ } else if (var == "pg_num_actual") {
+ if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
+ ss << "pool pg_num change is disabled; you must unset nopgchange flag for the pool first";
+ return -EPERM;
+ }
+ if (interr.length()) {
+ ss << "error parsing integer value '" << val << "': " << interr;
+ return -EINVAL;
+ }
+ if (n == (int)p.get_pg_num()) {
+ return 0;
+ }
+ if (static_cast<uint64_t>(n) > g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
+ ss << "'pg_num' must be greater than 0 and less than or equal to "
+ << g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
+ << " (you may adjust 'mon max pool pg num' for higher values)";
+ return -ERANGE;
+ }
+ if (p.has_flag(pg_pool_t::FLAG_CREATING)) {
+ ss << "cannot adjust pg_num while initial PGs are being created";
+ return -EBUSY;
+ }
+ if (n > (int)p.get_pg_num()) {
+ if (p.get_pg_num() != p.get_pg_num_pending()) {
+ // force pre-nautilus clients to resend their ops, since they
+ // don't understand pg_num_pending changes form a new interval
+ p.last_force_op_resend_prenautilus = pending_inc.epoch;
+ }
+ p.set_pg_num(n);
+ } else {
+ if (osdmap.require_osd_release < ceph_release_t::nautilus) {
+ ss << "nautilus OSDs are required to adjust pg_num_pending";
+ return -EPERM;
+ }
+ if (n < (int)p.get_pgp_num()) {
+ ss << "specified pg_num " << n << " < pgp_num " << p.get_pgp_num();
+ return -EINVAL;
+ }
+ if (n < (int)p.get_pg_num() - 1) {
+ ss << "specified pg_num " << n << " < pg_num (" << p.get_pg_num()
+ << ") - 1; only single pg decrease is currently supported";
+ return -EINVAL;
+ }
+ p.set_pg_num_pending(n);
+ // force pre-nautilus clients to resend their ops, since they
+ // don't understand pg_num_pending changes form a new interval
+ p.last_force_op_resend_prenautilus = pending_inc.epoch;
+ }
+ // force pre-luminous clients to resend their ops, since they
+ // don't understand that split PGs now form a new interval.
+ p.last_force_op_resend_preluminous = pending_inc.epoch;
+ } else if (var == "pg_num") {
+ if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
+ ss << "pool pg_num change is disabled; you must unset nopgchange flag for the pool first";
+ return -EPERM;
+ }
+ if (interr.length()) {
+ ss << "error parsing integer value '" << val << "': " << interr;
+ return -EINVAL;
+ }
+ if (n == (int)p.get_pg_num_target()) {
+ return 0;
+ }
+ if (n <= 0 || static_cast<uint64_t>(n) >
+ g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
+ ss << "'pg_num' must be greater than 0 and less than or equal to "
+ << g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
+ << " (you may adjust 'mon max pool pg num' for higher values)";
+ return -ERANGE;
+ }
+ if (n > (int)p.get_pg_num_target()) {
+ int r = check_pg_num(pool, n, p.get_size(), p.get_crush_rule(), &ss);
+ if (r) {
+ return r;
+ }
+ bool force = false;
+ cmd_getval(cmdmap, "yes_i_really_mean_it", force);
+ if (p.cache_mode != pg_pool_t::CACHEMODE_NONE && !force) {
+ ss << "splits in cache pools must be followed by scrubs and leave sufficient free space to avoid overfilling. use --yes-i-really-mean-it to force.";
+ return -EPERM;
+ }
+ } else {
+ if (osdmap.require_osd_release < ceph_release_t::nautilus) {
+ ss << "nautilus OSDs are required to decrease pg_num";
+ return -EPERM;
+ }
+ }
+ int64_t pg_min = 0, pg_max = 0;
+ p.opts.get(pool_opts_t::PG_NUM_MIN, &pg_min);
+ p.opts.get(pool_opts_t::PG_NUM_MAX, &pg_max);
+ if (pg_min && n < pg_min) {
+ ss << "specified pg_num " << n
+ << " < pg_num_min " << pg_min;
+ return -EINVAL;
+ }
+ if (pg_max && n > pg_max) {
+ ss << "specified pg_num " << n
+ << " < pg_num_max " << pg_max;
+ return -EINVAL;
+ }
+ if (osdmap.require_osd_release < ceph_release_t::nautilus) {
+ // pre-nautilus osdmap format; increase pg_num directly
+ assert(n > (int)p.get_pg_num());
+ // force pre-nautilus clients to resend their ops, since they
+ // don't understand pg_num_target changes form a new interval
+ p.last_force_op_resend_prenautilus = pending_inc.epoch;
+ // force pre-luminous clients to resend their ops, since they
+ // don't understand that split PGs now form a new interval.
+ p.last_force_op_resend_preluminous = pending_inc.epoch;
+ p.set_pg_num(n);
+ } else {
+ // set targets; mgr will adjust pg_num_actual and pgp_num later.
+ // make pgp_num track pg_num if it already matches. if it is set
+ // differently, leave it different and let the user control it
+ // manually.
+ if (p.get_pg_num_target() == p.get_pgp_num_target()) {
+ p.set_pgp_num_target(n);
+ }
+ p.set_pg_num_target(n);
+ }
+ } else if (var == "pgp_num_actual") {
+ if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
+ ss << "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
+ return -EPERM;
+ }
+ if (interr.length()) {
+ ss << "error parsing integer value '" << val << "': " << interr;
+ return -EINVAL;
+ }
+ if (n <= 0) {
+ ss << "specified pgp_num must > 0, but you set to " << n;
+ return -EINVAL;
+ }
+ if (n > (int)p.get_pg_num()) {
+ ss << "specified pgp_num " << n << " > pg_num " << p.get_pg_num();
+ return -EINVAL;
+ }
+ if (n > (int)p.get_pg_num_pending()) {
+ ss << "specified pgp_num " << n
+ << " > pg_num_pending " << p.get_pg_num_pending();
+ return -EINVAL;
+ }
+ p.set_pgp_num(n);
+ } else if (var == "pgp_num") {
+ if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
+ ss << "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
+ return -EPERM;
+ }
+ if (interr.length()) {
+ ss << "error parsing integer value '" << val << "': " << interr;
+ return -EINVAL;
+ }
+ if (n <= 0) {
+ ss << "specified pgp_num must > 0, but you set to " << n;
+ return -EINVAL;
+ }
+ if (n > (int)p.get_pg_num_target()) {
+ ss << "specified pgp_num " << n << " > pg_num " << p.get_pg_num_target();
+ return -EINVAL;
+ }
+ if (osdmap.require_osd_release < ceph_release_t::nautilus) {
+ // pre-nautilus osdmap format; increase pgp_num directly
+ p.set_pgp_num(n);
+ } else {
+ p.set_pgp_num_target(n);
+ }
+ } else if (var == "pg_autoscale_mode") {
+ auto m = pg_pool_t::get_pg_autoscale_mode_by_name(val);
+ if (m == pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
+ ss << "specified invalid mode " << val;
+ return -EINVAL;
+ }
+ if (osdmap.require_osd_release < ceph_release_t::nautilus) {
+ ss << "must set require_osd_release to nautilus or later before setting pg_autoscale_mode";
+ return -EINVAL;
+ }
+ p.pg_autoscale_mode = m;
+ } else if (var == "crush_rule") {
+ int id = osdmap.crush->get_rule_id(val);
+ if (id == -ENOENT) {
+ ss << "crush rule " << val << " does not exist";
+ return -ENOENT;
+ }
+ if (id < 0) {
+ ss << cpp_strerror(id);
+ return -ENOENT;
+ }
+ if (!osdmap.crush->rule_valid_for_pool_type(id, p.get_type())) {
+ ss << "crush rule " << id << " type does not match pool";
+ return -EINVAL;
+ }
+ p.crush_rule = id;
+ } else if (var == "nodelete" || var == "nopgchange" ||
+ var == "nosizechange" || var == "write_fadvise_dontneed" ||
+ var == "noscrub" || var == "nodeep-scrub" || var == "bulk") {
+ uint64_t flag = pg_pool_t::get_flag_by_name(var);
+ // make sure we only compare against 'n' if we didn't receive a string
+ if (val == "true" || (interr.empty() && n == 1)) {
+ p.set_flag(flag);
+ } else if (val == "false" || (interr.empty() && n == 0)) {
+ if (flag == pg_pool_t::FLAG_NOPGCHANGE && p.is_crimson()) {
+ ss << "cannot clear FLAG_NOPGCHANGE on a crimson pool";
+ return -EINVAL;
+ }
+ p.unset_flag(flag);
+ } else {
+ ss << "expecting value 'true', 'false', '0', or '1'";
+ return -EINVAL;
+ }
+ } else if (var == "eio") {
+ uint64_t flag = pg_pool_t::get_flag_by_name(var);
+
+ // make sure we only compare against 'n' if we didn't receive a string
+ if (val == "true" || (interr.empty() && n == 1)) {
+ p.set_flag(flag);
+ } else if (val == "false" || (interr.empty() && n == 0)) {
+ p.unset_flag(flag);
+ } else {
+ ss << "expecting value 'true', 'false', '0', or '1'";
+ return -EINVAL;
+ }
+ } else if (var == "hashpspool") {
+ uint64_t flag = pg_pool_t::get_flag_by_name(var);
+ bool force = false;
+ cmd_getval(cmdmap, "yes_i_really_mean_it", force);
+
+ if (!force) {
+ ss << "are you SURE? this will remap all placement groups in this pool,"
+ " this triggers large data movement,"
+ " pass --yes-i-really-mean-it if you really do.";
+ return -EPERM;
+ }
+ // make sure we only compare against 'n' if we didn't receive a string
+ if (val == "true" || (interr.empty() && n == 1)) {
+ p.set_flag(flag);
+ } else if (val == "false" || (interr.empty() && n == 0)) {
+ p.unset_flag(flag);
+ } else {
+ ss << "expecting value 'true', 'false', '0', or '1'";
+ return -EINVAL;
+ }
+ } else if (var == "hit_set_type") {
+ if (val == "none")
+ p.hit_set_params = HitSet::Params();
+ else {
+ int err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
+ if (err)
+ return err;
+ if (val == "bloom") {
+ BloomHitSet::Params *bsp = new BloomHitSet::Params;
+ bsp->set_fpp(g_conf().get_val<double>("osd_pool_default_hit_set_bloom_fpp"));
+ p.hit_set_params = HitSet::Params(bsp);
+ } else if (val == "explicit_hash")
+ p.hit_set_params = HitSet::Params(new ExplicitHashHitSet::Params);
+ else if (val == "explicit_object")
+ p.hit_set_params = HitSet::Params(new ExplicitObjectHitSet::Params);
+ else {
+ ss << "unrecognized hit_set type '" << val << "'";
+ return -EINVAL;
+ }
+ }
+ } else if (var == "hit_set_period") {
+ if (interr.length()) {
+ ss << "error parsing integer value '" << val << "': " << interr;
+ return -EINVAL;
+ } else if (n < 0) {
+ ss << "hit_set_period should be non-negative";
+ return -EINVAL;
+ }
+ p.hit_set_period = n;
+ } else if (var == "hit_set_count") {
+ if (interr.length()) {
+ ss << "error parsing integer value '" << val << "': " << interr;
+ return -EINVAL;
+ } else if (n < 0) {
+ ss << "hit_set_count should be non-negative";
+ return -EINVAL;
+ }
+ p.hit_set_count = n;
+ } else if (var == "hit_set_fpp") {
+ if (floaterr.length()) {
+ ss << "error parsing floating point value '" << val << "': " << floaterr;
+ return -EINVAL;
+ } else if (f < 0 || f > 1.0) {
+ ss << "hit_set_fpp should be in the range 0..1";
+ return -EINVAL;
+ }
+ if (p.hit_set_params.get_type() != HitSet::TYPE_BLOOM) {
+ ss << "hit set is not of type Bloom; invalid to set a false positive rate!";
+ return -EINVAL;
+ }
+ BloomHitSet::Params *bloomp = static_cast<BloomHitSet::Params*>(p.hit_set_params.impl.get());
+ bloomp->set_fpp(f);
+ } else if (var == "use_gmt_hitset") {
+ if (val == "true" || (interr.empty() && n == 1)) {
+ p.use_gmt_hitset = true;
+ } else {
+ ss << "expecting value 'true' or '1'";
+ return -EINVAL;
+ }
+ } else if (var == "allow_ec_overwrites") {
+ if (!p.is_erasure()) {
+ ss << "ec overwrites can only be enabled for an erasure coded pool";
+ return -EINVAL;
+ }
+ stringstream err;
+ if (!g_conf()->mon_debug_no_require_bluestore_for_ec_overwrites &&
+ !is_pool_currently_all_bluestore(pool, p, &err)) {
+ ss << "pool must only be stored on bluestore for scrubbing to work: " << err.str();
+ return -EINVAL;
+ }
+ if (val == "true" || (interr.empty() && n == 1)) {
+ p.flags |= pg_pool_t::FLAG_EC_OVERWRITES;
+ } else if (val == "false" || (interr.empty() && n == 0)) {
+ ss << "ec overwrites cannot be disabled once enabled";
+ return -EINVAL;
+ } else {
+ ss << "expecting value 'true', 'false', '0', or '1'";
+ return -EINVAL;
+ }
+ } else if (var == "target_max_objects") {
+ if (interr.length()) {
+ ss << "error parsing int '" << val << "': " << interr;
+ return -EINVAL;
+ }
+ p.target_max_objects = n;
+ } else if (var == "target_max_bytes") {
+ if (interr.length()) {
+ ss << "error parsing int '" << val << "': " << interr;
+ return -EINVAL;
+ }
+ p.target_max_bytes = n;
+ } else if (var == "cache_target_dirty_ratio") {
+ if (floaterr.length()) {
+ ss << "error parsing float '" << val << "': " << floaterr;
+ return -EINVAL;
+ }
+ if (f < 0 || f > 1.0) {
+ ss << "value must be in the range 0..1";
+ return -ERANGE;
+ }
+ p.cache_target_dirty_ratio_micro = uf;
+ } else if (var == "cache_target_dirty_high_ratio") {
+ if (floaterr.length()) {
+ ss << "error parsing float '" << val << "': " << floaterr;
+ return -EINVAL;
+ }
+ if (f < 0 || f > 1.0) {
+ ss << "value must be in the range 0..1";
+ return -ERANGE;
+ }
+ p.cache_target_dirty_high_ratio_micro = uf;
+ } else if (var == "cache_target_full_ratio") {
+ if (floaterr.length()) {
+ ss << "error parsing float '" << val << "': " << floaterr;
+ return -EINVAL;
+ }
+ if (f < 0 || f > 1.0) {
+ ss << "value must be in the range 0..1";
+ return -ERANGE;
+ }
+ p.cache_target_full_ratio_micro = uf;
+ } else if (var == "cache_min_flush_age") {
+ if (interr.length()) {
+ ss << "error parsing int '" << val << "': " << interr;
+ return -EINVAL;
+ }
+ p.cache_min_flush_age = n;
+ } else if (var == "cache_min_evict_age") {
+ if (interr.length()) {
+ ss << "error parsing int '" << val << "': " << interr;
+ return -EINVAL;
+ }
+ p.cache_min_evict_age = n;
+ } else if (var == "min_read_recency_for_promote") {
+ if (interr.length()) {
+ ss << "error parsing integer value '" << val << "': " << interr;
+ return -EINVAL;
+ }
+ p.min_read_recency_for_promote = n;
+ } else if (var == "hit_set_grade_decay_rate") {
+ if (interr.length()) {
+ ss << "error parsing integer value '" << val << "': " << interr;
+ return -EINVAL;
+ }
+ if (n > 100 || n < 0) {
+ ss << "value out of range,valid range is 0 - 100";
+ return -EINVAL;
+ }
+ p.hit_set_grade_decay_rate = n;
+ } else if (var == "hit_set_search_last_n") {
+ if (interr.length()) {
+ ss << "error parsing integer value '" << val << "': " << interr;
+ return -EINVAL;
+ }
+ if (n > p.hit_set_count || n < 0) {
+ ss << "value out of range,valid range is 0 - hit_set_count";
+ return -EINVAL;
+ }
+ p.hit_set_search_last_n = n;
+ } else if (var == "min_write_recency_for_promote") {
+ if (interr.length()) {
+ ss << "error parsing integer value '" << val << "': " << interr;
+ return -EINVAL;
+ }
+ p.min_write_recency_for_promote = n;
+ } else if (var == "fast_read") {
+ if (p.is_replicated()) {
+ ss << "fast read is not supported in replication pool";
+ return -EINVAL;
+ }
+ if (val == "true" || (interr.empty() && n == 1)) {
+ p.fast_read = true;
+ } else if (val == "false" || (interr.empty() && n == 0)) {
+ p.fast_read = false;
+ } else {
+ ss << "expecting value 'true', 'false', '0', or '1'";
+ return -EINVAL;
+ }
+ } else if (pool_opts_t::is_opt_name(var)) {
+ bool unset = val == "unset";
+ if (var == "compression_mode") {
+ if (!unset) {
+ auto cmode = Compressor::get_comp_mode_type(val);
+ if (!cmode) {
+ ss << "unrecognized compression mode '" << val << "'";
+ return -EINVAL;
+ }
+ }
+ } else if (var == "compression_algorithm") {
+ if (!unset) {
+ auto alg = Compressor::get_comp_alg_type(val);
+ if (!alg) {
+ ss << "unrecognized compression_algorithm '" << val << "'";
+ return -EINVAL;
+ }
+ }
+ } else if (var == "compression_required_ratio") {
+ if (floaterr.length()) {
+ ss << "error parsing float value '" << val << "': " << floaterr;
+ return -EINVAL;
+ }
+ if (f < 0 || f > 1) {
+ ss << "compression_required_ratio is out of range (0-1): '" << val << "'";
+ return -EINVAL;
+ }
+ } else if (var == "csum_type") {
+ auto t = unset ? 0 : Checksummer::get_csum_string_type(val);
+ if (t < 0 ) {
+ ss << "unrecognized csum_type '" << val << "'";
+ return -EINVAL;
+ }
+ //preserve csum_type numeric value
+ n = t;
+ interr.clear();
+ } else if (var == "compression_max_blob_size" ||
+ var == "compression_min_blob_size" ||
+ var == "csum_max_block" ||
+ var == "csum_min_block") {
+ if (interr.length()) {
+ ss << "error parsing int value '" << val << "': " << interr;
+ return -EINVAL;
+ }
+ } else if (var == "fingerprint_algorithm") {
+ if (!unset) {
+ auto alg = pg_pool_t::get_fingerprint_from_str(val);
+ if (!alg) {
+ ss << "unrecognized fingerprint_algorithm '" << val << "'";
+ return -EINVAL;
+ }
+ }
+ } else if (var == "target_size_bytes") {
+ if (interr.length()) {
+ ss << "error parsing unit value '" << val << "': " << interr;
+ return -EINVAL;
+ }
+ if (osdmap.require_osd_release < ceph_release_t::nautilus) {
+ ss << "must set require_osd_release to nautilus or "
+ << "later before setting target_size_bytes";
+ return -EINVAL;
+ }
+ } else if (var == "target_size_ratio") {
+ if (f < 0.0) {
+ ss << "target_size_ratio cannot be negative";
+ return -EINVAL;
+ }
+ } else if (var == "pg_num_min") {
+ if (interr.length()) {
+ ss << "error parsing int value '" << val << "': " << interr;
+ return -EINVAL;
+ }
+ if (n > (int)p.get_pg_num_target()) {
+ ss << "specified pg_num_min " << n
+ << " > pg_num " << p.get_pg_num_target();
+ return -EINVAL;
+ }
+ } else if (var == "pg_num_max") {
+ if (interr.length()) {
+ ss << "error parsing int value '" << val << "': " << interr;
+ return -EINVAL;
+ }
+ if (n && n < (int)p.get_pg_num_target()) {
+ ss << "specified pg_num_max " << n
+ << " < pg_num " << p.get_pg_num_target();
+ return -EINVAL;
+ }
+ } else if (var == "recovery_priority") {
+ if (interr.length()) {
+ ss << "error parsing int value '" << val << "': " << interr;
+ return -EINVAL;
+ }
+ if (!g_conf()->debug_allow_any_pool_priority) {
+ if (n > OSD_POOL_PRIORITY_MAX || n < OSD_POOL_PRIORITY_MIN) {
+ ss << "pool recovery_priority must be between " << OSD_POOL_PRIORITY_MIN
+ << " and " << OSD_POOL_PRIORITY_MAX;
+ return -EINVAL;
+ }
+ }
+ } else if (var == "pg_autoscale_bias") {
+ if (f < 0.0 || f > 1000.0) {
+ ss << "pg_autoscale_bias must be between 0 and 1000";
+ return -EINVAL;
+ }
+ } else if (var == "dedup_tier") {
+ if (interr.empty()) {
+ ss << "expecting value 'pool name'";
+ return -EINVAL;
+ }
+ // Current base tier in dedup does not support ec pool
+ if (p.is_erasure()) {
+ ss << "pool '" << poolstr
+ << "' is an ec pool, which cannot be a base tier";
+ return -ENOTSUP;
+ }
+ int64_t lowtierpool_id = osdmap.lookup_pg_pool_name(val);
+ if (lowtierpool_id < 0) {
+ ss << "unrecognized pool '" << val << "'";
+ return -ENOENT;
+ }
+ const pg_pool_t *tp = osdmap.get_pg_pool(lowtierpool_id);
+ ceph_assert(tp);
+ n = lowtierpool_id;
+ // The original input is string (pool name), but we convert it to int64_t.
+ // So, clear interr
+ interr.clear();
+ } else if (var == "dedup_chunk_algorithm") {
+ if (!unset) {
+ auto alg = pg_pool_t::get_dedup_chunk_algorithm_from_str(val);
+ if (!alg) {
+ ss << "unrecognized fingerprint_algorithm '" << val << "'";
+ return -EINVAL;
+ }
+ }
+ } else if (var == "dedup_cdc_chunk_size") {
+ if (interr.length()) {
+ ss << "error parsing int value '" << val << "': " << interr;
+ return -EINVAL;
+ }
+ } else if (var == "read_ratio") {
+ if (interr.length()) {
+ ss << "error parsing int value '" << val << "': " << interr;
+ return -EINVAL;
+ }
+ if (n < 0 || n > 100) {
+ ss << "read_ratio must be between 0 and 100";
+ return -ERANGE;
+ }
+ }
+
+ pool_opts_t::opt_desc_t desc = pool_opts_t::get_opt_desc(var);
+ switch (desc.type) {
+ case pool_opts_t::STR:
+ if (unset) {
+ p.opts.unset(desc.key);
+ } else {
+ p.opts.set(desc.key, static_cast<std::string>(val));
+ }
+ break;
+ case pool_opts_t::INT:
+ if (interr.length()) {
+ ss << "error parsing integer value '" << val << "': " << interr;
+ return -EINVAL;
+ }
+ if (n == 0) {
+ p.opts.unset(desc.key);
+ } else {
+ p.opts.set(desc.key, static_cast<int64_t>(n));
+ }
+ break;
+ case pool_opts_t::DOUBLE:
+ if (floaterr.length()) {
+ ss << "error parsing floating point value '" << val << "': " << floaterr;
+ return -EINVAL;
+ }
+ if (f == 0) {
+ p.opts.unset(desc.key);
+ } else {
+ p.opts.set(desc.key, static_cast<double>(f));
+ }
+ break;
+ default:
+ ceph_assert(!"unknown type");
+ }
+ } else {
+ ss << "unrecognized variable '" << var << "'";
+ return -EINVAL;
+ }
+ if (val != "unset") {
+ ss << "set pool " << pool << " " << var << " to " << val;
+ } else {
+ ss << "unset pool " << pool << " " << var;
+ }
+ p.last_change = pending_inc.epoch;
+ pending_inc.new_pools[pool] = p;
+ return 0;
+}
+
+ /**
+  * Prepare-phase entry point for the pool application tag commands.
+  *
+  * Forwards to _command_pool_application() in preparing mode; no "modified"
+  * output pointer is supplied.
+  */
+ int OSDMonitor::prepare_command_pool_application(const string &prefix,
+                                                  const cmdmap_t& cmdmap,
+                                                  stringstream& ss)
+ {
+   constexpr bool preparing = true;
+   bool *const modified = nullptr;
+   return _command_pool_application(prefix, cmdmap, ss, modified, preparing);
+ }
+
+ /**
+  * Preprocess-phase entry point for the pool application tag commands.
+  *
+  * Forwards to _command_pool_application() in non-preparing mode and hands
+  * the caller's @p modified pointer through unchanged.
+  */
+ int OSDMonitor::preprocess_command_pool_application(const string &prefix,
+                                                     const cmdmap_t& cmdmap,
+                                                     stringstream& ss,
+                                                     bool *modified)
+ {
+   constexpr bool preparing = false;
+   return _command_pool_application(prefix, cmdmap, ss, modified, preparing);
+ }
+
+
+/**
+ * Common logic for preprocess and prepare phases of pool application
+ * tag commands. In preprocess mode we're only detecting invalid
+ * commands, and determining whether it was a modification or a no-op.
+ * In prepare mode we're actually updating the pending state.
+ */
+int OSDMonitor::_command_pool_application(const string &prefix,
+ const cmdmap_t& cmdmap,
+ stringstream& ss,
+ bool *modified,
+ bool preparing)
+{
+ string pool_name;
+ cmd_getval(cmdmap, "pool", pool_name);
+ int64_t pool = osdmap.lookup_pg_pool_name(pool_name.c_str());
+ if (pool < 0) {
+ ss << "unrecognized pool '" << pool_name << "'";
+ return -ENOENT;
+ }
+
+ pg_pool_t p = *osdmap.get_pg_pool(pool);
+ if (preparing) {
+ if (pending_inc.new_pools.count(pool)) {
+ p = pending_inc.new_pools[pool];
+ }
+ }
+
+ string app;
+ cmd_getval(cmdmap, "app", app);
+ bool app_exists = (p.application_metadata.count(app) > 0);
+
+ string key;
+ cmd_getval(cmdmap, "key", key);
+ if (key == "all") {
+ ss << "key cannot be 'all'";
+ return -EINVAL;
+ }
+
+ string value;
+ cmd_getval(cmdmap, "value", value);
+ if (value == "all") {
+ ss << "value cannot be 'all'";
+ return -EINVAL;
+ }
+
+ if (boost::algorithm::ends_with(prefix, "enable")) {
+ if (app.empty()) {
+ ss << "application name must be provided";
+ return -EINVAL;
+ }
+
+ if (p.is_tier()) {
+ ss << "application must be enabled on base tier";
+ return -EINVAL;
+ }
+
+ bool force = false;
+ cmd_getval(cmdmap, "yes_i_really_mean_it", force);
+
+ if (!app_exists && !p.application_metadata.empty() && !force) {
+ ss << "Are you SURE? Pool '" << pool_name << "' already has an enabled "
+ << "application; pass --yes-i-really-mean-it to proceed anyway";
+ return -EPERM;
+ }
+
+ if (!app_exists && p.application_metadata.size() >= MAX_POOL_APPLICATIONS) {
+ ss << "too many enabled applications on pool '" << pool_name << "'; "
+ << "max " << MAX_POOL_APPLICATIONS;
+ return -EINVAL;
+ }
+
+ if (app.length() > MAX_POOL_APPLICATION_LENGTH) {
+ ss << "application name '" << app << "' too long; max length "
+ << MAX_POOL_APPLICATION_LENGTH;
+ return -EINVAL;
+ }
+
+ if (!app_exists) {
+ p.application_metadata[app] = {};
+ }
+ ss << "enabled application '" << app << "' on pool '" << pool_name << "'";
+
+ } else if (boost::algorithm::ends_with(prefix, "disable")) {
+ bool force = false;
+ cmd_getval(cmdmap, "yes_i_really_mean_it", force);
+
+ if (!force) {
+ ss << "Are you SURE? Disabling an application within a pool might result "
+ << "in loss of application functionality; pass "
+ << "--yes-i-really-mean-it to proceed anyway";
+ return -EPERM;
+ }
+
+ if (!app_exists) {
+ ss << "application '" << app << "' is not enabled on pool '" << pool_name
+ << "'";
+ return 0; // idempotent
+ }
+
+ p.application_metadata.erase(app);
+ ss << "disable application '" << app << "' on pool '" << pool_name << "'";
+
+ } else if (boost::algorithm::ends_with(prefix, "set")) {
+ if (p.is_tier()) {
+ ss << "application metadata must be set on base tier";
+ return -EINVAL;
+ }
+
+ if (!app_exists) {
+ ss << "application '" << app << "' is not enabled on pool '" << pool_name
+ << "'";
+ return -ENOENT;
+ }
+
+ string key;
+ cmd_getval(cmdmap, "key", key);
+
+ if (key.empty()) {
+ ss << "key must be provided";
+ return -EINVAL;
+ }
+
+ auto &app_keys = p.application_metadata[app];
+ if (app_keys.count(key) == 0 &&
+ app_keys.size() >= MAX_POOL_APPLICATION_KEYS) {
+ ss << "too many keys set for application '" << app << "' on pool '"
+ << pool_name << "'; max " << MAX_POOL_APPLICATION_KEYS;
+ return -EINVAL;
+ }
+
+ if (key.length() > MAX_POOL_APPLICATION_LENGTH) {
+ ss << "key '" << app << "' too long; max length "
+ << MAX_POOL_APPLICATION_LENGTH;
+ return -EINVAL;
+ }
+
+ string value;
+ cmd_getval(cmdmap, "value", value);
+ if (value.length() > MAX_POOL_APPLICATION_LENGTH) {
+ ss << "value '" << value << "' too long; max length "
+ << MAX_POOL_APPLICATION_LENGTH;
+ return -EINVAL;
+ }
+
+ p.application_metadata[app][key] = value;
+ ss << "set application '" << app << "' key '" << key << "' to '"
+ << value << "' on pool '" << pool_name << "'";
+ } else if (boost::algorithm::ends_with(prefix, "rm")) {
+ if (!app_exists) {
+ ss << "application '" << app << "' is not enabled on pool '" << pool_name
+ << "'";
+ return -ENOENT;
+ }
+
+ string key;
+ cmd_getval(cmdmap, "key", key);
+ auto it = p.application_metadata[app].find(key);
+ if (it == p.application_metadata[app].end()) {
+ ss << "application '" << app << "' on pool '" << pool_name
+ << "' does not have key '" << key << "'";
+ return 0; // idempotent
+ }
+
+ p.application_metadata[app].erase(it);
+ ss << "removed application '" << app << "' key '" << key << "' on pool '"
+ << pool_name << "'";
+ } else {
+ ceph_abort();
+ }
+
+ if (preparing) {
+ p.last_change = pending_inc.epoch;
+ pending_inc.new_pools[pool] = p;
+ }
+
+ // Because we fell through this far, we didn't hit no-op cases,
+ // so pool was definitely modified
+ if (modified != nullptr) {
+ *modified = true;
+ }
+
+ return 0;
+}
+
+ /**
+  * Remove item @p id from the supplied crush map; the pending incremental
+  * is not touched here.
+  *
+  * @param newcrush     crush map to modify in place
+  * @param id           crush item to remove
+  * @param ancestor     bucket to remove the item from; only consulted when
+  *                     @p has_ancestor is true
+  * @param has_ancestor restrict the removal to @p ancestor
+  * @param unlink_only  forwarded unchanged to the CrushWrapper call
+  * @return the value returned by the CrushWrapper removal call
+  */
+ int OSDMonitor::_prepare_command_osd_crush_remove(
+   CrushWrapper &newcrush,
+   int32_t id,
+   int32_t ancestor,
+   bool has_ancestor,
+   bool unlink_only)
+ {
+   if (!has_ancestor) {
+     return newcrush.remove_item(cct, id, unlink_only);
+   }
+   return newcrush.remove_item_under(cct, id, ancestor, unlink_only);
+ }
+
+ /**
+  * Stage @p newcrush as the pending crush map: drop whatever crush blob
+  * pending_inc currently carries and encode the given map into it using the
+  * quorum's connection features.
+  */
+ void OSDMonitor::do_osd_crush_remove(CrushWrapper& newcrush)
+ {
+   auto& staged_crush = pending_inc.crush;
+   staged_crush.clear();
+   newcrush.encode(staged_crush, mon.get_quorum_con_features());
+ }
+
+ /**
+  * Remove an item from @p newcrush and, if that worked, stage the modified
+  * map in the pending incremental.
+  *
+  * @return 0 on success; the negative error from the removal step otherwise,
+  *         in which case nothing is staged
+  */
+ int OSDMonitor::prepare_command_osd_crush_remove(
+   CrushWrapper &newcrush,
+   int32_t id,
+   int32_t ancestor,
+   bool has_ancestor,
+   bool unlink_only)
+ {
+   const int r = _prepare_command_osd_crush_remove(
+     newcrush, id, ancestor, has_ancestor, unlink_only);
+   if (r < 0) {
+     return r;
+   }
+
+   // the removal step is only expected to yield 0 or a negative errno
+   ceph_assert(r == 0);
+   do_osd_crush_remove(newcrush);
+   return 0;
+ }
+
+ /**
+  * Stage the removal of osd @p id in the pending state.
+  *
+  * @return -EBUSY if the committed map reports the osd as up, 0 otherwise
+  */
+ int OSDMonitor::prepare_command_osd_remove(int32_t id)
+ {
+   if (osdmap.is_up(id))
+     return -EBUSY;
+
+   // queue the id in pending_metadata_rm and drop any pending_metadata entry
+   pending_metadata_rm.insert(id);
+   pending_metadata.erase(id);
+
+   // NOTE(review): new_state looks like it is applied as a toggle mask, so
+   // staging the current state would clear every bit -- confirm against
+   // OSDMap::Incremental
+   pending_inc.new_state[id] = osdmap.get_state(id);
+   pending_inc.new_uuid[id] = uuid_d();
+   return 0;
+ }
+
+ /**
+  * Pick an id for a new osd.
+  *
+  * Scans [0, max_osd) for the first id that does not exist in the committed
+  * map, has no pending new_up_client entry, and whose pending new_state (if
+  * any) does not carry CEPH_OSD_EXISTS.
+  *
+  * @param existing_id out; the reusable id found by the scan, or -1
+  * @return -1 when a reusable id was stored in *existing_id; otherwise the
+  *         next id past the current (pending, if set) max_osd
+  */
+ int32_t OSDMonitor::_allocate_osd_id(int32_t* existing_id)
+ {
+   ceph_assert(existing_id);
+   *existing_id = -1;
+
+   for (int32_t candidate = 0; candidate < osdmap.get_max_osd(); ++candidate) {
+     if (osdmap.exists(candidate)) {
+       continue;
+     }
+     if (pending_inc.new_up_client.count(candidate) != 0) {
+       continue;
+     }
+     auto staged = pending_inc.new_state.find(candidate);
+     if (staged != pending_inc.new_state.end() &&
+         (staged->second & CEPH_OSD_EXISTS) != 0) {
+       continue;
+     }
+     *existing_id = candidate;
+     return -1;
+   }
+
+   // no hole to reuse: the new id is the current upper bound
+   return pending_inc.new_max_osd < 0 ? osdmap.get_max_osd()
+                                      : pending_inc.new_max_osd;
+ }
+
+ /**
+  * Stage the creation of an osd in the pending incremental.
+  *
+  * Validation is assumed to have happened already; inconsistencies trip
+  * assertions rather than returning errors.
+  *
+  * @param id           requested id, or negative to let one be chosen
+  * @param uuid         osd uuid; may be zero
+  * @param device_class crush device class to assign; ignored when empty
+  * @param new_id       out; the id that was staged
+  */
+ void OSDMonitor::do_osd_create(
+   const int32_t id,
+   const uuid_d& uuid,
+   const string& device_class,
+   int32_t* new_id)
+ {
+   dout(10) << __func__ << " uuid " << uuid << dendl;
+   ceph_assert(new_id);
+
+   // Step 1: settle on the id. A non-zero uuid that is already known wins;
+   // failing that an explicitly requested id is taken as is.
+   bool id_chosen = false;
+   int32_t existing_id = -1;
+   if (!uuid.is_zero()) {
+     existing_id = osdmap.identify_osd(uuid);
+     if (existing_id >= 0) {
+       ceph_assert(id < 0 || id == existing_id);
+       *new_id = existing_id;
+       id_chosen = true;
+     } else if (id >= 0) {
+       // uuid does not exist, and id has been provided, so just create
+       // the new osd.id
+       *new_id = id;
+       id_chosen = true;
+     }
+   }
+
+   if (!id_chosen) {
+     // allocate a new id: either a free slot below max_osd, or one past it
+     const int32_t allocated_id = _allocate_osd_id(&existing_id);
+     dout(10) << __func__ << " allocated id " << allocated_id
+              << " existing id " << existing_id << dendl;
+     if (existing_id >= 0) {
+       ceph_assert(existing_id < osdmap.get_max_osd());
+       ceph_assert(allocated_id < 0);
+       *new_id = existing_id;
+     } else if (allocated_id >= 0) {
+       ceph_assert(existing_id < 0);
+       // raise max_osd
+       if (pending_inc.new_max_osd < 0) {
+         pending_inc.new_max_osd = osdmap.get_max_osd() + 1;
+       } else {
+         ++pending_inc.new_max_osd;
+       }
+       *new_id = pending_inc.new_max_osd - 1;
+       ceph_assert(*new_id == allocated_id);
+     } else {
+       ceph_abort_msg("unexpected condition");
+     }
+   }
+
+   // Step 2: optionally record the device class in the pending crush map.
+   if (device_class.empty()) {
+     dout(20) << __func__ << " no device_class" << dendl;
+   } else {
+     CrushWrapper newcrush = _get_pending_crush();
+     if (newcrush.get_max_devices() < *new_id + 1) {
+       newcrush.set_max_devices(*new_id + 1);
+     }
+     string name = string("osd.") + stringify(*new_id);
+     if (!newcrush.item_exists(*new_id)) {
+       newcrush.set_item_name(*new_id, name);
+     }
+     ostringstream ss;
+     int r = newcrush.update_device_class(*new_id, device_class, name, &ss);
+     if (r >= 0) {
+       dout(20) << __func__ << " set " << name << " device_class " << device_class
+                << dendl;
+       pending_inc.crush.clear();
+       newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+     } else {
+       derr << __func__ << " failed to set " << name << " device_class "
+            << device_class << ": " << cpp_strerror(r) << " - " << ss.str()
+            << dendl;
+       // non-fatal... this might be a replay and we want to be idempotent.
+     }
+   }
+
+   // Step 3: stage the osd itself.
+   dout(10) << __func__ << " using id " << *new_id << dendl;
+   if (osdmap.get_max_osd() <= *new_id && pending_inc.new_max_osd <= *new_id) {
+     pending_inc.new_max_osd = *new_id + 1;
+   }
+
+   pending_inc.new_weight[*new_id] = CEPH_OSD_IN;
+   // do not set EXISTS; OSDMap::set_weight, called by apply_incremental, will
+   // set it for us. (ugh.)
+   pending_inc.new_state[*new_id] |= CEPH_OSD_NEW;
+   if (!uuid.is_zero()) {
+     pending_inc.new_uuid[*new_id] = uuid;
+   }
+ }
+
+ /**
+  * Check whether an osd with the given id/uuid may be created.
+  *
+  * Shared by `osd create` (uuid optional, for legacy reasons) and `osd new`
+  * (uuid required). Without a uuid there is nothing to validate, which is
+  * why `osd create` without a uuid is not idempotent.
+  *
+  * @param id               requested id, or negative if none
+  * @param uuid             osd uuid; zero means "not provided"
+  * @param check_osd_exists when true, reject an id that already exists
+  * @param existing_id      out; set to the osd already holding @p uuid when
+  *                         EEXIST is returned, and to -1 when neither id nor
+  *                         uuid was given; left untouched otherwise
+  * @param ss               receives error text
+  * @return 0 if creation may proceed; positive EEXIST if the uuid already
+  *         maps to a compatible osd (idempotent case); -EAGAIN if a pending
+  *         change already covers the uuid or id; -EEXIST / -EINVAL on conflict
+  */
+ int OSDMonitor::validate_osd_create(
+   const int32_t id,
+   const uuid_d& uuid,
+   const bool check_osd_exists,
+   int32_t* existing_id,
+   stringstream& ss)
+ {
+   dout(10) << __func__ << " id " << id << " uuid " << uuid
+            << " check_osd_exists " << check_osd_exists << dendl;
+
+   ceph_assert(existing_id);
+
+   if (uuid.is_zero()) {
+     // no uuid: nothing to validate. An id given on its own is ignored,
+     // because that's what `osd create` does.
+     if (id < 0) {
+       *existing_id = -1;
+     }
+     return 0;
+   }
+
+   ceph_assert(!uuid.is_zero());
+   if (pending_inc.identify_osd(uuid) >= 0) {
+     // osd is about to exist
+     return -EAGAIN;
+   }
+
+   const int32_t known_id = osdmap.identify_osd(uuid);
+   if (known_id >= 0) {
+     // osd already exists
+     if (id >= 0 && known_id != id) {
+       ss << "uuid " << uuid << " already in use for different id " << known_id;
+       return -EEXIST;
+     }
+     // a positive errno tells a harmless, idempotent repeat apart from a
+     // blocking error
+     *existing_id = known_id;
+     return EEXIST;
+   }
+
+   // the uuid is unknown from here on
+   if (id < 0) {
+     return 0;
+   }
+   if (pending_inc.new_state.count(id)) {
+     // osd is about to exist
+     return -EAGAIN;
+   }
+   // we may not care if an osd exists if we are recreating a previously
+   // destroyed osd.
+   if (check_osd_exists && osdmap.exists(id)) {
+     ss << "id " << id << " already in use and does not match uuid "
+        << uuid;
+     return -EINVAL;
+   }
+   return 0;
+ }
+
+ /**
+  * Validation step for the legacy `osd create` command.
+  *
+  * Refuses to operate on an id that is marked destroyed, then defers to
+  * validate_osd_create() with the existence check enabled.
+  *
+  * @return -EINVAL for a destroyed id; otherwise the result of
+  *         validate_osd_create()
+  */
+ int OSDMonitor::prepare_command_osd_create(
+   const int32_t id,
+   const uuid_d& uuid,
+   int32_t* existing_id,
+   stringstream& ss)
+ {
+   dout(10) << __func__ << " id " << id << " uuid " << uuid << dendl;
+   ceph_assert(existing_id);
+
+   const bool destroyed = osdmap.is_destroyed(id);
+   if (destroyed) {
+     ss << "ceph osd create has been deprecated. Please use ceph osd new "
+           "instead.";
+     return -EINVAL;
+   }
+
+   if (uuid.is_zero()) {
+     dout(10) << __func__ << " no uuid; assuming legacy `osd create`" << dendl;
+   }
+
+   constexpr bool check_osd_exists = true;
+   return validate_osd_create(id, uuid, check_osd_exists, existing_id, ss);
+ }
+
+ /**
+  * Handle `osd new`.
+  *
+  * A uuid is mandatory, an id is optional:
+  *  - without an id, an existing osd is looked up by uuid; if none exists an
+  *    id is found or allocated;
+  *  - with an id, the uuid must either be unknown or belong to that id; a
+  *    previously destroyed id is reused.
+  * Secrets (passed in @p params) are optional. When the osd already exists
+  * the operation is idempotent iff the supplied secrets match.
+  *
+  * @param op     originating request (only logged here)
+  * @param cmdmap parsed command arguments; reads "uuid" and "id"
+  * @param params optional "crush_device_class", "cephx_secret",
+  *               "cephx_lockbox_secret" and "dmcrypt_key"
+  * @param ss     receives error text, or the osd id when @p f is null
+  * @param f      optional formatter for the resulting osd id
+  * @return 0 when changes were staged, positive EEXIST for an idempotent
+  *         no-op, negative errno on failure
+  */
+ int OSDMonitor::prepare_command_osd_new(
+   MonOpRequestRef op,
+   const cmdmap_t& cmdmap,
+   const map<string,string>& params,
+   stringstream &ss,
+   Formatter *f)
+ {
+   ceph_assert(paxos.is_plugged());
+
+   dout(10) << __func__ << " " << op << dendl;
+
+   /* validate command. abort now if something's wrong. */
+
+   string uuidstr;
+   uuid_d uuid;
+   if (!cmd_getval(cmdmap, "uuid", uuidstr)) {
+     ss << "requires the OSD's UUID to be specified.";
+     return -EINVAL;
+   }
+   if (!uuid.parse(uuidstr.c_str())) {
+     ss << "invalid UUID value '" << uuidstr << "'.";
+     return -EINVAL;
+   }
+
+   int64_t id = -1;
+   if (cmd_getval(cmdmap, "id", id) && id < 0) {
+     ss << "invalid OSD id; must be greater or equal than zero.";
+     return -EINVAL;
+   }
+
+   // reports the final osd id, through the formatter when there is one and
+   // as plain text otherwise; reads `id` at the time of the call
+   const auto report_osd_id = [&]() {
+     if (f) {
+       f->open_object_section("created_osd");
+       f->dump_int("osdid", id);
+       f->close_section();
+     } else {
+       ss << id;
+     }
+   };
+
+   // are we running an `osd create`-like command, or recreating
+   // a previously destroyed osd?
+   const bool is_recreate_destroyed = (id >= 0 && osdmap.is_destroyed(id));
+
+   // the id matters both to tell whether the osd is destroyed and to create
+   // a new one; one is needed by the time we reach auth.
+   int32_t existing_id = -1;
+   int err = validate_osd_create(id, uuid, !is_recreate_destroyed,
+                                 &existing_id, ss);
+
+   bool may_be_idempotent = false;
+   if (err == EEXIST) {
+     // this is idempotent from the osdmon's point-of-view
+     may_be_idempotent = true;
+     ceph_assert(existing_id >= 0);
+     id = existing_id;
+   } else if (err < 0) {
+     return err;
+   }
+
+   if (may_be_idempotent) {
+     ceph_assert(id >= 0);
+     ceph_assert(osdmap.exists(id));
+   } else if (id < 0) {
+     // Not idempotent and no id given: find one, either a free slot or the
+     // next id past max_osd.
+     id = _allocate_osd_id(&existing_id);
+     if (id < 0) {
+       ceph_assert(existing_id >= 0);
+       id = existing_id;
+     }
+     dout(10) << __func__ << " found id " << id << " to use" << dendl;
+   } else if (osdmap.exists(id) && !osdmap.is_destroyed(id)) {
+     // An explicit id must be honoured for compatibility with `osd create`,
+     // but not if it names a live osd.
+     dout(10) << __func__ << " osd." << id << " isn't destroyed" << dendl;
+     ss << "OSD " << id << " has not yet been destroyed";
+     return -EINVAL;
+   } else if (osdmap.is_destroyed(id)) {
+     dout(10) << __func__ << " recreating osd." << id << dendl;
+   } else {
+     dout(10) << __func__ << " creating new osd." << id << dendl;
+   }
+
+   // we are now able to either create a brand new osd or reuse an existing
+   // osd that has been previously destroyed.
+
+   dout(10) << __func__ << " id " << id << " uuid " << uuid << dendl;
+
+   if (may_be_idempotent && params.empty()) {
+     // nothing to do, really.
+     dout(10) << __func__ << " idempotent and no params -- no op." << dendl;
+     ceph_assert(id >= 0);
+     report_osd_id();
+     return EEXIST;
+   }
+
+   string device_class;
+   if (auto dc = params.find("crush_device_class"); dc != params.end()) {
+     device_class = dc->second;
+     dout(20) << __func__ << " device_class will be " << device_class << dendl;
+   }
+
+   const bool has_cephx_secret = (params.count("cephx_secret") > 0);
+   const bool has_lockbox_secret = (params.count("cephx_lockbox_secret") > 0);
+   const bool has_dmcrypt_key = (params.count("dmcrypt_key") > 0);
+   const bool has_secrets =
+     has_cephx_secret || has_lockbox_secret || has_dmcrypt_key;
+
+   string cephx_secret, lockbox_secret, dmcrypt_key;
+   bool has_lockbox = false;
+   KVMonitor *svc = nullptr;
+   AuthMonitor::auth_entity_t cephx_entity, lockbox_entity;
+
+   if (has_secrets) {
+     if (!has_cephx_secret) {
+       ss << "requires a cephx secret.";
+       return -EINVAL;
+     }
+     cephx_secret = params.at("cephx_secret");
+
+     dout(10) << __func__ << " has lockbox " << has_lockbox_secret
+              << " dmcrypt " << has_dmcrypt_key << dendl;
+
+     // lockbox secret and dm-crypt key only make sense as a pair
+     if (has_lockbox_secret != has_dmcrypt_key) {
+       ss << "requires both a cephx lockbox secret and a dm-crypt key.";
+       return -EINVAL;
+     }
+     if (has_lockbox_secret) {
+       has_lockbox = true;
+       lockbox_secret = params.at("cephx_lockbox_secret");
+       dmcrypt_key = params.at("dmcrypt_key");
+     }
+
+     dout(10) << __func__ << " validate secrets using osd id " << id << dendl;
+
+     err = mon.authmon()->validate_osd_new(id, uuid,
+                                           cephx_secret,
+                                           lockbox_secret,
+                                           cephx_entity,
+                                           lockbox_entity,
+                                           ss);
+     if (err < 0) {
+       return err;
+     }
+     if (may_be_idempotent && err != EEXIST) {
+       // for this to be idempotent, `id` should already be >= 0; no need
+       // to use validate_id.
+       ceph_assert(id >= 0);
+       ss << "osd." << id << " exists but secrets do not match";
+       return -EEXIST;
+     }
+
+     if (has_lockbox) {
+       svc = mon.kvmon();
+       err = svc->validate_osd_new(uuid, dmcrypt_key, ss);
+       if (err < 0) {
+         return err;
+       }
+       if (may_be_idempotent && err != EEXIST) {
+         ceph_assert(id >= 0);
+         ss << "osd." << id << " exists but dm-crypt key does not match.";
+         return -EEXIST;
+       }
+     }
+   }
+   ceph_assert(!has_secrets || !cephx_secret.empty());
+   ceph_assert(!has_lockbox || !lockbox_secret.empty());
+
+   if (may_be_idempotent) {
+     // neither the osdmon nor the authmon has anything to do and the config
+     // key service is left alone, so this is an idempotent operation and we
+     // can return right away.
+     dout(10) << __func__ << " idempotent -- no op." << dendl;
+     ceph_assert(id >= 0);
+     report_osd_id();
+     return EEXIST;
+   }
+   ceph_assert(!may_be_idempotent);
+
+   // perform updates.
+   if (has_secrets) {
+     ceph_assert(!cephx_secret.empty());
+     ceph_assert((lockbox_secret.empty() && dmcrypt_key.empty()) ||
+                 (!lockbox_secret.empty() && !dmcrypt_key.empty()));
+
+     err = mon.authmon()->do_osd_new(cephx_entity,
+                                     lockbox_entity,
+                                     has_lockbox);
+     ceph_assert(0 == err);
+
+     if (has_lockbox) {
+       ceph_assert(nullptr != svc);
+       svc->do_osd_new(uuid, dmcrypt_key);
+     }
+   }
+
+   if (!is_recreate_destroyed) {
+     ceph_assert(id >= 0);
+     int32_t new_id = -1;
+     do_osd_create(id, uuid, device_class, &new_id);
+     ceph_assert(new_id >= 0);
+     ceph_assert(id == new_id);
+   } else {
+     ceph_assert(id >= 0);
+     ceph_assert(osdmap.is_destroyed(id));
+     pending_inc.new_state[id] |= CEPH_OSD_DESTROYED;
+     if ((osdmap.get_state(id) & CEPH_OSD_NEW) == 0) {
+       pending_inc.new_state[id] |= CEPH_OSD_NEW;
+     }
+     if (osdmap.get_state(id) & CEPH_OSD_UP) {
+       // due to http://tracker.ceph.com/issues/20751 some clusters may
+       // have UP set for non-existent OSDs; make sure it is cleared
+       // for a newly created osd.
+       pending_inc.new_state[id] |= CEPH_OSD_UP;
+     }
+     pending_inc.new_uuid[id] = uuid;
+   }
+
+   report_osd_id();
+   return 0;
+ }
+
+ /**
+  * Entry point for osd commands in the prepare phase.
+  *
+  * Decodes the JSON command into a cmdmap and checks that the request has a
+  * session before handing over to prepare_command_impl(). Either failure is
+  * answered directly and returns false.
+  */
+ bool OSDMonitor::prepare_command(MonOpRequestRef op)
+ {
+   op->mark_osdmon_event(__func__);
+
+   auto m = op->get_req<MMonCommand>();
+   cmdmap_t cmdmap;
+   stringstream ss;
+   if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
+     // the parser left its diagnostic in ss; pass it back verbatim
+     mon.reply_command(op, -EINVAL, ss.str(), get_last_committed());
+     return false; /* nothing to propose */
+   }
+
+   if (!op->get_session()) {
+     derr << __func__ << " no session" << dendl;
+     mon.reply_command(op, -EACCES, "access denied", get_last_committed());
+     return false; /* nothing to propose */
+   }
+
+   return prepare_command_impl(op, cmdmap);
+ }
+
+ /**
+  * Parse the "weights" command argument into an osd id -> weight map.
+  *
+  * The argument is a JSON object whose keys are osd ids and whose values are
+  * weights given as strings. Single quotes are accepted in place of double
+  * quotes.
+  *
+  * @param cct     unused
+  * @param cmdmap  parsed command arguments; reads "weights"
+  * @param osdmap  used to check that every referenced osd exists
+  * @param weights out; receives one entry per parsed osd (an id that is
+  *                already present is not overwritten)
+  * @return 0 on success; -ENOENT if an osd does not exist; -EINVAL if the
+  *         argument is missing, is not a JSON object, or an id/weight cannot
+  *         be parsed or does not fit its target type
+  */
+ static int parse_reweights(CephContext *cct,
+                            const cmdmap_t& cmdmap,
+                            const OSDMap& osdmap,
+                            map<int32_t, uint32_t>* weights)
+ {
+   string weights_str;
+   if (!cmd_getval(cmdmap, "weights", weights_str)) {
+     return -EINVAL;
+   }
+   // tolerate '...' quoting by turning it into regular JSON quoting
+   std::replace(begin(weights_str), end(weights_str), '\'', '"');
+   json_spirit::mValue json_value;
+   if (!json_spirit::read(weights_str, json_value)) {
+     return -EINVAL;
+   }
+   if (json_value.type() != json_spirit::obj_type) {
+     return -EINVAL;
+   }
+   // bind by reference; there is no need to copy the whole object
+   const auto& obj = json_value.get_obj();
+   try {
+     for (const auto& [osd_str, weight_val] : obj) {
+       const auto osd_id = std::stoi(osd_str);
+       if (!osdmap.exists(osd_id)) {
+         return -ENOENT;
+       }
+       if (weight_val.type() != json_spirit::str_type) {
+         return -EINVAL;
+       }
+       const auto weight = std::stoul(weight_val.get_str());
+       // std::stoul yields an unsigned long; refuse anything that would be
+       // silently truncated when stored as uint32_t
+       if (weight > UINT32_MAX) {
+         return -EINVAL;
+       }
+       weights->emplace(osd_id, static_cast<uint32_t>(weight));
+     }
+   } catch (const std::logic_error& e) {
+     // std::stoi/std::stoul report unparsable or out-of-range input via
+     // std::invalid_argument / std::out_of_range
+     return -EINVAL;
+   }
+   return 0;
+ }
+
+ /**
+  * Stage the destruction of osd @p id: remove its auth entities and config
+  * keys (when there are any) and mark it DESTROYED in the pending map.
+  *
+  * The caller is responsible for proposing; this function may be used as a
+  * step of `osd purge`, and a service may only propose once.
+  *
+  * @param op request to requeue if the auth or kv monitor is not writeable
+  * @param id osd to destroy
+  * @param ss receives status / error text
+  * @return 0 on success or if the osd is already destroyed; -ENOENT if the
+  *         osd does not exist; -EAGAIN if @p op was queued for retry; any
+  *         other negative errno from auth validation
+  */
+ int OSDMonitor::prepare_command_osd_destroy(
+   MonOpRequestRef op,
+   int32_t id,
+   stringstream& ss)
+ {
+   ceph_assert(paxos.is_plugged());
+
+   // `osd purge` may already have removed the osd, so a missing osd is
+   // reported as -ENOENT and left for the caller to interpret.
+   //
+   // Auth secrets and config keys are presumed to have been removed before
+   // this point; any that still exist are assumed to belong to something
+   // other than this non-existent osd.
+   if (!osdmap.exists(id)) {
+     dout(10) << __func__ << " osd." << id << " does not exist." << dendl;
+     return -ENOENT;
+   }
+
+   uuid_d uuid = osdmap.get_uuid(id);
+   dout(10) << __func__ << " destroying osd." << id
+            << " uuid " << uuid << dendl;
+
+   // if it has been destroyed, we assume our work here is done.
+   if (osdmap.is_destroyed(id)) {
+     ss << "destroyed osd." << id;
+     return 0;
+   }
+
+   EntityName cephx_entity, lockbox_entity;
+
+   // auth: -ENOENT means there is nothing to remove; any other failure
+   // aborts the command
+   auto&& authmon = mon.authmon();
+   const int auth_r = authmon->validate_osd_destroy(id, uuid,
+                                                    cephx_entity,
+                                                    lockbox_entity,
+                                                    ss);
+   if (auth_r < 0 && auth_r != -ENOENT) {
+     return auth_r;
+   }
+   const bool auth_is_noop = (auth_r == -ENOENT);
+
+   // config keys: the only failure expected here is -ENOENT
+   auto&& kvmon = mon.kvmon();
+   int err = kvmon->validate_osd_destroy(id, uuid);
+   const bool cks_is_noop = (err < 0);
+   if (cks_is_noop) {
+     ceph_assert(err == -ENOENT);
+     err = 0;
+   }
+
+   // both services must be writeable before either one is modified
+   if (!auth_is_noop && !authmon->is_writeable()) {
+     authmon->wait_for_writeable(op, new C_RetryMessage(this, op));
+     return -EAGAIN;
+   }
+   if (!cks_is_noop && !kvmon->is_writeable()) {
+     kvmon->wait_for_writeable(op, new C_RetryMessage(this, op));
+     return -EAGAIN;
+   }
+
+   if (!auth_is_noop) {
+     authmon->do_osd_destroy(cephx_entity, lockbox_entity);
+   }
+   if (!cks_is_noop) {
+     kvmon->do_osd_destroy(id, uuid);
+   }
+
+   pending_inc.new_state[id] = CEPH_OSD_DESTROYED;
+   pending_inc.new_uuid[id] = uuid_d();
+
+   // we can only propose_pending() once per service, otherwise we'll be
+   // defying PaxosService and all laws of nature. Therefore, as we may
+   // be used during 'osd purge', let's keep the caller responsible for
+   // proposing.
+   ceph_assert(err == 0);
+   return 0;
+ }
+
+ /**
+  * Stage a full purge of osd @p id: crush removal, destroy, and removal
+  * from the osdmap.
+  *
+  * Sequence:
+  *  1. try the crush removal on a private copy of the pending crush map;
+  *  2. call prepare_command_osd_destroy(); an error there aborts the whole
+  *     operation since nothing has been staged yet. Once it succeeds it has
+  *     side effects, so everything after it must always succeed;
+  *  3. call prepare_command_osd_remove(); its only failure is "osd is up",
+  *     which was asserted against on entry;
+  *  4. stage the crush map from step 1 via do_osd_crush_remove().
+  *
+  * @return 0 when changes were staged; -ENOENT when there was nothing to do
+  *         (osd neither in crush nor in the osdmap); other negative errno on
+  *         failure, including -EAGAIN from the destroy step
+  */
+ int OSDMonitor::prepare_command_osd_purge(
+   MonOpRequestRef op,
+   int32_t id,
+   stringstream& ss)
+ {
+   ceph_assert(paxos.is_plugged());
+   dout(10) << __func__ << " purging osd." << id << dendl;
+
+   ceph_assert(!osdmap.is_up(id));
+
+   CrushWrapper newcrush = _get_pending_crush();
+
+   int r = _prepare_command_osd_crush_remove(newcrush, id, 0, false, false);
+   // an osd that is already absent from crush hints at a repeated purge
+   bool may_be_idempotent = (r == -ENOENT);
+   if (r < 0) {
+     if (r != -ENOENT) {
+       ss << "error removing osd." << id << " from crush";
+       return r;
+     }
+     r = 0;
+   }
+
+   // no point destroying the osd again if it has already been marked destroyed
+   if (!osdmap.is_destroyed(id)) {
+     /* N.B.: up to this point, we've not changed pending at all.
+      * ::prepare_command_osd_destroy may return -EAGAIN if the kvmon/authmon is
+      * not writeable without changing pending. It will queue `op` if we should wait.
+      */
+     r = prepare_command_osd_destroy(op, id, ss);
+     if (r >= 0) {
+       may_be_idempotent = false;
+     } else if (r == -ENOENT) {
+       r = 0;
+     } else {
+       return r;
+     }
+   }
+   ceph_assert(0 == r);
+
+   if (may_be_idempotent && !osdmap.exists(id)) {
+     dout(10) << __func__ << " osd." << id << " does not exist and "
+              << "we are idempotent." << dendl;
+     return -ENOENT;
+   }
+
+   r = prepare_command_osd_remove(id);
+   // we should not be busy, as we should have made sure this id is not up.
+   ceph_assert(0 == r);
+
+   do_osd_crush_remove(newcrush);
+   return 0;
+ }
+
+ /**
+  * Extract the "pgid" command argument, parse it and check that the pg
+  * exists in the current osdmap.
+  *
+  * @param cmdmap parsed command arguments; reads "pgid"
+  * @param ss     receives error text
+  * @param pgid   out; the parsed pg id
+  * @param pgids  NOTE(review): taken by value, so the string stored into it
+  *               below never reaches the caller -- confirm whether a
+  *               reference or pointer was intended
+  * @return 0 on success; -EINVAL if the argument is missing or malformed;
+  *         -ENOENT if the pg does not exist
+  */
+ int OSDMonitor::parse_pgid(const cmdmap_t& cmdmap, stringstream &ss,
+                            /* out */ pg_t &pgid, std::optional<string> pgids) {
+   string pgidstr;
+   if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
+     // Look the key up rather than calling at(): at() throws
+     // std::out_of_range when "pgid" is not in the map at all.
+     auto raw = cmdmap.find("pgid");
+     if (raw == cmdmap.end()) {
+       ss << "missing required 'pgid' argument";
+     } else {
+       ss << "unable to parse 'pgid' value '"
+          << cmd_vartype_stringify(raw->second) << "'";
+     }
+     return -EINVAL;
+   }
+   if (!pgid.parse(pgidstr.c_str())) {
+     ss << "invalid pgid '" << pgidstr << "'";
+     return -EINVAL;
+   }
+   if (!osdmap.pg_exists(pgid)) {
+     ss << "pgid '" << pgid << "' does not exist";
+     return -ENOENT;
+   }
+   if (pgids.has_value())
+     pgids.value() = pgidstr;
+   return 0;
+ }
+
+bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
+ const cmdmap_t& cmdmap)
+{
+ op->mark_osdmon_event(__func__);
+ auto m = op->get_req<MMonCommand>();
+ stringstream ss;
+ string rs;
+ bufferlist rdata;
+ int err = 0;
+
+ string format = cmd_getval_or<string>(cmdmap, "format", "plain");
+ boost::scoped_ptr<Formatter> f(Formatter::create(format));
+
+ string prefix;
+ cmd_getval(cmdmap, "prefix", prefix);
+
+ int64_t osdid;
+ string osd_name;
+ bool osdid_present = false;
+ if (prefix != "osd pg-temp" &&
+ prefix != "osd pg-upmap" &&
+ prefix != "osd pg-upmap-items") { // avoid commands with non-int id arg
+ osdid_present = cmd_getval(cmdmap, "id", osdid);
+ }
+ if (osdid_present) {
+ ostringstream oss;
+ oss << "osd." << osdid;
+ osd_name = oss.str();
+ }
+
+ // Even if there's a pending state with changes that could affect
+ // a command, considering that said state isn't yet committed, we
+ // just don't care about those changes if the command currently being
+ // handled acts as a no-op against the current committed state.
+ // In a nutshell, we assume this command happens *before*.
+ //
+ // Let me make this clearer:
+ //
+ // - If we have only one client, and that client issues some
+ // operation that would conflict with this operation but is
+ // still on the pending state, then we would be sure that said
+ // operation wouldn't have returned yet, so the client wouldn't
+ // issue this operation (unless the client didn't wait for the
+ // operation to finish, and that would be the client's own fault).
+ //
+ // - If we have more than one client, each client will observe
+ // whatever is the state at the moment of the commit. So, if we
+ // have two clients, one issuing an unlink and another issuing a
+ // link, and if the link happens while the unlink is still on the
+ // pending state, from the link's point-of-view this is a no-op.
+ // If different clients are issuing conflicting operations and
+ // they care about that, then the clients should make sure they
+ // enforce some kind of concurrency mechanism -- from our
+ // perspective that's what Douglas Adams would call an SEP.
+ //
+ // This should be used as a general guideline for most commands handled
+ // in this function. Adapt as you see fit, but please bear in mind that
+ // this is the expected behavior.
+
+
+ if (prefix == "osd setcrushmap" ||
+ (prefix == "osd crush set" && !osdid_present)) {
+ if (pending_inc.crush.length()) {
+ dout(10) << __func__ << " waiting for pending crush update " << dendl;
+ goto wait;
+ }
+ dout(10) << "prepare_command setting new crush map" << dendl;
+ bufferlist data(m->get_data());
+ CrushWrapper crush;
+ try {
+ auto bl = data.cbegin();
+ crush.decode(bl);
+ }
+ catch (const std::exception &e) {
+ err = -EINVAL;
+ ss << "Failed to parse crushmap: " << e.what();
+ goto reply_no_propose;
+ }
+
+ int64_t prior_version = 0;
+ if (cmd_getval(cmdmap, "prior_version", prior_version)) {
+ if (prior_version == osdmap.get_crush_version() - 1) {
+ // see if we are a resend of the last update. this is imperfect
+ // (multiple racing updaters may not both get reliable success)
+ // but we expect crush updaters (via this interface) to be rare-ish.
+ bufferlist current, proposed;
+ osdmap.crush->encode(current, mon.get_quorum_con_features());
+ crush.encode(proposed, mon.get_quorum_con_features());
+ if (current.contents_equal(proposed)) {
+ dout(10) << __func__
+ << " proposed matches current and version equals previous"
+ << dendl;
+ err = 0;
+ ss << osdmap.get_crush_version();
+ goto reply_no_propose;
+ }
+ }
+ if (prior_version != osdmap.get_crush_version()) {
+ err = -EPERM;
+ ss << "prior_version " << prior_version << " != crush version "
+ << osdmap.get_crush_version();
+ goto reply_no_propose;
+ }
+ }
+
+ if (!validate_crush_against_features(&crush, ss)) {
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+
+ err = osdmap.validate_crush_rules(&crush, &ss);
+ if (err < 0) {
+ goto reply_no_propose;
+ }
+
+ if (g_conf()->mon_osd_crush_smoke_test) {
+ // sanity check: test some inputs to make sure this map isn't
+ // totally broken
+ dout(10) << " testing map" << dendl;
+ stringstream ess;
+ CrushTester tester(crush, ess);
+ tester.set_min_x(0);
+ tester.set_max_x(50);
+ tester.set_num_rep(3); // arbitrary
+ auto start = ceph::coarse_mono_clock::now();
+ int r = tester.test_with_fork(cct, g_conf()->mon_lease);
+ auto duration = ceph::coarse_mono_clock::now() - start;
+ if (r < 0) {
+ dout(10) << " tester.test_with_fork returns " << r
+ << ": " << ess.str() << dendl;
+ ss << "crush smoke test failed with " << r << ": " << ess.str();
+ err = r;
+ goto reply_no_propose;
+ }
+ dout(10) << __func__ << " crush somke test duration: "
+ << duration << ", result: " << ess.str() << dendl;
+ }
+
+ pending_inc.crush = data;
+ ss << osdmap.get_crush_version() + 1;
+ goto update;
+
+ } else if (prefix == "osd crush set-all-straw-buckets-to-straw2") {
+ CrushWrapper newcrush = _get_pending_crush();
+ for (int b = 0; b < newcrush.get_max_buckets(); ++b) {
+ int bid = -1 - b;
+ if (newcrush.bucket_exists(bid) &&
+ newcrush.get_bucket_alg(bid) == CRUSH_BUCKET_STRAW) {
+ dout(20) << " bucket " << bid << " is straw, can convert" << dendl;
+ newcrush.bucket_set_alg(bid, CRUSH_BUCKET_STRAW2);
+ }
+ }
+ if (!validate_crush_against_features(&newcrush, ss)) {
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+ pending_inc.crush.clear();
+ newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+ wait_for_commit(op, new Monitor::C_Command(mon, op, 0, rs,
+ get_last_committed() + 1));
+ return true;
+ } else if (prefix == "osd crush set-device-class") {
+ string device_class;
+ if (!cmd_getval(cmdmap, "class", device_class)) {
+ err = -EINVAL; // no value!
+ goto reply_no_propose;
+ }
+
+ bool stop = false;
+ vector<string> idvec;
+ cmd_getval(cmdmap, "ids", idvec);
+ CrushWrapper newcrush = _get_pending_crush();
+ set<int> updated;
+ for (unsigned j = 0; j < idvec.size() && !stop; j++) {
+ set<int> osds;
+ // wildcard?
+ if (j == 0 &&
+ (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
+ osdmap.get_all_osds(osds);
+ stop = true;
+ } else {
+ // try traditional single osd way
+ long osd = parse_osd_id(idvec[j].c_str(), &ss);
+ if (osd < 0) {
+ // ss has reason for failure
+ ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
+ err = -EINVAL;
+ continue;
+ }
+ osds.insert(osd);
+ }
+
+ for (auto &osd : osds) {
+ if (!osdmap.exists(osd)) {
+ ss << "osd." << osd << " does not exist. ";
+ continue;
+ }
+
+ ostringstream oss;
+ oss << "osd." << osd;
+ string name = oss.str();
+
+ if (newcrush.get_max_devices() < osd + 1) {
+ newcrush.set_max_devices(osd + 1);
+ }
+ string action;
+ if (newcrush.item_exists(osd)) {
+ action = "updating";
+ } else {
+ action = "creating";
+ newcrush.set_item_name(osd, name);
+ }
+
+ dout(5) << action << " crush item id " << osd << " name '" << name
+ << "' device_class '" << device_class << "'"
+ << dendl;
+ err = newcrush.update_device_class(osd, device_class, name, &ss);
+ if (err < 0) {
+ goto reply_no_propose;
+ }
+ if (err == 0 && !_have_pending_crush()) {
+ if (!stop) {
+ // for single osd only, wildcard makes too much noise
+ ss << "set-device-class item id " << osd << " name '" << name
+ << "' device_class '" << device_class << "': no change. ";
+ }
+ } else {
+ updated.insert(osd);
+ }
+ }
+ }
+
+ pending_inc.crush.clear();
+ newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+ ss << "set osd(s) " << updated << " to class '" << device_class << "'";
+ getline(ss, rs);
+ wait_for_commit(
+ op,
+ new Monitor::C_Command(mon,op, 0, rs, get_last_committed() + 1));
+ return true;
+ } else if (prefix == "osd crush rm-device-class") {
+ bool stop = false;
+ vector<string> idvec;
+ cmd_getval(cmdmap, "ids", idvec);
+ CrushWrapper newcrush = _get_pending_crush();
+ set<int> updated;
+
+ for (unsigned j = 0; j < idvec.size() && !stop; j++) {
+ set<int> osds;
+
+ // wildcard?
+ if (j == 0 &&
+ (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
+ osdmap.get_all_osds(osds);
+ stop = true;
+ } else {
+ // try traditional single osd way
+ long osd = parse_osd_id(idvec[j].c_str(), &ss);
+ if (osd < 0) {
+ // ss has reason for failure
+ ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+ osds.insert(osd);
+ }
+
+ for (auto &osd : osds) {
+ if (!osdmap.exists(osd)) {
+ ss << "osd." << osd << " does not exist. ";
+ continue;
+ }
+
+ auto class_name = newcrush.get_item_class(osd);
+ if (!class_name) {
+ ss << "osd." << osd << " belongs to no class, ";
+ continue;
+ }
+ // note that we deliberately do not check class_is_in_use here:
+ // the device may be misclassified and the user may want to
+ // forcibly reset its class anyway.
+
+ err = newcrush.remove_device_class(cct, osd, &ss);
+ if (err < 0) {
+ // ss has reason for failure
+ goto reply_no_propose;
+ }
+ updated.insert(osd);
+ }
+ }
+
+ pending_inc.crush.clear();
+ newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+ ss << "done removing class of osd(s): " << updated;
+ getline(ss, rs);
+ wait_for_commit(
+ op,
+ new Monitor::C_Command(mon,op, 0, rs, get_last_committed() + 1));
+ return true;
+ } else if (prefix == "osd crush class create") {
+ string device_class;
+ if (!cmd_getval(cmdmap, "class", device_class)) {
+ err = -EINVAL; // no value!
+ goto reply_no_propose;
+ }
+ if (osdmap.require_osd_release < ceph_release_t::luminous) {
+ ss << "you must complete the upgrade and 'ceph osd require-osd-release "
+ << "luminous' before using crush device classes";
+ err = -EPERM;
+ goto reply_no_propose;
+ }
+ if (!_have_pending_crush() &&
+ _get_stable_crush().class_exists(device_class)) {
+ ss << "class '" << device_class << "' already exists";
+ goto reply_no_propose;
+ }
+ CrushWrapper newcrush = _get_pending_crush();
+ if (newcrush.class_exists(device_class)) {
+ ss << "class '" << device_class << "' already exists";
+ goto update;
+ }
+ int class_id = newcrush.get_or_create_class_id(device_class);
+ pending_inc.crush.clear();
+ newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+ ss << "created class " << device_class << " with id " << class_id
+ << " to crush map";
+ goto update;
+ } else if (prefix == "osd crush class rm") {
+ string device_class;
+ if (!cmd_getval(cmdmap, "class", device_class)) {
+ err = -EINVAL; // no value!
+ goto reply_no_propose;
+ }
+ if (osdmap.require_osd_release < ceph_release_t::luminous) {
+ ss << "you must complete the upgrade and 'ceph osd require-osd-release "
+ << "luminous' before using crush device classes";
+ err = -EPERM;
+ goto reply_no_propose;
+ }
+
+ if (!osdmap.crush->class_exists(device_class)) {
+ err = 0;
+ goto reply_no_propose;
+ }
+
+ CrushWrapper newcrush = _get_pending_crush();
+ if (!newcrush.class_exists(device_class)) {
+ err = 0; // make command idempotent
+ goto wait;
+ }
+ int class_id = newcrush.get_class_id(device_class);
+ stringstream ts;
+ if (newcrush.class_is_in_use(class_id, &ts)) {
+ err = -EBUSY;
+ ss << "class '" << device_class << "' " << ts.str();
+ goto reply_no_propose;
+ }
+
+ // check if class is used by any erasure-code-profiles
+ mempool::osdmap::map<string,map<string,string>> old_ec_profiles =
+ osdmap.get_erasure_code_profiles();
+ auto ec_profiles = pending_inc.get_erasure_code_profiles();
+#ifdef HAVE_STDLIB_MAP_SPLICING
+ ec_profiles.merge(old_ec_profiles);
+#else
+ ec_profiles.insert(make_move_iterator(begin(old_ec_profiles)),
+ make_move_iterator(end(old_ec_profiles)));
+#endif
+ list<string> referenced_by;
+ for (auto &i: ec_profiles) {
+ for (auto &j: i.second) {
+ if ("crush-device-class" == j.first && device_class == j.second) {
+ referenced_by.push_back(i.first);
+ }
+ }
+ }
+ if (!referenced_by.empty()) {
+ err = -EBUSY;
+ ss << "class '" << device_class
+ << "' is still referenced by erasure-code-profile(s): " << referenced_by;
+ goto reply_no_propose;
+ }
+
+ set<int> osds;
+ newcrush.get_devices_by_class(device_class, &osds);
+ for (auto& p: osds) {
+ err = newcrush.remove_device_class(cct, p, &ss);
+ if (err < 0) {
+ // ss has reason for failure
+ goto reply_no_propose;
+ }
+ }
+
+ if (osds.empty()) {
+ // empty class, remove directly
+ err = newcrush.remove_class_name(device_class);
+ if (err < 0) {
+ ss << "class '" << device_class << "' cannot be removed '"
+ << cpp_strerror(err) << "'";
+ goto reply_no_propose;
+ }
+ }
+
+ pending_inc.crush.clear();
+ newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+ ss << "removed class " << device_class << " with id " << class_id
+ << " from crush map";
+ goto update;
+ } else if (prefix == "osd crush class rename") {
+ string srcname, dstname;
+ if (!cmd_getval(cmdmap, "srcname", srcname)) {
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+ if (!cmd_getval(cmdmap, "dstname", dstname)) {
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+
+ CrushWrapper newcrush = _get_pending_crush();
+ if (!newcrush.class_exists(srcname) && newcrush.class_exists(dstname)) {
+ // suppose this is a replay and return success
+ // so command is idempotent
+ ss << "already renamed to '" << dstname << "'";
+ err = 0;
+ goto reply_no_propose;
+ }
+
+ err = newcrush.rename_class(srcname, dstname);
+ if (err < 0) {
+ ss << "fail to rename '" << srcname << "' to '" << dstname << "' : "
+ << cpp_strerror(err);
+ goto reply_no_propose;
+ }
+
+ pending_inc.crush.clear();
+ newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+ ss << "rename class '" << srcname << "' to '" << dstname << "'";
+ goto update;
+ } else if (prefix == "osd crush add-bucket") {
+ // osd crush add-bucket <name> <type> [<loc1> ...]
+ string name, typestr;
+ vector<string> argvec;
+ cmd_getval(cmdmap, "name", name);
+ cmd_getval(cmdmap, "type", typestr);
+ cmd_getval(cmdmap, "args", argvec);
+ map<string,string> loc;
+ if (!argvec.empty()) {
+ CrushWrapper::parse_loc_map(argvec, &loc);
+ dout(0) << "will create and move bucket '" << name
+ << "' to location " << loc << dendl;
+ }
+
+ if (!_have_pending_crush() &&
+ _get_stable_crush().name_exists(name)) {
+ ss << "bucket '" << name << "' already exists";
+ goto reply_no_propose;
+ }
+
+ CrushWrapper newcrush = _get_pending_crush();
+
+ if (newcrush.name_exists(name)) {
+ ss << "bucket '" << name << "' already exists";
+ goto update;
+ }
+ int type = newcrush.get_type_id(typestr);
+ if (type < 0) {
+ ss << "type '" << typestr << "' does not exist";
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+ if (type == 0) {
+ ss << "type '" << typestr << "' is for devices, not buckets";
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+ int bucketno;
+ err = newcrush.add_bucket(0, 0,
+ CRUSH_HASH_DEFAULT, type, 0, NULL,
+ NULL, &bucketno);
+ if (err < 0) {
+ ss << "add_bucket error: '" << cpp_strerror(err) << "'";
+ goto reply_no_propose;
+ }
+ err = newcrush.set_item_name(bucketno, name);
+ if (err < 0) {
+ ss << "error setting bucket name to '" << name << "'";
+ goto reply_no_propose;
+ }
+
+ if (!loc.empty()) {
+ if (!newcrush.check_item_loc(cct, bucketno, loc,
+ (int *)NULL)) {
+ err = newcrush.move_bucket(cct, bucketno, loc);
+ if (err < 0) {
+ ss << "error moving bucket '" << name << "' to location " << loc;
+ goto reply_no_propose;
+ }
+ } else {
+ ss << "no need to move item id " << bucketno << " name '" << name
+ << "' to location " << loc << " in crush map";
+ }
+ }
+
+ pending_inc.crush.clear();
+ newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+ if (loc.empty()) {
+ ss << "added bucket " << name << " type " << typestr
+ << " to crush map";
+ } else {
+ ss << "added bucket " << name << " type " << typestr
+ << " to location " << loc;
+ }
+ goto update;
+ } else if (prefix == "osd crush rename-bucket") {
+ string srcname, dstname;
+ cmd_getval(cmdmap, "srcname", srcname);
+ cmd_getval(cmdmap, "dstname", dstname);
+
+ err = crush_rename_bucket(srcname, dstname, &ss);
+ if (err) {
+ // equivalent to success for idempotency
+ if (err == -EALREADY) {
+ err = 0;
+ }
+ goto reply_no_propose;
+ } else {
+ goto update;
+ }
+ } else if (prefix == "osd crush weight-set create" ||
+ prefix == "osd crush weight-set create-compat") {
+ if (_have_pending_crush()) {
+ dout(10) << " first waiting for pending crush changes to commit" << dendl;
+ goto wait;
+ }
+ CrushWrapper newcrush = _get_pending_crush();
+ int64_t pool;
+ int positions;
+ if (newcrush.has_non_straw2_buckets()) {
+ ss << "crush map contains one or more bucket(s) that are not straw2";
+ err = -EPERM;
+ goto reply_no_propose;
+ }
+ if (prefix == "osd crush weight-set create") {
+ if (osdmap.require_min_compat_client != ceph_release_t::unknown &&
+ osdmap.require_min_compat_client < ceph_release_t::luminous) {
+ ss << "require_min_compat_client "
+ << osdmap.require_min_compat_client
+ << " < luminous, which is required for per-pool weight-sets. "
+ << "Try 'ceph osd set-require-min-compat-client luminous' "
+ << "before using the new interface";
+ err = -EPERM;
+ goto reply_no_propose;
+ }
+ string poolname, mode;
+ cmd_getval(cmdmap, "pool", poolname);
+ pool = osdmap.lookup_pg_pool_name(poolname.c_str());
+ if (pool < 0) {
+ ss << "pool '" << poolname << "' not found";
+ err = -ENOENT;
+ goto reply_no_propose;
+ }
+ cmd_getval(cmdmap, "mode", mode);
+ if (mode != "flat" && mode != "positional") {
+ ss << "unrecognized weight-set mode '" << mode << "'";
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+ positions = mode == "flat" ? 1 : osdmap.get_pg_pool(pool)->get_size();
+ } else {
+ pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
+ positions = 1;
+ }
+ if (!newcrush.create_choose_args(pool, positions)) {
+ if (pool == CrushWrapper::DEFAULT_CHOOSE_ARGS) {
+ ss << "compat weight-set already created";
+ } else {
+ ss << "weight-set for pool '" << osdmap.get_pool_name(pool)
+ << "' already created";
+ }
+ goto reply_no_propose;
+ }
+ pending_inc.crush.clear();
+ newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+ goto update;
+
+ } else if (prefix == "osd crush weight-set rm" ||
+ prefix == "osd crush weight-set rm-compat") {
+ CrushWrapper newcrush = _get_pending_crush();
+ int64_t pool;
+ if (prefix == "osd crush weight-set rm") {
+ string poolname;
+ cmd_getval(cmdmap, "pool", poolname);
+ pool = osdmap.lookup_pg_pool_name(poolname.c_str());
+ if (pool < 0) {
+ ss << "pool '" << poolname << "' not found";
+ err = -ENOENT;
+ goto reply_no_propose;
+ }
+ } else {
+ pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
+ }
+ newcrush.rm_choose_args(pool);
+ pending_inc.crush.clear();
+ newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+ goto update;
+
+ } else if (prefix == "osd crush weight-set reweight" ||
+ prefix == "osd crush weight-set reweight-compat") {
+ string poolname, item;
+ vector<double> weight;
+ cmd_getval(cmdmap, "pool", poolname);
+ cmd_getval(cmdmap, "item", item);
+ cmd_getval(cmdmap, "weight", weight);
+ CrushWrapper newcrush = _get_pending_crush();
+ int64_t pool;
+ if (prefix == "osd crush weight-set reweight") {
+ pool = osdmap.lookup_pg_pool_name(poolname.c_str());
+ if (pool < 0) {
+ ss << "pool '" << poolname << "' not found";
+ err = -ENOENT;
+ goto reply_no_propose;
+ }
+ if (!newcrush.have_choose_args(pool)) {
+ ss << "no weight-set for pool '" << poolname << "'";
+ err = -ENOENT;
+ goto reply_no_propose;
+ }
+ auto arg_map = newcrush.choose_args_get(pool);
+ int positions = newcrush.get_choose_args_positions(arg_map);
+ if (weight.size() != (size_t)positions) {
+ ss << "must specify exact " << positions << " weight values";
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+ } else {
+ pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
+ if (!newcrush.have_choose_args(pool)) {
+ ss << "no backward-compatible weight-set";
+ err = -ENOENT;
+ goto reply_no_propose;
+ }
+ }
+ if (!newcrush.name_exists(item)) {
+ ss << "item '" << item << "' does not exist";
+ err = -ENOENT;
+ goto reply_no_propose;
+ }
+ err = newcrush.choose_args_adjust_item_weightf(
+ cct,
+ newcrush.choose_args_get(pool),
+ newcrush.get_item_id(item),
+ weight,
+ &ss);
+ if (err < 0) {
+ goto reply_no_propose;
+ }
+ err = 0;
+ pending_inc.crush.clear();
+ newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+ goto update;
+ } else if (osdid_present &&
+ (prefix == "osd crush set" || prefix == "osd crush add")) {
+ // <OsdName> is 'osd.<id>' or '<id>', passed as int64_t id
+ // osd crush set <OsdName> <weight> <loc1> [<loc2> ...]
+ // osd crush add <OsdName> <weight> <loc1> [<loc2> ...]
+
+ if (!osdmap.exists(osdid)) {
+ err = -ENOENT;
+ ss << osd_name
+ << " does not exist. Create it before updating the crush map";
+ goto reply_no_propose;
+ }
+
+ double weight;
+ if (!cmd_getval(cmdmap, "weight", weight)) {
+ ss << "unable to parse weight value '"
+ << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+
+ string args;
+ vector<string> argvec;
+ cmd_getval(cmdmap, "args", argvec);
+ map<string,string> loc;
+ CrushWrapper::parse_loc_map(argvec, &loc);
+
+ if (prefix == "osd crush set"
+ && !_get_stable_crush().item_exists(osdid)) {
+ err = -ENOENT;
+ ss << "unable to set item id " << osdid << " name '" << osd_name
+ << "' weight " << weight << " at location " << loc
+ << ": does not exist";
+ goto reply_no_propose;
+ }
+
+ dout(5) << "adding/updating crush item id " << osdid << " name '"
+ << osd_name << "' weight " << weight << " at location "
+ << loc << dendl;
+ CrushWrapper newcrush = _get_pending_crush();
+
+ string action;
+ if (prefix == "osd crush set" ||
+ newcrush.check_item_loc(cct, osdid, loc, (int *)NULL)) {
+ action = "set";
+ err = newcrush.update_item(cct, osdid, weight, osd_name, loc);
+ } else {
+ action = "add";
+ err = newcrush.insert_item(cct, osdid, weight, osd_name, loc);
+ if (err == 0)
+ err = 1;
+ }
+
+ if (err < 0)
+ goto reply_no_propose;
+
+ if (err == 0 && !_have_pending_crush()) {
+ ss << action << " item id " << osdid << " name '" << osd_name
+ << "' weight " << weight << " at location " << loc << ": no change";
+ goto reply_no_propose;
+ }
+
+ pending_inc.crush.clear();
+ newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+ ss << action << " item id " << osdid << " name '" << osd_name << "' weight "
+ << weight << " at location " << loc << " to crush map";
+ getline(ss, rs);
+ wait_for_commit(op, new Monitor::C_Command(mon, op, 0, rs,
+ get_last_committed() + 1));
+ return true;
+
+ } else if (prefix == "osd crush create-or-move") {
+ do {
+ // osd crush create-or-move <OsdName> <initial_weight> <loc1> [<loc2> ...]
+ if (!osdmap.exists(osdid)) {
+ err = -ENOENT;
+ ss << osd_name
+ << " does not exist. create it before updating the crush map";
+ goto reply_no_propose;
+ }
+
+ double weight;
+ if (!cmd_getval(cmdmap, "weight", weight)) {
+ ss << "unable to parse weight value '"
+ << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+
+ string args;
+ vector<string> argvec;
+ cmd_getval(cmdmap, "args", argvec);
+ map<string,string> loc;
+ CrushWrapper::parse_loc_map(argvec, &loc);
+
+ dout(0) << "create-or-move crush item name '" << osd_name
+ << "' initial_weight " << weight << " at location " << loc
+ << dendl;
+
+ CrushWrapper newcrush = _get_pending_crush();
+
+ err = newcrush.create_or_move_item(cct, osdid, weight, osd_name, loc,
+ g_conf()->osd_crush_update_weight_set);
+ if (err == 0) {
+ ss << "create-or-move updated item name '" << osd_name
+ << "' weight " << weight
+ << " at location " << loc << " to crush map";
+ break;
+ }
+ if (err > 0) {
+ pending_inc.crush.clear();
+ newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+ ss << "create-or-move updating item name '" << osd_name
+ << "' weight " << weight
+ << " at location " << loc << " to crush map";
+ getline(ss, rs);
+ wait_for_commit(op, new Monitor::C_Command(mon, op, 0, rs,
+ get_last_committed() + 1));
+ return true;
+ }
+ } while (false);
+
+ } else if (prefix == "osd crush move") {
+ do {
+ // osd crush move <name> <loc1> [<loc2> ...]
+ string name;
+ vector<string> argvec;
+ cmd_getval(cmdmap, "name", name);
+ cmd_getval(cmdmap, "args", argvec);
+ map<string,string> loc;
+ CrushWrapper::parse_loc_map(argvec, &loc);
+
+ dout(0) << "moving crush item name '" << name << "' to location " << loc << dendl;
+ CrushWrapper newcrush = _get_pending_crush();
+
+ if (!newcrush.name_exists(name)) {
+ err = -ENOENT;
+ ss << "item " << name << " does not exist";
+ break;
+ }
+ int id = newcrush.get_item_id(name);
+
+ if (!newcrush.check_item_loc(cct, id, loc, (int *)NULL)) {
+ if (id >= 0) {
+ err = newcrush.create_or_move_item(
+ cct, id, 0, name, loc,
+ g_conf()->osd_crush_update_weight_set);
+ } else {
+ err = newcrush.move_bucket(cct, id, loc);
+ }
+ if (err >= 0) {
+ ss << "moved item id " << id << " name '" << name << "' to location " << loc << " in crush map";
+ pending_inc.crush.clear();
+ newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+ getline(ss, rs);
+ wait_for_commit(op, new Monitor::C_Command(mon, op, 0, rs,
+ get_last_committed() + 1));
+ return true;
+ }
+ } else {
+ ss << "no need to move item id " << id << " name '" << name << "' to location " << loc << " in crush map";
+ err = 0;
+ }
+ } while (false);
+ } else if (prefix == "osd crush swap-bucket") {
+ string source, dest;
+ cmd_getval(cmdmap, "source", source);
+ cmd_getval(cmdmap, "dest", dest);
+
+ bool force = false;
+ cmd_getval(cmdmap, "yes_i_really_mean_it", force);
+
+ CrushWrapper newcrush = _get_pending_crush();
+ if (!newcrush.name_exists(source)) {
+ ss << "source item " << source << " does not exist";
+ err = -ENOENT;
+ goto reply_no_propose;
+ }
+ if (!newcrush.name_exists(dest)) {
+ ss << "dest item " << dest << " does not exist";
+ err = -ENOENT;
+ goto reply_no_propose;
+ }
+ int sid = newcrush.get_item_id(source);
+ int did = newcrush.get_item_id(dest);
+ int sparent;
+ if (newcrush.get_immediate_parent_id(sid, &sparent) == 0 && !force) {
+ ss << "source item " << source << " is not an orphan bucket; pass --yes-i-really-mean-it to proceed anyway";
+ err = -EPERM;
+ goto reply_no_propose;
+ }
+ if (newcrush.get_bucket_alg(sid) != newcrush.get_bucket_alg(did) &&
+ !force) {
+ ss << "source bucket alg " << crush_alg_name(newcrush.get_bucket_alg(sid)) << " != "
+ << "dest bucket alg " << crush_alg_name(newcrush.get_bucket_alg(did))
+ << "; pass --yes-i-really-mean-it to proceed anyway";
+ err = -EPERM;
+ goto reply_no_propose;
+ }
+ int r = newcrush.swap_bucket(cct, sid, did);
+ if (r < 0) {
+ ss << "failed to swap bucket contents: " << cpp_strerror(r);
+ err = r;
+ goto reply_no_propose;
+ }
+ ss << "swapped bucket of " << source << " to " << dest;
+ pending_inc.crush.clear();
+ newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+ wait_for_commit(op,
+ new Monitor::C_Command(mon, op, err, ss.str(),
+ get_last_committed() + 1));
+ return true;
+ } else if (prefix == "osd crush link") {
+ // osd crush link <name> <loc1> [<loc2> ...]
+ string name;
+ cmd_getval(cmdmap, "name", name);
+ vector<string> argvec;
+ cmd_getval(cmdmap, "args", argvec);
+ map<string,string> loc;
+ CrushWrapper::parse_loc_map(argvec, &loc);
+
+ // Need an explicit check for name_exists because get_item_id returns
+ // 0 when the name is not found.
+ int id = osdmap.crush->get_item_id(name);
+ if (!osdmap.crush->name_exists(name)) {
+ err = -ENOENT;
+ ss << "item " << name << " does not exist";
+ goto reply_no_propose;
+ } else {
+ dout(5) << "resolved crush name '" << name << "' to id " << id << dendl;
+ }
+ if (osdmap.crush->check_item_loc(cct, id, loc, (int*) NULL)) {
+ ss << "no need to move item id " << id << " name '" << name
+ << "' to location " << loc << " in crush map";
+ err = 0;
+ goto reply_no_propose;
+ }
+
+ dout(5) << "linking crush item name '" << name << "' at location " << loc << dendl;
+ CrushWrapper newcrush = _get_pending_crush();
+
+ if (!newcrush.name_exists(name)) {
+ err = -ENOENT;
+ ss << "item " << name << " does not exist";
+ goto reply_no_propose;
+ } else {
+ int id = newcrush.get_item_id(name);
+ if (!newcrush.check_item_loc(cct, id, loc, (int *)NULL)) {
+ err = newcrush.link_bucket(cct, id, loc);
+ if (err >= 0) {
+ ss << "linked item id " << id << " name '" << name
+ << "' to location " << loc << " in crush map";
+ pending_inc.crush.clear();
+ newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+ } else {
+ ss << "cannot link item id " << id << " name '" << name
+ << "' to location " << loc;
+ goto reply_no_propose;
+ }
+ } else {
+ ss << "no need to move item id " << id << " name '" << name
+ << "' to location " << loc << " in crush map";
+ err = 0;
+ }
+ }
+ wait_for_commit(op, new Monitor::C_Command(mon, op, err, ss.str(),
+ get_last_committed() + 1));
+ return true;
+ } else if (prefix == "osd crush rm" ||
+ prefix == "osd crush remove" ||
+ prefix == "osd crush unlink") {
+ do {
+ // osd crush rm|remove|unlink <name> [<ancestor>]
+ CrushWrapper newcrush = _get_pending_crush();
+
+ string name;
+ cmd_getval(cmdmap, "name", name);
+
+ if (!osdmap.crush->name_exists(name)) {
+ err = 0;
+ ss << "device '" << name << "' does not appear in the crush map";
+ break;
+ }
+ if (!newcrush.name_exists(name)) {
+ err = 0;
+ ss << "device '" << name << "' does not appear in the crush map";
+ getline(ss, rs);
+ wait_for_commit(op, new Monitor::C_Command(mon, op, 0, rs,
+ get_last_committed() + 1));
+ return true;
+ }
+ int id = newcrush.get_item_id(name);
+ int ancestor = 0;
+
+ bool unlink_only = prefix == "osd crush unlink";
+ string ancestor_str;
+ if (cmd_getval(cmdmap, "ancestor", ancestor_str)) {
+ if (!newcrush.name_exists(ancestor_str)) {
+ err = -ENOENT;
+ ss << "ancestor item '" << ancestor_str
+ << "' does not appear in the crush map";
+ break;
+ }
+ ancestor = newcrush.get_item_id(ancestor_str);
+ }
+
+ err = prepare_command_osd_crush_remove(
+ newcrush,
+ id, ancestor,
+ (ancestor < 0), unlink_only);
+
+ if (err == -ENOENT) {
+ ss << "item " << id << " does not appear in that position";
+ err = 0;
+ break;
+ }
+ if (err == 0) {
+ if (!unlink_only)
+ pending_inc.new_crush_node_flags[id] = 0;
+ ss << "removed item id " << id << " name '" << name << "' from crush map";
+ getline(ss, rs);
+ wait_for_commit(op, new Monitor::C_Command(mon, op, 0, rs,
+ get_last_committed() + 1));
+ return true;
+ }
+ } while (false);
+
+ } else if (prefix == "osd crush reweight-all") {
+ CrushWrapper newcrush = _get_pending_crush();
+
+ newcrush.reweight(cct);
+ pending_inc.crush.clear();
+ newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+ ss << "reweighted crush hierarchy";
+ getline(ss, rs);
+ wait_for_commit(op, new Monitor::C_Command(mon, op, 0, rs,
+ get_last_committed() + 1));
+ return true;
+ } else if (prefix == "osd crush reweight") {
+ // osd crush reweight <name> <weight>
+ CrushWrapper newcrush = _get_pending_crush();
+
+ string name;
+ cmd_getval(cmdmap, "name", name);
+ if (!newcrush.name_exists(name)) {
+ err = -ENOENT;
+ ss << "device '" << name << "' does not appear in the crush map";
+ goto reply_no_propose;
+ }
+
+ int id = newcrush.get_item_id(name);
+ if (id < 0) {
+ ss << "device '" << name << "' is not a leaf in the crush map";
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+ double w;
+ if (!cmd_getval(cmdmap, "weight", w)) {
+ ss << "unable to parse weight value '"
+ << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+
+ err = newcrush.adjust_item_weightf(cct, id, w,
+ g_conf()->osd_crush_update_weight_set);
+ if (err < 0)
+ goto reply_no_propose;
+ pending_inc.crush.clear();
+ newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+ ss << "reweighted item id " << id << " name '" << name << "' to " << w
+ << " in crush map";
+ getline(ss, rs);
+ wait_for_commit(op, new Monitor::C_Command(mon, op, 0, rs,
+ get_last_committed() + 1));
+ return true;
+ } else if (prefix == "osd crush reweight-subtree") {
+ // osd crush reweight-subtree <name> <weight>
+ CrushWrapper newcrush = _get_pending_crush();
+
+ string name;
+ cmd_getval(cmdmap, "name", name);
+ if (!newcrush.name_exists(name)) {
+ err = -ENOENT;
+ ss << "device '" << name << "' does not appear in the crush map";
+ goto reply_no_propose;
+ }
+
+ int id = newcrush.get_item_id(name);
+ if (id >= 0) {
+ ss << "device '" << name << "' is not a subtree in the crush map";
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+ double w;
+ if (!cmd_getval(cmdmap, "weight", w)) {
+ ss << "unable to parse weight value '"
+ << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+
+ err = newcrush.adjust_subtree_weightf(cct, id, w,
+ g_conf()->osd_crush_update_weight_set);
+ if (err < 0)
+ goto reply_no_propose;
+ pending_inc.crush.clear();
+ newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+ ss << "reweighted subtree id " << id << " name '" << name << "' to " << w
+ << " in crush map";
+ getline(ss, rs);
+ wait_for_commit(op, new Monitor::C_Command(mon, op, 0, rs,
+ get_last_committed() + 1));
+ return true;
+ } else if (prefix == "osd crush tunables") {
+ CrushWrapper newcrush = _get_pending_crush();
+
+ err = 0;
+ string profile;
+ cmd_getval(cmdmap, "profile", profile);
+ if (profile == "legacy" || profile == "argonaut") {
+ newcrush.set_tunables_legacy();
+ } else if (profile == "bobtail") {
+ newcrush.set_tunables_bobtail();
+ } else if (profile == "firefly") {
+ newcrush.set_tunables_firefly();
+ } else if (profile == "hammer") {
+ newcrush.set_tunables_hammer();
+ } else if (profile == "jewel") {
+ newcrush.set_tunables_jewel();
+ } else if (profile == "optimal") {
+ newcrush.set_tunables_optimal();
+ } else if (profile == "default") {
+ newcrush.set_tunables_default();
+ } else {
+ ss << "unrecognized profile '" << profile << "'";
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+
+ if (!validate_crush_against_features(&newcrush, ss)) {
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+
+ pending_inc.crush.clear();
+ newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+ ss << "adjusted tunables profile to " << profile;
+ getline(ss, rs);
+ wait_for_commit(op, new Monitor::C_Command(mon, op, 0, rs,
+ get_last_committed() + 1));
+ return true;
+ } else if (prefix == "osd crush set-tunable") {
+ CrushWrapper newcrush = _get_pending_crush();
+
+ err = 0;
+ string tunable;
+ cmd_getval(cmdmap, "tunable", tunable);
+
+ int64_t value = -1;
+ if (!cmd_getval(cmdmap, "value", value)) {
+ err = -EINVAL;
+ ss << "failed to parse integer value "
+ << cmd_vartype_stringify(cmdmap.at("value"));
+ goto reply_no_propose;
+ }
+
+ if (tunable == "straw_calc_version") {
+ if (value != 0 && value != 1) {
+ ss << "value must be 0 or 1; got " << value;
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+ newcrush.set_straw_calc_version(value);
+ } else {
+ ss << "unrecognized tunable '" << tunable << "'";
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+
+ if (!validate_crush_against_features(&newcrush, ss)) {
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+
+ pending_inc.crush.clear();
+ newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+ ss << "adjusted tunable " << tunable << " to " << value;
+ getline(ss, rs);
+ wait_for_commit(op, new Monitor::C_Command(mon, op, 0, rs,
+ get_last_committed() + 1));
+ return true;
+
+ } else if (prefix == "osd crush rule create-simple") {
+ string name, root, type, mode;
+ cmd_getval(cmdmap, "name", name);
+ cmd_getval(cmdmap, "root", root);
+ cmd_getval(cmdmap, "type", type);
+ cmd_getval(cmdmap, "mode", mode);
+ if (mode == "")
+ mode = "firstn";
+
+ if (osdmap.crush->rule_exists(name)) {
+ // The name is uniquely associated with a rule id and the rule it contains.
+ // From the user's point of view, the rule is more meaningful.
+ ss << "rule " << name << " already exists";
+ err = 0;
+ goto reply_no_propose;
+ }
+
+ CrushWrapper newcrush = _get_pending_crush();
+
+ if (newcrush.rule_exists(name)) {
+ // The name is uniquely associated with a rule id and the rule it contains.
+ // From the user's point of view, the rule is more meaningful.
+ ss << "rule " << name << " already exists";
+ err = 0;
+ } else {
+ int ruleno = newcrush.add_simple_rule(name, root, type, "", mode,
+ pg_pool_t::TYPE_REPLICATED, &ss);
+ if (ruleno < 0) {
+ err = ruleno;
+ goto reply_no_propose;
+ }
+
+ pending_inc.crush.clear();
+ newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+ }
+ getline(ss, rs);
+ wait_for_commit(op, new Monitor::C_Command(mon, op, 0, rs,
+ get_last_committed() + 1));
+ return true;
+
+ } else if (prefix == "osd crush rule create-replicated") {
+ string name, root, type, device_class;
+ cmd_getval(cmdmap, "name", name);
+ cmd_getval(cmdmap, "root", root);
+ cmd_getval(cmdmap, "type", type);
+ cmd_getval(cmdmap, "class", device_class);
+
+ if (osdmap.crush->rule_exists(name)) {
+ // The name is uniquely associated with a rule id and the rule it contains.
+ // From the user's point of view, the rule is more meaningful.
+ ss << "rule " << name << " already exists";
+ err = 0;
+ goto reply_no_propose;
+ }
+
+ CrushWrapper newcrush = _get_pending_crush();
+
+ if (newcrush.rule_exists(name)) {
+ // The name is uniquely associated with a rule id and the rule it contains.
+ // From the user's point of view, the rule is more meaningful.
+ ss << "rule " << name << " already exists";
+ err = 0;
+ } else {
+ int ruleno = newcrush.add_simple_rule(
+ name, root, type, device_class,
+ "firstn", pg_pool_t::TYPE_REPLICATED, &ss);
+ if (ruleno < 0) {
+ err = ruleno;
+ goto reply_no_propose;
+ }
+
+ pending_inc.crush.clear();
+ newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+ }
+ getline(ss, rs);
+ wait_for_commit(op, new Monitor::C_Command(mon, op, 0, rs,
+ get_last_committed() + 1));
+ return true;
+
+ } else if (prefix == "osd erasure-code-profile rm") {
+ string name;
+ cmd_getval(cmdmap, "name", name);
+
+ if (erasure_code_profile_in_use(pending_inc.new_pools, name, &ss))
+ goto wait;
+
+ if (erasure_code_profile_in_use(osdmap.pools, name, &ss)) {
+ err = -EBUSY;
+ goto reply_no_propose;
+ }
+
+ if (osdmap.has_erasure_code_profile(name) ||
+ pending_inc.new_erasure_code_profiles.count(name)) {
+ if (osdmap.has_erasure_code_profile(name)) {
+ pending_inc.old_erasure_code_profiles.push_back(name);
+ } else {
+ dout(20) << "erasure code profile rm " << name << ": creation canceled" << dendl;
+ pending_inc.new_erasure_code_profiles.erase(name);
+ }
+
+ getline(ss, rs);
+ wait_for_commit(op, new Monitor::C_Command(mon, op, 0, rs,
+ get_last_committed() + 1));
+ return true;
+ } else {
+ ss << "erasure-code-profile " << name << " does not exist";
+ err = 0;
+ goto reply_no_propose;
+ }
+
+ } else if (prefix == "osd erasure-code-profile set") {
+ string name;
+ cmd_getval(cmdmap, "name", name);
+ vector<string> profile;
+ cmd_getval(cmdmap, "profile", profile);
+
+ bool force = false;
+ cmd_getval(cmdmap, "force", force);
+
+ map<string,string> profile_map;
+ err = parse_erasure_code_profile(profile, &profile_map, &ss);
+ if (err)
+ goto reply_no_propose;
+ if (auto found = profile_map.find("crush-failure-domain");
+ found != profile_map.end()) {
+ const auto& failure_domain = found->second;
+ int failure_domain_type = osdmap.crush->get_type_id(failure_domain);
+ if (failure_domain_type < 0) {
+ ss << "erasure-code-profile " << profile_map
+ << " contains an invalid failure-domain " << std::quoted(failure_domain);
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+ }
+
+ if (profile_map.find("plugin") == profile_map.end()) {
+ ss << "erasure-code-profile " << profile_map
+ << " must contain a plugin entry" << std::endl;
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+ string plugin = profile_map["plugin"];
+
+ if (pending_inc.has_erasure_code_profile(name)) {
+ dout(20) << "erasure code profile " << name << " try again" << dendl;
+ goto wait;
+ } else {
+ err = normalize_profile(name, profile_map, force, &ss);
+ if (err)
+ goto reply_no_propose;
+
+ if (osdmap.has_erasure_code_profile(name)) {
+ ErasureCodeProfile existing_profile_map =
+ osdmap.get_erasure_code_profile(name);
+ err = normalize_profile(name, existing_profile_map, force, &ss);
+ if (err)
+ goto reply_no_propose;
+
+ if (existing_profile_map == profile_map) {
+ err = 0;
+ goto reply_no_propose;
+ }
+ bool force_no_fake = false;
+ cmd_getval(cmdmap, "yes_i_really_mean_it", force_no_fake);
+ if (!force) {
+ err = -EPERM;
+ ss << "will not override erasure code profile " << name
+ << " because the existing profile "
+ << existing_profile_map
+ << " is different from the proposed profile "
+ << profile_map;
+ goto reply_no_propose;
+ } else if (!force_no_fake) {
+ err = -EPERM;
+ ss << "overriding erasure code profile can be DANGEROUS"
+ << "; add --yes-i-really-mean-it to do it anyway";
+ goto reply_no_propose;
+ }
+ }
+
+ dout(20) << "erasure code profile set " << name << "="
+ << profile_map << dendl;
+ pending_inc.set_erasure_code_profile(name, profile_map);
+ }
+
+ getline(ss, rs);
+ wait_for_commit(op, new Monitor::C_Command(mon, op, 0, rs,
+ get_last_committed() + 1));
+ return true;
+
+ } else if (prefix == "osd crush rule create-erasure") {
+ err = check_cluster_features(CEPH_FEATURE_CRUSH_V2, ss);
+ if (err == -EAGAIN)
+ goto wait;
+ if (err)
+ goto reply_no_propose;
+ string name, poolstr;
+ cmd_getval(cmdmap, "name", name);
+ string profile;
+ cmd_getval(cmdmap, "profile", profile);
+ if (profile == "")
+ profile = "default";
+ if (profile == "default") {
+ if (!osdmap.has_erasure_code_profile(profile)) {
+ if (pending_inc.has_erasure_code_profile(profile)) {
+ dout(20) << "erasure code profile " << profile << " already pending" << dendl;
+ goto wait;
+ }
+
+ map<string,string> profile_map;
+ err = osdmap.get_erasure_code_profile_default(cct,
+ profile_map,
+ &ss);
+ if (err)
+ goto reply_no_propose;
+ err = normalize_profile(name, profile_map, true, &ss);
+ if (err)
+ goto reply_no_propose;
+ dout(20) << "erasure code profile set " << profile << "="
+ << profile_map << dendl;
+ pending_inc.set_erasure_code_profile(profile, profile_map);
+ goto wait;
+ }
+ }
+
+ int rule;
+ err = crush_rule_create_erasure(name, profile, &rule, &ss);
+ if (err < 0) {
+ switch(err) {
+ case -EEXIST: // return immediately
+ ss << "rule " << name << " already exists";
+ err = 0;
+ goto reply_no_propose;
+ case -EALREADY: // wait for pending to be proposed
+ ss << "rule " << name << " already exists";
+ err = 0;
+ break;
+ default: // non recoverable error
+ goto reply_no_propose;
+ }
+ } else {
+ ss << "created rule " << name << " at " << rule;
+ }
+
+ getline(ss, rs);
+ wait_for_commit(op, new Monitor::C_Command(mon, op, 0, rs,
+ get_last_committed() + 1));
+ return true;
+
+ } else if (prefix == "osd crush rule rm") {
+ string name;
+ cmd_getval(cmdmap, "name", name);
+
+ if (!osdmap.crush->rule_exists(name)) {
+ ss << "rule " << name << " does not exist";
+ err = 0;
+ goto reply_no_propose;
+ }
+
+ CrushWrapper newcrush = _get_pending_crush();
+
+ if (!newcrush.rule_exists(name)) {
+ ss << "rule " << name << " does not exist";
+ err = 0;
+ } else {
+ int ruleno = newcrush.get_rule_id(name);
+ ceph_assert(ruleno >= 0);
+
+ // make sure it is not in use.
+ // FIXME: this is ok in some situations, but let's not bother with that
+ // complexity now.
+ if (osdmap.crush_rule_in_use(ruleno)) {
+ ss << "crush rule " << name << " (" << ruleno << ") is in use";
+ err = -EBUSY;
+ goto reply_no_propose;
+ }
+
+ err = newcrush.remove_rule(ruleno);
+ if (err < 0) {
+ goto reply_no_propose;
+ }
+
+ pending_inc.crush.clear();
+ newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+ }
+ getline(ss, rs);
+ wait_for_commit(op, new Monitor::C_Command(mon, op, 0, rs,
+ get_last_committed() + 1));
+ return true;
+
+ } else if (prefix == "osd crush rule rename") {
+ string srcname;
+ string dstname;
+ cmd_getval(cmdmap, "srcname", srcname);
+ cmd_getval(cmdmap, "dstname", dstname);
+ if (srcname.empty() || dstname.empty()) {
+ ss << "must specify both source rule name and destination rule name";
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+ if (srcname == dstname) {
+ ss << "destination rule name is equal to source rule name";
+ err = 0;
+ goto reply_no_propose;
+ }
+
+ CrushWrapper newcrush = _get_pending_crush();
+ if (!newcrush.rule_exists(srcname) && newcrush.rule_exists(dstname)) {
+ // srcname does not exist and dstname already exists
+ // suppose this is a replay and return success
+ // (so this command is idempotent)
+ ss << "already renamed to '" << dstname << "'";
+ err = 0;
+ goto reply_no_propose;
+ }
+
+ err = newcrush.rename_rule(srcname, dstname, &ss);
+ if (err < 0) {
+ // ss has reason for failure
+ goto reply_no_propose;
+ }
+ pending_inc.crush.clear();
+ newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+ getline(ss, rs);
+ wait_for_commit(op, new Monitor::C_Command(mon, op, 0, rs,
+ get_last_committed() + 1));
+ return true;
+
+ } else if (prefix == "osd setmaxosd") {
+ int64_t newmax;
+ if (!cmd_getval(cmdmap, "newmax", newmax)) {
+ ss << "unable to parse 'newmax' value '"
+ << cmd_vartype_stringify(cmdmap.at("newmax")) << "'";
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+
+ if (newmax > g_conf()->mon_max_osd) {
+ err = -ERANGE;
+ ss << "cannot set max_osd to " << newmax << " which is > conf.mon_max_osd ("
+ << g_conf()->mon_max_osd << ")";
+ goto reply_no_propose;
+ }
+
+ // Don't allow shrinking OSD number as this will cause data loss
+ // and may cause kernel crashes.
+ // Note: setmaxosd sets the maximum OSD number and not the number of OSDs
+ if (newmax < osdmap.get_max_osd()) {
+ // Check whether any OSDs exist between the new value and the current max.
+ // If any OSDs exist in that range, don't allow shrinking the number
+ // of OSDs.
+ for (int i = newmax; i < osdmap.get_max_osd(); i++) {
+ if (osdmap.exists(i)) {
+ err = -EBUSY;
+ ss << "cannot shrink max_osd to " << newmax
+ << " because osd." << i << " (and possibly others) still in use";
+ goto reply_no_propose;
+ }
+ }
+ }
+
+ pending_inc.new_max_osd = newmax;
+ ss << "set new max_osd = " << pending_inc.new_max_osd;
+ getline(ss, rs);
+ wait_for_commit(op, new Monitor::C_Command(mon, op, 0, rs,
+ get_last_committed() + 1));
+ return true;
+
+ } else if (prefix == "osd set-full-ratio" ||
+ prefix == "osd set-backfillfull-ratio" ||
+ prefix == "osd set-nearfull-ratio") {
+ double n;
+ if (!cmd_getval(cmdmap, "ratio", n)) {
+ ss << "unable to parse 'ratio' value '"
+ << cmd_vartype_stringify(cmdmap.at("ratio")) << "'";
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+ if (prefix == "osd set-full-ratio")
+ pending_inc.new_full_ratio = n;
+ else if (prefix == "osd set-backfillfull-ratio")
+ pending_inc.new_backfillfull_ratio = n;
+ else if (prefix == "osd set-nearfull-ratio")
+ pending_inc.new_nearfull_ratio = n;
+ ss << prefix << " " << n;
+ getline(ss, rs);
+ wait_for_commit(op, new Monitor::C_Command(mon, op, 0, rs,
+ get_last_committed() + 1));
+ return true;
+ } else if (prefix == "osd set-require-min-compat-client") {
+ string v;
+ cmd_getval(cmdmap, "version", v);
+ ceph_release_t vno = ceph_release_from_name(v);
+ if (!vno) {
+ ss << "version " << v << " is not recognized";
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+ OSDMap newmap;
+ newmap.deepish_copy_from(osdmap);
+ newmap.apply_incremental(pending_inc);
+ newmap.require_min_compat_client = vno;
+ auto mvno = newmap.get_min_compat_client();
+ if (vno < mvno) {
+ ss << "osdmap current utilizes features that require " << mvno
+ << "; cannot set require_min_compat_client below that to " << vno;
+ err = -EPERM;
+ goto reply_no_propose;
+ }
+ bool sure = false;
+ cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
+ if (!sure) {
+ FeatureMap m;
+ mon.get_combined_feature_map(&m);
+ uint64_t features = ceph_release_features(to_integer<int>(vno));
+ bool first = true;
+ bool ok = true;
+ for (int type : {
+ CEPH_ENTITY_TYPE_CLIENT,
+ CEPH_ENTITY_TYPE_MDS,
+ CEPH_ENTITY_TYPE_MGR }) {
+ auto p = m.m.find(type);
+ if (p == m.m.end()) {
+ continue;
+ }
+ for (auto& q : p->second) {
+ uint64_t missing = ~q.first & features;
+ if (missing) {
+ if (first) {
+ ss << "cannot set require_min_compat_client to " << v << ": ";
+ } else {
+ ss << "; ";
+ }
+ first = false;
+ ss << q.second << " connected " << ceph_entity_type_name(type)
+ << "(s) look like " << ceph_release_name(
+ ceph_release_from_features(q.first))
+ << " (missing 0x" << std::hex << missing << std::dec << ")";
+ ok = false;
+ }
+ }
+ }
+ if (!ok) {
+ ss << "; add --yes-i-really-mean-it to do it anyway";
+ err = -EPERM;
+ goto reply_no_propose;
+ }
+ }
+ ss << "set require_min_compat_client to " << vno;
+ pending_inc.new_require_min_compat_client = vno;
+ getline(ss, rs);
+ wait_for_commit(op, new Monitor::C_Command(mon, op, 0, rs,
+ get_last_committed() + 1));
+ return true;
+ } else if (prefix == "osd pause") {
+ return prepare_set_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
+
+ } else if (prefix == "osd unpause") {
+ return prepare_unset_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
+
+ } else if (prefix == "osd set") {
+ bool sure = false;
+ cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
+
+ string key;
+ cmd_getval(cmdmap, "key", key);
+ if (key == "pause")
+ return prepare_set_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
+ else if (key == "noup")
+ return prepare_set_flag(op, CEPH_OSDMAP_NOUP);
+ else if (key == "nodown")
+ return prepare_set_flag(op, CEPH_OSDMAP_NODOWN);
+ else if (key == "noout")
+ return prepare_set_flag(op, CEPH_OSDMAP_NOOUT);
+ else if (key == "noin")
+ return prepare_set_flag(op, CEPH_OSDMAP_NOIN);
+ else if (key == "nobackfill")
+ return prepare_set_flag(op, CEPH_OSDMAP_NOBACKFILL);
+ else if (key == "norebalance")
+ return prepare_set_flag(op, CEPH_OSDMAP_NOREBALANCE);
+ else if (key == "norecover")
+ return prepare_set_flag(op, CEPH_OSDMAP_NORECOVER);
+ else if (key == "noscrub")
+ return prepare_set_flag(op, CEPH_OSDMAP_NOSCRUB);
+ else if (key == "nodeep-scrub")
+ return prepare_set_flag(op, CEPH_OSDMAP_NODEEP_SCRUB);
+ else if (key == "notieragent")
+ return prepare_set_flag(op, CEPH_OSDMAP_NOTIERAGENT);
+ else if (key == "nosnaptrim")
+ return prepare_set_flag(op, CEPH_OSDMAP_NOSNAPTRIM);
+ else if (key == "pglog_hardlimit") {
+ if (!osdmap.get_num_up_osds() && !sure) {
+ ss << "Not advisable to continue since no OSDs are up. Pass "
+ << "--yes-i-really-mean-it if you really wish to continue.";
+ err = -EPERM;
+ goto reply_no_propose;
+ }
+ // The release check here is required because for OSD_PGLOG_HARDLIMIT,
+ // we are reusing a jewel feature bit that was retired in luminous.
+ if (osdmap.require_osd_release >= ceph_release_t::luminous &&
+ (HAVE_FEATURE(osdmap.get_up_osd_features(), OSD_PGLOG_HARDLIMIT)
+ || sure)) {
+ return prepare_set_flag(op, CEPH_OSDMAP_PGLOG_HARDLIMIT);
+ } else {
+ ss << "not all up OSDs have OSD_PGLOG_HARDLIMIT feature";
+ err = -EPERM;
+ goto reply_no_propose;
+ }
+ } else if (key == "noautoscale") {
+ return prepare_set_flag(op, CEPH_OSDMAP_NOAUTOSCALE);
+ } else {
+ ss << "unrecognized flag '" << key << "'";
+ err = -EINVAL;
+ }
+
+ } else if (prefix == "osd unset") {
+ string key;
+ cmd_getval(cmdmap, "key", key);
+ if (key == "pause")
+ return prepare_unset_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
+ else if (key == "noup")
+ return prepare_unset_flag(op, CEPH_OSDMAP_NOUP);
+ else if (key == "nodown")
+ return prepare_unset_flag(op, CEPH_OSDMAP_NODOWN);
+ else if (key == "noout")
+ return prepare_unset_flag(op, CEPH_OSDMAP_NOOUT);
+ else if (key == "noin")
+ return prepare_unset_flag(op, CEPH_OSDMAP_NOIN);
+ else if (key == "nobackfill")
+ return prepare_unset_flag(op, CEPH_OSDMAP_NOBACKFILL);
+ else if (key == "norebalance")
+ return prepare_unset_flag(op, CEPH_OSDMAP_NOREBALANCE);
+ else if (key == "norecover")
+ return prepare_unset_flag(op, CEPH_OSDMAP_NORECOVER);
+ else if (key == "noscrub")
+ return prepare_unset_flag(op, CEPH_OSDMAP_NOSCRUB);
+ else if (key == "nodeep-scrub")
+ return prepare_unset_flag(op, CEPH_OSDMAP_NODEEP_SCRUB);
+ else if (key == "notieragent")
+ return prepare_unset_flag(op, CEPH_OSDMAP_NOTIERAGENT);
+ else if (key == "nosnaptrim")
+ return prepare_unset_flag(op, CEPH_OSDMAP_NOSNAPTRIM);
+ else if (key == "noautoscale")
+ return prepare_unset_flag(op, CEPH_OSDMAP_NOAUTOSCALE);
+ else {
+ ss << "unrecognized flag '" << key << "'";
+ err = -EINVAL;
+ }
+
+ } else if (prefix == "osd require-osd-release") {
+ string release;
+ cmd_getval(cmdmap, "release", release);
+ bool sure = false;
+ cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
+ ceph_release_t rel = ceph_release_from_name(release.c_str());
+ if (!rel) {
+ ss << "unrecognized release " << release;
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+ if (rel == osdmap.require_osd_release) {
+ // idempotent
+ err = 0;
+ goto reply_no_propose;
+ }
+ if (osdmap.require_osd_release < ceph_release_t::quincy && !sure) {
+ ss << "Not advisable to continue since current 'require_osd_release' "
+ << "refers to a very old Ceph release. Pass "
+ << "--yes-i-really-mean-it if you really wish to continue.";
+ err = -EPERM;
+ goto reply_no_propose;
+ }
+ if (!osdmap.get_num_up_osds() && !sure) {
+ ss << "Not advisable to continue since no OSDs are up. Pass "
+ << "--yes-i-really-mean-it if you really wish to continue.";
+ err = -EPERM;
+ goto reply_no_propose;
+ }
+ if (rel == ceph_release_t::quincy) {
+ if (!mon.monmap->get_required_features().contains_all(
+ ceph::features::mon::FEATURE_QUINCY)) {
+ ss << "not all mons are quincy";
+ err = -EPERM;
+ goto reply_no_propose;
+ }
+ if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_QUINCY))
+ && !sure) {
+ ss << "not all up OSDs have CEPH_FEATURE_SERVER_QUINCY feature";
+ err = -EPERM;
+ goto reply_no_propose;
+ }
+ } else if (rel == ceph_release_t::reef) {
+ if (!mon.monmap->get_required_features().contains_all(
+ ceph::features::mon::FEATURE_REEF)) {
+ ss << "not all mons are reef";
+ err = -EPERM;
+ goto reply_no_propose;
+ }
+ if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_REEF))
+ && !sure) {
+ ss << "not all up OSDs have CEPH_FEATURE_SERVER_REEF feature";
+ err = -EPERM;
+ goto reply_no_propose;
+ }
+ } else if (rel == ceph_release_t::squid) {
+ if (!mon.monmap->get_required_features().contains_all(
+ ceph::features::mon::FEATURE_SQUID)) {
+ ss << "not all mons are squid";
+ err = -EPERM;
+ goto reply_no_propose;
+ }
+ if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_SQUID))
+ && !sure) {
+ ss << "not all up OSDs have CEPH_FEATURE_SERVER_SQUID feature";
+ err = -EPERM;
+ goto reply_no_propose;
+ }
+ } else {
+ ss << "not supported for this release";
+ err = -EPERM;
+ goto reply_no_propose;
+ }
+ if (rel < osdmap.require_osd_release) {
+ ss << "require_osd_release cannot be lowered once it has been set";
+ err = -EPERM;
+ goto reply_no_propose;
+ }
+ pending_inc.new_require_osd_release = rel;
+ goto update;
+ } else if (prefix == "osd down" ||
+ prefix == "osd out" ||
+ prefix == "osd in" ||
+ prefix == "osd rm" ||
+ prefix == "osd stop") {
+
+ bool any = false;
+ bool stop = false;
+ bool verbose = true;
+ bool definitely_dead = false;
+
+ vector<string> idvec;
+ cmd_getval(cmdmap, "ids", idvec);
+ cmd_getval(cmdmap, "definitely_dead", definitely_dead);
+ derr << "definitely_dead " << (int)definitely_dead << dendl;
+ for (unsigned j = 0; j < idvec.size() && !stop; j++) {
+ set<int> osds;
+
+ // wildcard?
+ if (j == 0 &&
+ (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
+ if (prefix == "osd in") {
+ // touch out osds only
+ osdmap.get_out_existing_osds(osds);
+ } else {
+ osdmap.get_all_osds(osds);
+ }
+ stop = true;
+ verbose = false; // so the output is less noisy.
+ } else {
+ long osd = parse_osd_id(idvec[j].c_str(), &ss);
+ if (osd < 0) {
+ ss << "invalid osd id" << osd;
+ err = -EINVAL;
+ continue;
+ } else if (!osdmap.exists(osd)) {
+ ss << "osd." << osd << " does not exist. ";
+ continue;
+ }
+
+ osds.insert(osd);
+ }
+
+ for (auto &osd : osds) {
+ if (prefix == "osd down") {
+ if (osdmap.is_down(osd)) {
+ if (verbose)
+ ss << "osd." << osd << " is already down. ";
+ } else {
+ pending_inc.pending_osd_state_set(osd, CEPH_OSD_UP);
+ ss << "marked down osd." << osd << ". ";
+ any = true;
+ }
+ if (definitely_dead) {
+ if (!pending_inc.new_xinfo.count(osd)) {
+ pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
+ }
+ if (pending_inc.new_xinfo[osd].dead_epoch < pending_inc.epoch) {
+ any = true;
+ }
+ pending_inc.new_xinfo[osd].dead_epoch = pending_inc.epoch;
+ }
+ } else if (prefix == "osd out") {
+ if (osdmap.is_out(osd)) {
+ if (verbose)
+ ss << "osd." << osd << " is already out. ";
+ } else {
+ pending_inc.new_weight[osd] = CEPH_OSD_OUT;
+ if (osdmap.osd_weight[osd]) {
+ if (pending_inc.new_xinfo.count(osd) == 0) {
+ pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
+ }
+ pending_inc.new_xinfo[osd].old_weight = osdmap.osd_weight[osd];
+ }
+ ss << "marked out osd." << osd << ". ";
+ std::ostringstream msg;
+ msg << "Client " << op->get_session()->entity_name
+ << " marked osd." << osd << " out";
+ if (osdmap.is_up(osd)) {
+ msg << ", while it was still marked up";
+ } else {
+ auto period = ceph_clock_now() - down_pending_out[osd];
+ msg << ", after it was down for " << int(period.sec())
+ << " seconds";
+ }
+
+ mon.clog->info() << msg.str();
+ any = true;
+ }
+ } else if (prefix == "osd in") {
+ if (osdmap.is_in(osd)) {
+ if (verbose)
+ ss << "osd." << osd << " is already in. ";
+ } else {
+ if (osdmap.osd_xinfo[osd].old_weight > 0) {
+ pending_inc.new_weight[osd] = osdmap.osd_xinfo[osd].old_weight;
+ if (pending_inc.new_xinfo.count(osd) == 0) {
+ pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
+ }
+ pending_inc.new_xinfo[osd].old_weight = 0;
+ } else {
+ pending_inc.new_weight[osd] = CEPH_OSD_IN;
+ }
+ ss << "marked in osd." << osd << ". ";
+ any = true;
+ }
+ } else if (prefix == "osd rm") {
+ err = prepare_command_osd_remove(osd);
+
+ if (err == -EBUSY) {
+ if (any)
+ ss << ", ";
+ ss << "osd." << osd << " is still up; must be down before removal. ";
+ } else {
+ ceph_assert(err == 0);
+ if (any) {
+ ss << ", osd." << osd;
+ } else {
+ ss << "removed osd." << osd;
+ }
+ any = true;
+ }
+ } else if (prefix == "osd stop") {
+ if (osdmap.is_stop(osd)) {
+ if (verbose)
+ ss << "osd." << osd << " is already stopped. ";
+ } else if (osdmap.is_down(osd)) {
+ pending_inc.pending_osd_state_set(osd, CEPH_OSD_STOP);
+ ss << "stop down osd." << osd << ". ";
+ any = true;
+ } else {
+ pending_inc.pending_osd_state_set(osd, CEPH_OSD_UP | CEPH_OSD_STOP);
+ ss << "stop osd." << osd << ". ";
+ any = true;
+ }
+ }
+ }
+ }
+ if (any) {
+ getline(ss, rs);
+ wait_for_commit(op, new Monitor::C_Command(mon, op, err, rs,
+ get_last_committed() + 1));
+ return true;
+ }
+ } else if (prefix == "osd set-group" ||
+ prefix == "osd unset-group" ||
+ prefix == "osd add-noup" ||
+ prefix == "osd add-nodown" ||
+ prefix == "osd add-noin" ||
+ prefix == "osd add-noout" ||
+ prefix == "osd rm-noup" ||
+ prefix == "osd rm-nodown" ||
+ prefix == "osd rm-noin" ||
+ prefix == "osd rm-noout") {
+ bool do_set = prefix == "osd set-group" ||
+ prefix.find("add") != string::npos;
+ string flag_str;
+ unsigned flags = 0;
+ vector<string> who;
+ if (prefix == "osd set-group" || prefix == "osd unset-group") {
+ cmd_getval(cmdmap, "flags", flag_str);
+ cmd_getval(cmdmap, "who", who);
+ vector<string> raw_flags;
+ boost::split(raw_flags, flag_str, boost::is_any_of(","));
+ for (auto& f : raw_flags) {
+ if (f == "noup")
+ flags |= CEPH_OSD_NOUP;
+ else if (f == "nodown")
+ flags |= CEPH_OSD_NODOWN;
+ else if (f == "noin")
+ flags |= CEPH_OSD_NOIN;
+ else if (f == "noout")
+ flags |= CEPH_OSD_NOOUT;
+ else {
+ ss << "unrecognized flag '" << f << "', must be one of "
+ << "{noup,nodown,noin,noout}";
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+ }
+ } else {
+ cmd_getval(cmdmap, "ids", who);
+ if (prefix.find("noup") != string::npos)
+ flags = CEPH_OSD_NOUP;
+ else if (prefix.find("nodown") != string::npos)
+ flags = CEPH_OSD_NODOWN;
+ else if (prefix.find("noin") != string::npos)
+ flags = CEPH_OSD_NOIN;
+ else if (prefix.find("noout") != string::npos)
+ flags = CEPH_OSD_NOOUT;
+ else
+ ceph_assert(0 == "Unreachable!");
+ }
+ if (flags == 0) {
+ ss << "must specify flag(s) {noup,nodwon,noin,noout} to set/unset";
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+ if (who.empty()) {
+ ss << "must specify at least one or more targets to set/unset";
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+ set<int> osds;
+ set<int> crush_nodes;
+ set<int> device_classes;
+ for (auto& w : who) {
+ if (w == "any" || w == "all" || w == "*") {
+ osdmap.get_all_osds(osds);
+ break;
+ }
+ std::stringstream ts;
+ if (auto osd = parse_osd_id(w.c_str(), &ts); osd >= 0) {
+ osds.insert(osd);
+ } else if (osdmap.crush->name_exists(w)) {
+ crush_nodes.insert(osdmap.crush->get_item_id(w));
+ } else if (osdmap.crush->class_exists(w)) {
+ device_classes.insert(osdmap.crush->get_class_id(w));
+ } else {
+ ss << "unable to parse osd id or crush node or device class: "
+ << "\"" << w << "\". ";
+ }
+ }
+ if (osds.empty() && crush_nodes.empty() && device_classes.empty()) {
+ // ss has reason for failure
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+ bool any = false;
+ for (auto osd : osds) {
+ if (!osdmap.exists(osd)) {
+ ss << "osd." << osd << " does not exist. ";
+ continue;
+ }
+ if (do_set) {
+ if (flags & CEPH_OSD_NOUP) {
+ any |= osdmap.is_noup_by_osd(osd) ?
+ pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOUP) :
+ pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOUP);
+ }
+ if (flags & CEPH_OSD_NODOWN) {
+ any |= osdmap.is_nodown_by_osd(osd) ?
+ pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NODOWN) :
+ pending_inc.pending_osd_state_set(osd, CEPH_OSD_NODOWN);
+ }
+ if (flags & CEPH_OSD_NOIN) {
+ any |= osdmap.is_noin_by_osd(osd) ?
+ pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOIN) :
+ pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOIN);
+ }
+ if (flags & CEPH_OSD_NOOUT) {
+ any |= osdmap.is_noout_by_osd(osd) ?
+ pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOOUT) :
+ pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOOUT);
+ }
+ } else {
+ if (flags & CEPH_OSD_NOUP) {
+ any |= osdmap.is_noup_by_osd(osd) ?
+ pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOUP) :
+ pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOUP);
+ }
+ if (flags & CEPH_OSD_NODOWN) {
+ any |= osdmap.is_nodown_by_osd(osd) ?
+ pending_inc.pending_osd_state_set(osd, CEPH_OSD_NODOWN) :
+ pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NODOWN);
+ }
+ if (flags & CEPH_OSD_NOIN) {
+ any |= osdmap.is_noin_by_osd(osd) ?
+ pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOIN) :
+ pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOIN);
+ }
+ if (flags & CEPH_OSD_NOOUT) {
+ any |= osdmap.is_noout_by_osd(osd) ?
+ pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOOUT) :
+ pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOOUT);
+ }
+ }
+ }
+ for (auto& id : crush_nodes) {
+ auto old_flags = osdmap.get_crush_node_flags(id);
+ auto& pending_flags = pending_inc.new_crush_node_flags[id];
+ pending_flags |= old_flags; // adopt existing flags first!
+ if (do_set) {
+ pending_flags |= flags;
+ } else {
+ pending_flags &= ~flags;
+ }
+ any = true;
+ }
+ for (auto& id : device_classes) {
+ auto old_flags = osdmap.get_device_class_flags(id);
+ auto& pending_flags = pending_inc.new_device_class_flags[id];
+ pending_flags |= old_flags;
+ if (do_set) {
+ pending_flags |= flags;
+ } else {
+ pending_flags &= ~flags;
+ }
+ any = true;
+ }
+ if (any) {
+ getline(ss, rs);
+ wait_for_commit(op, new Monitor::C_Command(mon, op, err, rs,
+ get_last_committed() + 1));
+ return true;
+ }
+ } else if (prefix == "osd pg-temp") {
+ pg_t pgid;
+ err = parse_pgid(cmdmap, ss, pgid);
+ if (err < 0)
+ goto reply_no_propose;
+ if (pending_inc.new_pg_temp.count(pgid)) {
+ dout(10) << __func__ << " waiting for pending update on " << pgid << dendl;
+ goto wait;
+ }
+
+ vector<int64_t> id_vec;
+ vector<int32_t> new_pg_temp;
+ cmd_getval(cmdmap, "id", id_vec);
+ if (id_vec.empty()) {
+ pending_inc.new_pg_temp[pgid] = mempool::osdmap::vector<int>();
+ ss << "done cleaning up pg_temp of " << pgid;
+ goto update;
+ }
+ for (auto osd : id_vec) {
+ if (!osdmap.exists(osd)) {
+ ss << "osd." << osd << " does not exist";
+ err = -ENOENT;
+ goto reply_no_propose;
+ }
+ new_pg_temp.push_back(osd);
+ }
+
+ int pool_min_size = osdmap.get_pg_pool_min_size(pgid);
+ if ((int)new_pg_temp.size() < pool_min_size) {
+ ss << "num of osds (" << new_pg_temp.size() <<") < pool min size ("
+ << pool_min_size << ")";
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+
+ int pool_size = osdmap.get_pg_pool_size(pgid);
+ if ((int)new_pg_temp.size() > pool_size) {
+ ss << "num of osds (" << new_pg_temp.size() <<") > pool size ("
+ << pool_size << ")";
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+
+ pending_inc.new_pg_temp[pgid] = mempool::osdmap::vector<int>(
+ new_pg_temp.begin(), new_pg_temp.end());
+ ss << "set " << pgid << " pg_temp mapping to " << new_pg_temp;
+ goto update;
+ } else if (prefix == "osd primary-temp" ||
+ prefix == "osd rm-primary-temp") {
+ pg_t pgid;
+ err = parse_pgid(cmdmap, ss, pgid);
+ if (err < 0)
+ goto reply_no_propose;
+
+ int64_t osd;
+ if (prefix == "osd primary-temp") {
+ if (!cmd_getval(cmdmap, "id", osd)) {
+ ss << "unable to parse 'id' value '"
+ << cmd_vartype_stringify(cmdmap.at("id")) << "'";
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+ if (!osdmap.exists(osd)) {
+ ss << "osd." << osd << " does not exist";
+ err = -ENOENT;
+ goto reply_no_propose;
+ }
+ }
+ else if (prefix == "osd rm-primary-temp") {
+ osd = -1;
+ }
+ else {
+ ceph_assert(0 == "Unreachable!");
+ }
+
+ if (osdmap.require_min_compat_client != ceph_release_t::unknown &&
+ osdmap.require_min_compat_client < ceph_release_t::firefly) {
+ ss << "require_min_compat_client "
+ << osdmap.require_min_compat_client
+ << " < firefly, which is required for primary-temp";
+ err = -EPERM;
+ goto reply_no_propose;
+ }
+
+ pending_inc.new_primary_temp[pgid] = osd;
+ ss << "set " << pgid << " primary_temp mapping to " << osd;
+ goto update;
+ } else if (prefix == "pg repeer") {
+ pg_t pgid;
+ err = parse_pgid(cmdmap, ss, pgid);
+ if (err < 0)
+ goto reply_no_propose;
+ vector<int> acting;
+ int primary;
+ osdmap.pg_to_acting_osds(pgid, &acting, &primary);
+ if (primary < 0) {
+ err = -EAGAIN;
+ ss << "pg currently has no primary";
+ goto reply_no_propose;
+ }
+ if (acting.size() > 1) {
+ // map to just primary; it will map back to what it wants
+ pending_inc.new_pg_temp[pgid] = { primary };
+ } else {
+ // hmm, pick another arbitrary osd to induce a change. Note
+ // that this won't work if there is only one suitable OSD in the cluster.
+ int i;
+ bool done = false;
+ for (i = 0; i < osdmap.get_max_osd(); ++i) {
+ if (i == primary || !osdmap.is_up(i) || !osdmap.exists(i)) {
+ continue;
+ }
+ pending_inc.new_pg_temp[pgid] = { primary, i };
+ done = true;
+ break;
+ }
+ if (!done) {
+ err = -EAGAIN;
+ ss << "not enough up OSDs in the cluster to force repeer";
+ goto reply_no_propose;
+ }
+ }
+ goto update;
+ } else if (prefix == "osd pg-upmap" ||
+ prefix == "osd rm-pg-upmap" ||
+ prefix == "osd pg-upmap-items" ||
+ prefix == "osd rm-pg-upmap-items" ||
+ prefix == "osd pg-upmap-primary" ||
+ prefix == "osd rm-pg-upmap-primary") {
+ enum {
+ OP_PG_UPMAP,
+ OP_RM_PG_UPMAP,
+ OP_PG_UPMAP_ITEMS,
+ OP_RM_PG_UPMAP_ITEMS,
+ OP_PG_UPMAP_PRIMARY,
+ OP_RM_PG_UPMAP_PRIMARY,
+ } upmap_option;
+
+ if (prefix == "osd pg-upmap") {
+ upmap_option = OP_PG_UPMAP;
+ } else if (prefix == "osd rm-pg-upmap") {
+ upmap_option = OP_RM_PG_UPMAP;
+ } else if (prefix == "osd pg-upmap-items") {
+ upmap_option = OP_PG_UPMAP_ITEMS;
+ } else if (prefix == "osd rm-pg-upmap-items") {
+ upmap_option = OP_RM_PG_UPMAP_ITEMS;
+ } else if (prefix == "osd pg-upmap-primary") {
+ upmap_option = OP_PG_UPMAP_PRIMARY;
+ } else if (prefix == "osd rm-pg-upmap-primary") {
+ upmap_option = OP_RM_PG_UPMAP_PRIMARY;
+ } else {
+ ceph_abort_msg("invalid upmap option");
+ }
+
+ ceph_release_t min_release = ceph_release_t::unknown;
+ string feature_name = "unknown";
+ uint64_t min_feature = CEPH_FEATURES_ALL; // paranoia
+ switch (upmap_option) {
+ case OP_PG_UPMAP: // fall through
+ case OP_RM_PG_UPMAP: // fall through
+ case OP_PG_UPMAP_ITEMS: // fall through
+ case OP_RM_PG_UPMAP_ITEMS:
+ min_release = ceph_release_t::luminous;
+ min_feature = CEPH_FEATUREMASK_OSDMAP_PG_UPMAP;
+ feature_name = "pg-upmap";
+ break;
+
+ case OP_PG_UPMAP_PRIMARY: // fall through
+ case OP_RM_PG_UPMAP_PRIMARY:
+ min_release = ceph_release_t::reef;
+ min_feature = CEPH_FEATUREMASK_SERVER_REEF;
+ feature_name = "pg-upmap-primary";
+ break;
+
+ default:
+ ceph_abort_msg("invalid upmap option");
+ }
+ string min_release_name = ceph_release_name(static_cast<int>(min_release));
+
+ if (osdmap.require_min_compat_client < min_release) {
+ ss << "min_compat_client "
+ << osdmap.require_min_compat_client
+ << " < " << min_release_name << ", which is required for " << feature_name << ". "
+ << "Try 'ceph osd set-require-min-compat-client " << min_release_name << "' "
+ << "before using the new interface";
+ err = -EPERM;
+ goto reply_no_propose;
+ }
+
+ err = check_cluster_features(min_feature, ss);
+ if (err == -EAGAIN)
+ goto wait;
+ if (err < 0)
+ goto reply_no_propose;
+ pg_t pgid;
+ err = parse_pgid(cmdmap, ss, pgid);
+ if (err < 0)
+ goto reply_no_propose;
+ if (pending_inc.old_pools.count(pgid.pool())) {
+ ss << "pool of " << pgid << " is pending removal";
+ err = -ENOENT;
+ getline(ss, rs);
+ wait_for_commit(op,
+ new Monitor::C_Command(mon, op, err, rs, get_last_committed() + 1));
+ return true;
+ }
+
+ // check pending upmap changes
+ switch (upmap_option) {
+ case OP_PG_UPMAP: // fall through
+ case OP_RM_PG_UPMAP:
+ if (pending_inc.new_pg_upmap.count(pgid) ||
+ pending_inc.old_pg_upmap.count(pgid)) {
+ dout(10) << __func__ << " waiting for pending update on "
+ << pgid << dendl;
+ goto wait;
+ }
+ break;
+
+ case OP_PG_UPMAP_PRIMARY: // fall through
+ case OP_RM_PG_UPMAP_PRIMARY:
+ {
+ const pg_pool_t *pt = osdmap.get_pg_pool(pgid.pool());
+ if (! pt->is_replicated()) {
+ ss << "pg-upmap-primary is only supported for replicated pools";
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+ }
+ // fall through
+ case OP_PG_UPMAP_ITEMS: // fall through
+ case OP_RM_PG_UPMAP_ITEMS: // fall through
+ if (pending_inc.new_pg_upmap_items.count(pgid) ||
+ pending_inc.old_pg_upmap_items.count(pgid)) {
+ dout(10) << __func__ << " waiting for pending update on "
+ << pgid << dendl;
+ goto wait;
+ }
+ break;
+
+ default:
+ ceph_abort_msg("invalid upmap option");
+ }
+
+ switch (upmap_option) {
+ case OP_PG_UPMAP:
+ {
+ vector<int64_t> id_vec;
+ if (!cmd_getval(cmdmap, "id", id_vec)) {
+ ss << "unable to parse 'id' value(s) '"
+ << cmd_vartype_stringify(cmdmap.at("id")) << "'";
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+
+ int pool_min_size = osdmap.get_pg_pool_min_size(pgid);
+ if ((int)id_vec.size() < pool_min_size) {
+ ss << "num of osds (" << id_vec.size() <<") < pool min size ("
+ << pool_min_size << ")";
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+
+ int pool_size = osdmap.get_pg_pool_size(pgid);
+ if ((int)id_vec.size() > pool_size) {
+ ss << "num of osds (" << id_vec.size() <<") > pool size ("
+ << pool_size << ")";
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+
+ vector<int32_t> new_pg_upmap;
+ for (auto osd : id_vec) {
+ if (osd != CRUSH_ITEM_NONE && !osdmap.exists(osd)) {
+ ss << "osd." << osd << " does not exist";
+ err = -ENOENT;
+ goto reply_no_propose;
+ }
+ auto it = std::find(new_pg_upmap.begin(), new_pg_upmap.end(), osd);
+ if (it != new_pg_upmap.end()) {
+ ss << "osd." << osd << " already exists, ";
+ continue;
+ }
+ new_pg_upmap.push_back(osd);
+ }
+
+ if (new_pg_upmap.empty()) {
+ ss << "no valid upmap items(pairs) is specified";
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+
+ pending_inc.new_pg_upmap[pgid] = mempool::osdmap::vector<int32_t>(
+ new_pg_upmap.begin(), new_pg_upmap.end());
+ ss << "set " << pgid << " pg_upmap mapping to " << new_pg_upmap;
+ }
+ break;
+
+ case OP_RM_PG_UPMAP:
+ {
+ pending_inc.old_pg_upmap.insert(pgid);
+ ss << "clear " << pgid << " pg_upmap mapping";
+ }
+ break;
+
+ case OP_PG_UPMAP_ITEMS:
+ {
+ vector<int64_t> id_vec;
+ if (!cmd_getval(cmdmap, "id", id_vec)) {
+ ss << "unable to parse 'id' value(s) '"
+ << cmd_vartype_stringify(cmdmap.at("id")) << "'";
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+
+ if (id_vec.size() % 2) {
+ ss << "you must specify pairs of osd ids to be remapped";
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+
+ int pool_size = osdmap.get_pg_pool_size(pgid);
+ if ((int)(id_vec.size() / 2) > pool_size) {
+ ss << "num of osd pairs (" << id_vec.size() / 2 <<") > pool size ("
+ << pool_size << ")";
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+
+ vector<pair<int32_t,int32_t>> new_pg_upmap_items;
+ ostringstream items;
+ items << "[";
+ for (auto p = id_vec.begin(); p != id_vec.end(); ++p) {
+ int from = *p++;
+ int to = *p;
+ if (from == to) {
+ ss << "from osd." << from << " == to osd." << to << ", ";
+ continue;
+ }
+ if (!osdmap.exists(from)) {
+ ss << "osd." << from << " does not exist";
+ err = -ENOENT;
+ goto reply_no_propose;
+ }
+ if (to != CRUSH_ITEM_NONE && !osdmap.exists(to)) {
+ ss << "osd." << to << " does not exist";
+ err = -ENOENT;
+ goto reply_no_propose;
+ }
+ pair<int32_t,int32_t> entry = make_pair(from, to);
+ auto it = std::find(new_pg_upmap_items.begin(),
+ new_pg_upmap_items.end(), entry);
+ if (it != new_pg_upmap_items.end()) {
+ ss << "osd." << from << " -> osd." << to << " already exists, ";
+ continue;
+ }
+ new_pg_upmap_items.push_back(entry);
+ items << from << "->" << to << ",";
+ }
+ string out(items.str());
+ out.resize(out.size() - 1); // drop last ','
+ out += "]";
+
+ if (new_pg_upmap_items.empty()) {
+ ss << "no valid upmap items(pairs) is specified";
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+
+ pending_inc.new_pg_upmap_items[pgid] =
+ mempool::osdmap::vector<pair<int32_t,int32_t>>(
+ new_pg_upmap_items.begin(), new_pg_upmap_items.end());
+ ss << "set " << pgid << " pg_upmap_items mapping to " << out;
+ }
+ break;
+
+ case OP_RM_PG_UPMAP_ITEMS:
+ {
+ pending_inc.old_pg_upmap_items.insert(pgid);
+ ss << "clear " << pgid << " pg_upmap_items mapping";
+ }
+ break;
+
+ case OP_PG_UPMAP_PRIMARY:
+ {
+ int64_t id;
+ if (!cmd_getval(cmdmap, "id", id)) {
+ ss << "invalid osd id value '"
+ << cmd_vartype_stringify(cmdmap.at("id")) << "'";
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+ if (id != CRUSH_ITEM_NONE && !osdmap.exists(id)) {
+ ss << "osd." << id << " does not exist";
+ err = -ENOENT;
+ goto reply_no_propose;
+ }
+ vector<int> acting;
+ int primary;
+ osdmap.pg_to_acting_osds(pgid, &acting, &primary);
+ if (id == primary) {
+ ss << "osd." << id << " is already primary for pg " << pgid;
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+ int found_idx = 0;
+ for (int i = 1 ; i < (int)acting.size(); i++) { // skip 0 on purpose
+ if (acting[i] == id) {
+ found_idx = i;
+ break;
+ }
+ }
+ if (found_idx == 0) {
+ ss << "osd." << id << " is not in acting set for pg " << pgid;
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+ vector<int> new_acting(acting);
+ new_acting[found_idx] = new_acting[0];
+ new_acting[0] = id;
+ int pool_size = osdmap.get_pg_pool_size(pgid);
+ if (osdmap.crush->verify_upmap(cct, osdmap.get_pg_pool_crush_rule(pgid),
+ pool_size, new_acting) >= 0) {
+ ss << "change primary for pg " << pgid << " to osd." << id;
+ }
+ else {
+ ss << "can't change primary for pg " << pgid << " to osd." << id
+ << " - illegal pg after the change";
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+ pending_inc.new_pg_upmap_primary[pgid] = id;
+ //TO-REMOVE:
+ ldout(cct, 20) << "pg " << pgid << ": set pg_upmap_primary to " << id << dendl;
+ }
+ break;
+
+ case OP_RM_PG_UPMAP_PRIMARY:
+ {
+ pending_inc.old_pg_upmap_primary.insert(pgid);
+ ss << "clear " << pgid << " pg_upmap_primary mapping";
+ }
+ break;
+
+ default:
+ ceph_abort_msg("invalid upmap option");
+ }
+
+ goto update;
+ } else if (prefix == "osd primary-affinity") {
+ int64_t id;
+ if (!cmd_getval(cmdmap, "id", id)) {
+ ss << "invalid osd id value '"
+ << cmd_vartype_stringify(cmdmap.at("id")) << "'";
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+ double w;
+ if (!cmd_getval(cmdmap, "weight", w)) {
+ ss << "unable to parse 'weight' value '"
+ << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+ long ww = (int)((double)CEPH_OSD_MAX_PRIMARY_AFFINITY*w);
+ if (ww < 0L) {
+ ss << "weight must be >= 0";
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+ if (osdmap.require_min_compat_client != ceph_release_t::unknown &&
+ osdmap.require_min_compat_client < ceph_release_t::firefly) {
+ ss << "require_min_compat_client "
+ << osdmap.require_min_compat_client
+ << " < firefly, which is required for primary-affinity";
+ err = -EPERM;
+ goto reply_no_propose;
+ }
+ if (osdmap.exists(id)) {
+ pending_inc.new_primary_affinity[id] = ww;
+ ss << "set osd." << id << " primary-affinity to " << w << " (" << std::ios::hex << ww << std::ios::dec << ")";
+ getline(ss, rs);
+ wait_for_commit(op, new Monitor::C_Command(mon, op, 0, rs,
+ get_last_committed() + 1));
+ return true;
+ } else {
+ ss << "osd." << id << " does not exist";
+ err = -ENOENT;
+ goto reply_no_propose;
+ }
+ } else if (prefix == "osd reweight") {
+ int64_t id;
+ if (!cmd_getval(cmdmap, "id", id)) {
+ ss << "unable to parse osd id value '"
+ << cmd_vartype_stringify(cmdmap.at("id")) << "'";
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+ double w;
+ if (!cmd_getval(cmdmap, "weight", w)) {
+ ss << "unable to parse weight value '"
+ << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+ long ww = (int)((double)CEPH_OSD_IN*w);
+ if (ww < 0L) {
+ ss << "weight must be >= 0";
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+ if (osdmap.exists(id)) {
+ pending_inc.new_weight[id] = ww;
+ ss << "reweighted osd." << id << " to " << w << " (" << std::hex << ww << std::dec << ")";
+ getline(ss, rs);
+ wait_for_commit(op, new Monitor::C_Command(mon, op, 0, rs,
+ get_last_committed() + 1));
+ return true;
+ } else {
+ ss << "osd." << id << " does not exist";
+ err = -ENOENT;
+ goto reply_no_propose;
+ }
+ } else if (prefix == "osd reweightn") {
+ map<int32_t, uint32_t> weights;
+ err = parse_reweights(cct, cmdmap, osdmap, &weights);
+ if (err) {
+ ss << "unable to parse 'weights' value '"
+ << cmd_vartype_stringify(cmdmap.at("weights")) << "'";
+ goto reply_no_propose;
+ }
+ pending_inc.new_weight.insert(weights.begin(), weights.end());
+ wait_for_commit(
+ op,
+ new Monitor::C_Command(mon, op, 0, rs, rdata, get_last_committed() + 1));
+ return true;
+ } else if (prefix == "osd lost") {
+ int64_t id;
+ if (!cmd_getval(cmdmap, "id", id)) {
+ ss << "unable to parse osd id value '"
+ << cmd_vartype_stringify(cmdmap.at("id")) << "'";
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+ bool sure = false;
+ cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
+ if (!sure) {
+ ss << "are you SURE? this might mean real, permanent data loss. pass "
+ "--yes-i-really-mean-it if you really do.";
+ err = -EPERM;
+ goto reply_no_propose;
+ } else if (!osdmap.exists(id)) {
+ ss << "osd." << id << " does not exist";
+ err = -ENOENT;
+ goto reply_no_propose;
+ } else if (!osdmap.is_down(id)) {
+ ss << "osd." << id << " is not down";
+ err = -EBUSY;
+ goto reply_no_propose;
+ } else {
+ epoch_t e = osdmap.get_info(id).down_at;
+ pending_inc.new_lost[id] = e;
+ ss << "marked osd lost in epoch " << e;
+ getline(ss, rs);
+ wait_for_commit(op, new Monitor::C_Command(mon, op, 0, rs,
+ get_last_committed() + 1));
+ return true;
+ }
+
+ } else if (prefix == "osd destroy-actual" ||
+ prefix == "osd purge-actual" ||
+ prefix == "osd purge-new") {
+ /* Destroying an OSD means that we don't expect to further make use of
+ * the OSDs data (which may even become unreadable after this operation),
+ * and that we are okay with scrubbing all its cephx keys and config-key
+ * data (which may include lockbox keys, thus rendering the osd's data
+ * unreadable).
+ *
+ * The OSD will not be removed. Instead, we will mark it as destroyed,
+ * such that a subsequent call to `create` will not reuse the osd id.
+ * This will play into being able to recreate the OSD, at the same
+ * crush location, with minimal data movement.
+ */
+
+ // make sure authmon is writeable.
+ if (!mon.authmon()->is_writeable()) {
+ dout(10) << __func__ << " waiting for auth mon to be writeable for "
+ << "osd destroy" << dendl;
+ mon.authmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
+ return false;
+ }
+
+ int64_t id;
+ if (!cmd_getval(cmdmap, "id", id)) {
+ auto p = cmdmap.find("id");
+ if (p == cmdmap.end()) {
+ ss << "no osd id specified";
+ } else {
+ ss << "unable to parse osd id value '"
+ << cmd_vartype_stringify(cmdmap.at("id")) << "";
+ }
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+
+ bool is_destroy = (prefix == "osd destroy-actual");
+ if (!is_destroy) {
+ ceph_assert("osd purge-actual" == prefix ||
+ "osd purge-new" == prefix);
+ }
+
+ bool sure = false;
+ cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
+ if (!sure) {
+ ss << "Are you SURE? Did you verify with 'ceph osd safe-to-destroy'? "
+ << "This will mean real, permanent data loss, as well "
+ << "as deletion of cephx and lockbox keys. "
+ << "Pass --yes-i-really-mean-it if you really do.";
+ err = -EPERM;
+ goto reply_no_propose;
+ } else if (!osdmap.exists(id)) {
+ ss << "osd." << id << " does not exist";
+ err = 0; // idempotent
+ goto reply_no_propose;
+ } else if (osdmap.is_up(id)) {
+ ss << "osd." << id << " is not `down`.";
+ err = -EBUSY;
+ goto reply_no_propose;
+ } else if (is_destroy && osdmap.is_destroyed(id)) {
+ ss << "destroyed osd." << id;
+ err = 0;
+ goto reply_no_propose;
+ }
+
+ if (prefix == "osd purge-new" &&
+ (osdmap.get_state(id) & CEPH_OSD_NEW) == 0) {
+ ss << "osd." << id << " is not new";
+ err = -EPERM;
+ goto reply_no_propose;
+ }
+
+ bool goto_reply = false;
+
+ paxos.plug();
+ if (is_destroy) {
+ err = prepare_command_osd_destroy(op, id, ss);
+ if (err == EAGAIN) {
+ return false;
+ }
+ // we checked above that it should exist.
+ ceph_assert(err != -ENOENT);
+ } else {
+ err = prepare_command_osd_purge(op, id, ss);
+ if (err == EAGAIN) {
+ return false;
+ }
+ if (err == -ENOENT) {
+ err = 0;
+ ss << "osd." << id << " does not exist.";
+ goto_reply = true;
+ }
+ }
+ paxos.unplug();
+
+ if (err < 0 || goto_reply) {
+ goto reply_no_propose;
+ }
+
+ if (is_destroy) {
+ ss << "destroyed osd." << id;
+ } else {
+ ss << "purged osd." << id;
+ }
+
+ getline(ss, rs);
+ wait_for_commit(op,
+ new Monitor::C_Command(mon, op, 0, rs, get_last_committed() + 1));
+ force_immediate_propose();
+ return true;
+
+ } else if (prefix == "osd new") {
+
+ // make sure authmon is writeable.
+ if (!mon.authmon()->is_writeable()) {
+ dout(10) << __func__ << " waiting for auth mon to be writeable for "
+ << "osd new" << dendl;
+ mon.authmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
+ return false;
+ }
+
+ // make sure kvmon is writeable.
+ if (!mon.kvmon()->is_writeable()) {
+ dout(10) << __func__ << " waiting for kv mon to be writeable for "
+ << "osd new" << dendl;
+ mon.kvmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
+ return false;
+ }
+
+ map<string,string> param_map;
+
+ bufferlist bl = m->get_data();
+ string param_json = bl.to_str();
+ dout(20) << __func__ << " osd new json = " << param_json << dendl;
+
+ err = get_json_str_map(param_json, ss, &param_map);
+ if (err < 0)
+ goto reply_no_propose;
+
+ dout(20) << __func__ << " osd new params " << param_map << dendl;
+
+ paxos.plug();
+ err = prepare_command_osd_new(op, cmdmap, param_map, ss, f.get());
+ paxos.unplug();
+
+ if (err < 0) {
+ goto reply_no_propose;
+ }
+
+ if (f) {
+ f->flush(rdata);
+ } else {
+ rdata.append(ss);
+ }
+
+ if (err == EEXIST) {
+ // idempotent operation
+ err = 0;
+ goto reply_no_propose;
+ }
+
+ wait_for_commit(op,
+ new Monitor::C_Command(mon, op, 0, rs, rdata,
+ get_last_committed() + 1));
+ force_immediate_propose();
+ return true;
+
+ } else if (prefix == "osd create") {
+
+ // optional id provided?
+ int64_t id = -1, cmd_id = -1;
+ if (cmd_getval(cmdmap, "id", cmd_id)) {
+ if (cmd_id < 0) {
+ ss << "invalid osd id value '" << cmd_id << "'";
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+ dout(10) << " osd create got id " << cmd_id << dendl;
+ }
+
+ uuid_d uuid;
+ string uuidstr;
+ if (cmd_getval(cmdmap, "uuid", uuidstr)) {
+ if (!uuid.parse(uuidstr.c_str())) {
+ ss << "invalid uuid value '" << uuidstr << "'";
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+ // we only care about the id if we also have the uuid, to
+ // ensure the operation's idempotency.
+ id = cmd_id;
+ }
+
+ int32_t new_id = -1;
+ err = prepare_command_osd_create(id, uuid, &new_id, ss);
+ if (err < 0) {
+ if (err == -EAGAIN) {
+ goto wait;
+ }
+ // a check has failed; reply to the user.
+ goto reply_no_propose;
+
+ } else if (err == EEXIST) {
+ // this is an idempotent operation; we can go ahead and reply.
+ if (f) {
+ f->open_object_section("created_osd");
+ f->dump_int("osdid", new_id);
+ f->close_section();
+ f->flush(rdata);
+ } else {
+ ss << new_id;
+ rdata.append(ss);
+ }
+ err = 0;
+ goto reply_no_propose;
+ }
+
+ string empty_device_class;
+ do_osd_create(id, uuid, empty_device_class, &new_id);
+
+ if (f) {
+ f->open_object_section("created_osd");
+ f->dump_int("osdid", new_id);
+ f->close_section();
+ f->flush(rdata);
+ } else {
+ ss << new_id;
+ rdata.append(ss);
+ }
+ wait_for_commit(op,
+ new Monitor::C_Command(mon, op, 0, rs, rdata,
+ get_last_committed() + 1));
+ return true;
+
+ } else if (prefix == "osd blocklist clear" ||
+ prefix == "osd blacklist clear") {
+ pending_inc.new_blocklist.clear();
+ std::list<std::pair<entity_addr_t,utime_t > > blocklist;
+ std::list<std::pair<entity_addr_t,utime_t > > range_b;
+ osdmap.get_blocklist(&blocklist, &range_b);
+ for (const auto &entry : blocklist) {
+ pending_inc.old_blocklist.push_back(entry.first);
+ }
+ for (const auto &entry : range_b) {
+ pending_inc.old_range_blocklist.push_back(entry.first);
+ }
+ ss << " removed all blocklist entries";
+ getline(ss, rs);
+ wait_for_commit(op, new Monitor::C_Command(mon, op, 0, rs,
+ get_last_committed() + 1));
+ return true;
+ } else if (prefix == "osd blocklist" ||
+ prefix == "osd blacklist") {
+ string addrstr, rangestr;
+ bool range = false;
+ cmd_getval(cmdmap, "addr", addrstr);
+ if (cmd_getval(cmdmap, "range", rangestr)) {
+ if (rangestr == "range") {
+ range = true;
+ } else {
+ ss << "Did you mean to specify \"osd blocklist range\"?";
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+ }
+ entity_addr_t addr;
+ if (!addr.parse(addrstr)) {
+ ss << "unable to parse address " << addrstr;
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+ else {
+ if (range) {
+ if (!addr.maybe_cidr()) {
+ ss << "You specified a range command, but " << addr
+ << " does not parse as a CIDR range";
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+ addr.type = entity_addr_t::TYPE_CIDR;
+ err = check_cluster_features(CEPH_FEATUREMASK_RANGE_BLOCKLIST, ss);
+ if (err) {
+ goto reply_no_propose;
+ }
+ if ((addr.is_ipv4() && addr.get_nonce() > 32) ||
+ (addr.is_ipv6() && addr.get_nonce() > 128)) {
+ ss << "Too many bits in range for that protocol!";
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+ } else {
+ if (osdmap.require_osd_release >= ceph_release_t::nautilus) {
+ // always blocklist type ANY
+ addr.set_type(entity_addr_t::TYPE_ANY);
+ } else {
+ addr.set_type(entity_addr_t::TYPE_LEGACY);
+ }
+ }
+
+ string blocklistop;
+ if (!cmd_getval(cmdmap, "blocklistop", blocklistop)) {
+ cmd_getval(cmdmap, "blacklistop", blocklistop);
+ }
+ if (blocklistop == "add") {
+ utime_t expires = ceph_clock_now();
+ // default one hour
+ double d = cmd_getval_or<double>(cmdmap, "expire",
+ g_conf()->mon_osd_blocklist_default_expire);
+ expires += d;
+
+ auto add_to_pending_blocklists = [](auto& nb, auto& ob,
+ const auto& addr,
+ const auto& expires) {
+ nb[addr] = expires;
+ // cancel any pending un-blocklisting request too
+ auto it = std::find(ob.begin(),
+ ob.end(), addr);
+ if (it != ob.end()) {
+ ob.erase(it);
+ }
+ };
+ if (range) {
+ add_to_pending_blocklists(pending_inc.new_range_blocklist,
+ pending_inc.old_range_blocklist,
+ addr, expires);
+
+ } else {
+ add_to_pending_blocklists(pending_inc.new_blocklist,
+ pending_inc.old_blocklist,
+ addr, expires);
+ }
+
+ ss << "blocklisting " << addr << " until " << expires << " (" << d << " sec)";
+ getline(ss, rs);
+ wait_for_commit(op, new Monitor::C_Command(mon, op, 0, rs,
+ get_last_committed() + 1));
+ return true;
+ } else if (blocklistop == "rm") {
+ auto rm_from_pending_blocklists = [](const auto& addr,
+ auto& blocklist,
+ auto& ob, auto& pb) {
+ if (blocklist.count(addr)) {
+ ob.push_back(addr);
+ return true;
+ } else if (pb.count(addr)) {
+ pb.erase(addr);
+ return true;
+ }
+ return false;
+ };
+ if ((!range && rm_from_pending_blocklists(addr, osdmap.blocklist,
+ pending_inc.old_blocklist,
+ pending_inc.new_blocklist)) ||
+ (range && rm_from_pending_blocklists(addr, osdmap.range_blocklist,
+ pending_inc.old_range_blocklist,
+ pending_inc.new_range_blocklist))) {
+ ss << "un-blocklisting " << addr;
+ getline(ss, rs);
+ wait_for_commit(op, new Monitor::C_Command(mon, op, 0, rs,
+ get_last_committed() + 1));
+ return true;
+ }
+ ss << addr << " isn't blocklisted";
+ err = 0;
+ goto reply_no_propose;
+ }
+ }
+ } else if (prefix == "osd pool mksnap") {
+ string poolstr;
+ cmd_getval(cmdmap, "pool", poolstr);
+ int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
+ if (pool < 0) {
+ ss << "unrecognized pool '" << poolstr << "'";
+ err = -ENOENT;
+ goto reply_no_propose;
+ }
+ string snapname;
+ cmd_getval(cmdmap, "snap", snapname);
+ const pg_pool_t *p = osdmap.get_pg_pool(pool);
+ if (p->is_unmanaged_snaps_mode()) {
+ ss << "pool " << poolstr << " is in unmanaged snaps mode";
+ err = -EINVAL;
+ goto reply_no_propose;
+ } else if (p->snap_exists(snapname.c_str())) {
+ ss << "pool " << poolstr << " snap " << snapname << " already exists";
+ err = 0;
+ goto reply_no_propose;
+ } else if (p->is_tier()) {
+ ss << "pool " << poolstr << " is a cache tier";
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+ pg_pool_t *pp = 0;
+ if (pending_inc.new_pools.count(pool))
+ pp = &pending_inc.new_pools[pool];
+ if (!pp) {
+ pp = &pending_inc.new_pools[pool];
+ *pp = *p;
+ }
+ if (pp->snap_exists(snapname.c_str())) {
+ ss << "pool " << poolstr << " snap " << snapname << " already exists";
+ } else {
+ if (const auto& fsmap = mon.mdsmon()->get_fsmap(); fsmap.pool_in_use(pool)) {
+ dout(20) << "pool-level snapshots have been disabled for pools "
+ "attached to an fs - poolid:" << pool << dendl;
+ err = -EOPNOTSUPP;
+ goto reply_no_propose;
+ }
+ pp->add_snap(snapname.c_str(), ceph_clock_now());
+ pp->set_snap_epoch(pending_inc.epoch);
+ ss << "created pool " << poolstr << " snap " << snapname;
+ }
+ getline(ss, rs);
+ wait_for_commit(op, new Monitor::C_Command(mon, op, 0, rs,
+ get_last_committed() + 1));
+ return true;
+ } else if (prefix == "osd pool rmsnap") {
+ string poolstr;
+ cmd_getval(cmdmap, "pool", poolstr);
+ int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
+ if (pool < 0) {
+ ss << "unrecognized pool '" << poolstr << "'";
+ err = -ENOENT;
+ goto reply_no_propose;
+ }
+ string snapname;
+ cmd_getval(cmdmap, "snap", snapname);
+ const pg_pool_t *p = osdmap.get_pg_pool(pool);
+ if (p->is_unmanaged_snaps_mode()) {
+ ss << "pool " << poolstr << " is in unmanaged snaps mode";
+ err = -EINVAL;
+ goto reply_no_propose;
+ } else if (!p->snap_exists(snapname.c_str())) {
+ ss << "pool " << poolstr << " snap " << snapname << " does not exist";
+ err = 0;
+ goto reply_no_propose;
+ }
+ pg_pool_t *pp = 0;
+ if (pending_inc.new_pools.count(pool))
+ pp = &pending_inc.new_pools[pool];
+ if (!pp) {
+ pp = &pending_inc.new_pools[pool];
+ *pp = *p;
+ }
+ if (remove_pool_snap(snapname, *pp, pool)) {
+ pp->set_snap_epoch(pending_inc.epoch);
+ ss << "removed pool " << poolstr << " snap " << snapname;
+ } else {
+ ss << "already removed pool " << poolstr << " snap " << snapname;
+ }
+ getline(ss, rs);
+ wait_for_commit(op, new Monitor::C_Command(mon, op, 0, rs,
+ get_last_committed() + 1));
+ return true;
+ } else if (prefix == "osd pool create") {
+ int64_t pg_num = cmd_getval_or<int64_t>(cmdmap, "pg_num", 0);
+ int64_t pg_num_min = cmd_getval_or<int64_t>(cmdmap, "pg_num_min", 0);
+ int64_t pg_num_max = cmd_getval_or<int64_t>(cmdmap, "pg_num_max", 0);
+ int64_t pgp_num = cmd_getval_or<int64_t>(cmdmap, "pgp_num", pg_num);
+ string pool_type_str;
+ cmd_getval(cmdmap, "pool_type", pool_type_str);
+ if (pool_type_str.empty())
+ pool_type_str = g_conf().get_val<string>("osd_pool_default_type");
+
+ string poolstr;
+ cmd_getval(cmdmap, "pool", poolstr);
+ bool confirm = false;
+ //confirmation may be set to true only by internal operations.
+ cmd_getval(cmdmap, "yes_i_really_mean_it", confirm);
+ if (poolstr[0] == '.' && !confirm) {
+ ss << "pool names beginning with . are not allowed";
+ err = 0;
+ goto reply_no_propose;
+ }
+ int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
+ if (pool_id >= 0) {
+ const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
+ if (pool_type_str != p->get_type_name()) {
+ ss << "pool '" << poolstr << "' cannot change to type " << pool_type_str;
+ err = -EINVAL;
+ } else {
+ ss << "pool '" << poolstr << "' already exists";
+ err = 0;
+ }
+ goto reply_no_propose;
+ }
+
+ int pool_type;
+ if (pool_type_str == "replicated") {
+ pool_type = pg_pool_t::TYPE_REPLICATED;
+ } else if (pool_type_str == "erasure") {
+ pool_type = pg_pool_t::TYPE_ERASURE;
+ } else {
+ ss << "unknown pool type '" << pool_type_str << "'";
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+
+ bool implicit_rule_creation = false;
+ int64_t expected_num_objects = 0;
+ string rule_name;
+ cmd_getval(cmdmap, "rule", rule_name);
+ string erasure_code_profile;
+ cmd_getval(cmdmap, "erasure_code_profile", erasure_code_profile);
+
+ if (pool_type == pg_pool_t::TYPE_ERASURE) {
+ if (erasure_code_profile == "")
+ erasure_code_profile = "default";
+ //handle the erasure code profile
+ if (erasure_code_profile == "default") {
+ if (!osdmap.has_erasure_code_profile(erasure_code_profile)) {
+ if (pending_inc.has_erasure_code_profile(erasure_code_profile)) {
+ dout(20) << "erasure code profile " << erasure_code_profile << " already pending" << dendl;
+ goto wait;
+ }
+
+ map<string,string> profile_map;
+ err = osdmap.get_erasure_code_profile_default(cct,
+ profile_map,
+ &ss);
+ if (err)
+ goto reply_no_propose;
+ dout(20) << "erasure code profile " << erasure_code_profile << " set" << dendl;
+ pending_inc.set_erasure_code_profile(erasure_code_profile, profile_map);
+ goto wait;
+ }
+ }
+ if (rule_name == "") {
+ implicit_rule_creation = true;
+ if (erasure_code_profile == "default") {
+ rule_name = "erasure-code";
+ } else {
+ dout(1) << "implicitly use rule named after the pool: "
+ << poolstr << dendl;
+ rule_name = poolstr;
+ }
+ }
+ expected_num_objects =
+ cmd_getval_or<int64_t>(cmdmap, "expected_num_objects", 0);
+ } else {
+ //NOTE:for replicated pool,cmd_map will put rule_name to erasure_code_profile field
+ // and put expected_num_objects to rule field
+ if (erasure_code_profile != "") { // cmd is from CLI
+ if (rule_name != "") {
+ string interr;
+ expected_num_objects = strict_strtoll(rule_name.c_str(), 10, &interr);
+ if (interr.length()) {
+ ss << "error parsing integer value '" << rule_name << "': " << interr;
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+ }
+ rule_name = erasure_code_profile;
+ } else { // cmd is well-formed
+ expected_num_objects =
+ cmd_getval_or<int64_t>(cmdmap, "expected_num_objects", 0);
+ }
+ }
+
+ if (!implicit_rule_creation && rule_name != "") {
+ int rule;
+ err = get_crush_rule(rule_name, &rule, &ss);
+ if (err == -EAGAIN) {
+ goto wait;
+ }
+ if (err)
+ goto reply_no_propose;
+ }
+
+ if (expected_num_objects < 0) {
+ ss << "'expected_num_objects' must be non-negative";
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+
+ int64_t fast_read_param = cmd_getval_or<int64_t>(cmdmap, "fast_read", -1);
+ FastReadType fast_read = FAST_READ_DEFAULT;
+ if (fast_read_param == 0)
+ fast_read = FAST_READ_OFF;
+ else if (fast_read_param > 0)
+ fast_read = FAST_READ_ON;
+
+ int64_t repl_size = 0;
+ cmd_getval(cmdmap, "size", repl_size);
+ int64_t target_size_bytes = 0;
+ double target_size_ratio = 0.0;
+ cmd_getval(cmdmap, "target_size_bytes", target_size_bytes);
+ cmd_getval(cmdmap, "target_size_ratio", target_size_ratio);
+
+ string pg_autoscale_mode;
+ cmd_getval(cmdmap, "autoscale_mode", pg_autoscale_mode);
+
+ bool bulk = cmd_getval_or<bool>(cmdmap, "bulk", 0);
+
+ bool crimson = cmd_getval_or<bool>(cmdmap, "crimson", false) ||
+ cct->_conf.get_val<bool>("osd_pool_default_crimson");
+
+ err = prepare_new_pool(poolstr,
+ -1, // default crush rule
+ rule_name,
+ pg_num, pgp_num, pg_num_min, pg_num_max,
+ repl_size, target_size_bytes, target_size_ratio,
+ erasure_code_profile, pool_type,
+ (uint64_t)expected_num_objects,
+ fast_read,
+ pg_autoscale_mode,
+ bulk,
+ crimson,
+ &ss);
+ if (err < 0) {
+ switch(err) {
+ case -EEXIST:
+ ss << "pool '" << poolstr << "' already exists";
+ err = 0;
+ goto reply_no_propose;
+ case -EAGAIN:
+ goto wait;
+ case -ERANGE:
+ goto reply_no_propose;
+ default:
+ goto reply_no_propose;
+ }
+ } else {
+ ss << "pool '" << poolstr << "' created";
+ }
+ getline(ss, rs);
+ wait_for_commit(op, new Monitor::C_Command(mon, op, 0, rs,
+ get_last_committed() + 1));
+ return true;
+
+ } else if (prefix == "osd pool delete" ||
+ prefix == "osd pool rm") {
+ // osd pool delete/rm <poolname> <poolname again> --yes-i-really-really-mean-it
+ string poolstr, poolstr2, sure;
+ cmd_getval(cmdmap, "pool", poolstr);
+ cmd_getval(cmdmap, "pool2", poolstr2);
+ int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
+ if (pool < 0) {
+ ss << "pool '" << poolstr << "' does not exist";
+ err = 0;
+ goto reply_no_propose;
+ }
+
+ bool force_no_fake = false;
+ cmd_getval(cmdmap, "yes_i_really_really_mean_it", force_no_fake);
+ bool force = false;
+ cmd_getval(cmdmap, "yes_i_really_really_mean_it_not_faking", force);
+ if (poolstr2 != poolstr ||
+ (!force && !force_no_fake)) {
+ ss << "WARNING: this will *PERMANENTLY DESTROY* all data stored in pool " << poolstr
+ << ". If you are *ABSOLUTELY CERTAIN* that is what you want, pass the pool name *twice*, "
+ << "followed by --yes-i-really-really-mean-it.";
+ err = -EPERM;
+ goto reply_no_propose;
+ }
+ err = _prepare_remove_pool(pool, &ss, force_no_fake);
+ if (err == -EAGAIN) {
+ goto wait;
+ }
+ if (err < 0)
+ goto reply_no_propose;
+ goto update;
+ } else if (prefix == "osd pool rename") {
+ string srcpoolstr, destpoolstr;
+ cmd_getval(cmdmap, "srcpool", srcpoolstr);
+ cmd_getval(cmdmap, "destpool", destpoolstr);
+ int64_t pool_src = osdmap.lookup_pg_pool_name(srcpoolstr.c_str());
+ int64_t pool_dst = osdmap.lookup_pg_pool_name(destpoolstr.c_str());
+ bool confirm = false;
+ //confirmation may be set to true only by internal operations.
+ cmd_getval(cmdmap, "yes_i_really_mean_it", confirm);
+ if (destpoolstr[0] == '.' && !confirm) {
+ ss << "pool names beginning with . are not allowed";
+ err = 0;
+ goto reply_no_propose;
+ }
+ if (pool_src < 0) {
+ if (pool_dst >= 0) {
+ // src pool doesn't exist, dst pool does exist: to ensure idempotency
+ // of operations, assume this rename succeeded, as it is not changing
+ // the current state. Make sure we output something understandable
+ // for whoever is issuing the command, if they are paying attention,
+ // in case it was not intentional; or to avoid a "wtf?" and a bug
+ // report in case it was intentional, while expecting a failure.
+ ss << "pool '" << srcpoolstr << "' does not exist; pool '"
+ << destpoolstr << "' does -- assuming successful rename";
+ err = 0;
+ } else {
+ ss << "unrecognized pool '" << srcpoolstr << "'";
+ err = -ENOENT;
+ }
+ goto reply_no_propose;
+ } else if (pool_dst >= 0) {
+ // source pool exists and so does the destination pool
+ ss << "pool '" << destpoolstr << "' already exists";
+ err = -EEXIST;
+ goto reply_no_propose;
+ }
+
+ int ret = _prepare_rename_pool(pool_src, destpoolstr);
+ if (ret == 0) {
+ ss << "pool '" << srcpoolstr << "' renamed to '" << destpoolstr << "'";
+ } else {
+ ss << "failed to rename pool '" << srcpoolstr << "' to '" << destpoolstr << "': "
+ << cpp_strerror(ret);
+ }
+ getline(ss, rs);
+ wait_for_commit(op, new Monitor::C_Command(mon, op, ret, rs,
+ get_last_committed() + 1));
+ return true;
+
+ } else if (prefix == "osd pool set") {
+ err = prepare_command_pool_set(cmdmap, ss);
+ if (err == -EAGAIN)
+ goto wait;
+ if (err < 0)
+ goto reply_no_propose;
+
+ getline(ss, rs);
+ wait_for_commit(op, new Monitor::C_Command(mon, op, 0, rs,
+ get_last_committed() + 1));
+ return true;
+ } else if (prefix == "osd tier add") {
+ err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
+ if (err == -EAGAIN)
+ goto wait;
+ if (err)
+ goto reply_no_propose;
+ string poolstr;
+ cmd_getval(cmdmap, "pool", poolstr);
+ int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
+ if (pool_id < 0) {
+ ss << "unrecognized pool '" << poolstr << "'";
+ err = -ENOENT;
+ goto reply_no_propose;
+ }
+ string tierpoolstr;
+ cmd_getval(cmdmap, "tierpool", tierpoolstr);
+ int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
+ if (tierpool_id < 0) {
+ ss << "unrecognized pool '" << tierpoolstr << "'";
+ err = -ENOENT;
+ goto reply_no_propose;
+ }
+ const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
+ ceph_assert(p);
+ const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
+ ceph_assert(tp);
+
+ if (!_check_become_tier(tierpool_id, tp, pool_id, p, &err, &ss)) {
+ goto reply_no_propose;
+ }
+
+ // make sure new tier is empty
+ bool force_nonempty = false;
+ cmd_getval_compat_cephbool(cmdmap, "force_nonempty", force_nonempty);
+ const pool_stat_t *pstats = mon.mgrstatmon()->get_pool_stat(tierpool_id);
+ if (pstats && pstats->stats.sum.num_objects != 0 &&
+ !force_nonempty) {
+ ss << "tier pool '" << tierpoolstr << "' is not empty; --force-nonempty to force";
+ err = -ENOTEMPTY;
+ goto reply_no_propose;
+ }
+ if (tp->is_erasure()) {
+ ss << "tier pool '" << tierpoolstr
+ << "' is an ec pool, which cannot be a tier";
+ err = -ENOTSUP;
+ goto reply_no_propose;
+ }
+ if ((!tp->removed_snaps.empty() || !tp->snaps.empty()) &&
+ (!force_nonempty ||
+ !g_conf()->mon_debug_unsafe_allow_tier_with_nonempty_snaps)) {
+ ss << "tier pool '" << tierpoolstr << "' has snapshot state; it cannot be added as a tier without breaking the pool";
+ err = -ENOTEMPTY;
+ goto reply_no_propose;
+ }
+ // go
+ pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
+ pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
+ if (np->tiers.count(tierpool_id) || ntp->is_tier()) {
+ goto wait;
+ }
+ np->tiers.insert(tierpool_id);
+ np->set_snap_epoch(pending_inc.epoch); // tier will update to our snap info
+ ntp->tier_of = pool_id;
+ ss << "pool '" << tierpoolstr << "' is now (or already was) a tier of '" << poolstr << "'";
+ wait_for_commit(op, new Monitor::C_Command(mon, op, 0, ss.str(),
+ get_last_committed() + 1));
+ return true;
+ } else if (prefix == "osd tier remove" ||
+ prefix == "osd tier rm") {
+ string poolstr;
+ cmd_getval(cmdmap, "pool", poolstr);
+ int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
+ if (pool_id < 0) {
+ ss << "unrecognized pool '" << poolstr << "'";
+ err = -ENOENT;
+ goto reply_no_propose;
+ }
+ string tierpoolstr;
+ cmd_getval(cmdmap, "tierpool", tierpoolstr);
+ int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
+ if (tierpool_id < 0) {
+ ss << "unrecognized pool '" << tierpoolstr << "'";
+ err = -ENOENT;
+ goto reply_no_propose;
+ }
+ const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
+ ceph_assert(p);
+ const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
+ ceph_assert(tp);
+
+ if (!_check_remove_tier(pool_id, p, tp, &err, &ss)) {
+ goto reply_no_propose;
+ }
+
+ if (p->tiers.count(tierpool_id) == 0) {
+ ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
+ err = 0;
+ goto reply_no_propose;
+ }
+ if (tp->tier_of != pool_id) {
+ ss << "tier pool '" << tierpoolstr << "' is a tier of '"
+ << osdmap.get_pool_name(tp->tier_of) << "': "
+ // be scary about it; this is an inconsistency and bells must go off
+ << "THIS SHOULD NOT HAVE HAPPENED AT ALL";
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+ if (p->read_tier == tierpool_id) {
+ ss << "tier pool '" << tierpoolstr << "' is the overlay for '" << poolstr << "'; please remove-overlay first";
+ err = -EBUSY;
+ goto reply_no_propose;
+ }
+ // go
+ pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
+ pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
+ if (np->tiers.count(tierpool_id) == 0 ||
+ ntp->tier_of != pool_id ||
+ np->read_tier == tierpool_id) {
+ goto wait;
+ }
+ np->tiers.erase(tierpool_id);
+ ntp->clear_tier();
+ ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
+ wait_for_commit(op, new Monitor::C_Command(mon, op, 0, ss.str(),
+ get_last_committed() + 1));
+ return true;
+ } else if (prefix == "osd tier set-overlay") {
+ err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
+ if (err == -EAGAIN)
+ goto wait;
+ if (err)
+ goto reply_no_propose;
+ string poolstr;
+ cmd_getval(cmdmap, "pool", poolstr);
+ int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
+ if (pool_id < 0) {
+ ss << "unrecognized pool '" << poolstr << "'";
+ err = -ENOENT;
+ goto reply_no_propose;
+ }
+ string overlaypoolstr;
+ cmd_getval(cmdmap, "overlaypool", overlaypoolstr);
+ int64_t overlaypool_id = osdmap.lookup_pg_pool_name(overlaypoolstr);
+ if (overlaypool_id < 0) {
+ ss << "unrecognized pool '" << overlaypoolstr << "'";
+ err = -ENOENT;
+ goto reply_no_propose;
+ }
+ const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
+ ceph_assert(p);
+ const pg_pool_t *overlay_p = osdmap.get_pg_pool(overlaypool_id);
+ ceph_assert(overlay_p);
+ if (p->tiers.count(overlaypool_id) == 0) {
+ ss << "tier pool '" << overlaypoolstr << "' is not a tier of '" << poolstr << "'";
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+ if (p->read_tier == overlaypool_id) {
+ err = 0;
+ ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
+ goto reply_no_propose;
+ }
+ if (p->has_read_tier()) {
+ ss << "pool '" << poolstr << "' has overlay '"
+ << osdmap.get_pool_name(p->read_tier)
+ << "'; please remove-overlay first";
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+
+ // go
+ pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
+ np->read_tier = overlaypool_id;
+ np->write_tier = overlaypool_id;
+ np->set_last_force_op_resend(pending_inc.epoch);
+ pg_pool_t *noverlay_p = pending_inc.get_new_pool(overlaypool_id, overlay_p);
+ noverlay_p->set_last_force_op_resend(pending_inc.epoch);
+ ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
+ if (overlay_p->cache_mode == pg_pool_t::CACHEMODE_NONE)
+ ss <<" (WARNING: overlay pool cache_mode is still NONE)";
+ wait_for_commit(op, new Monitor::C_Command(mon, op, 0, ss.str(),
+ get_last_committed() + 1));
+ return true;
+ } else if (prefix == "osd tier remove-overlay" ||
+ prefix == "osd tier rm-overlay") {
+ string poolstr;
+ cmd_getval(cmdmap, "pool", poolstr);
+ int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
+ if (pool_id < 0) {
+ ss << "unrecognized pool '" << poolstr << "'";
+ err = -ENOENT;
+ goto reply_no_propose;
+ }
+ const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
+ ceph_assert(p);
+ if (!p->has_read_tier()) {
+ err = 0;
+ ss << "there is now (or already was) no overlay for '" << poolstr << "'";
+ goto reply_no_propose;
+ }
+
+ if (!_check_remove_tier(pool_id, p, NULL, &err, &ss)) {
+ goto reply_no_propose;
+ }
+
+ // go
+ pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
+ if (np->has_read_tier()) {
+ const pg_pool_t *op = osdmap.get_pg_pool(np->read_tier);
+ pg_pool_t *nop = pending_inc.get_new_pool(np->read_tier,op);
+ nop->set_last_force_op_resend(pending_inc.epoch);
+ }
+ if (np->has_write_tier()) {
+ const pg_pool_t *op = osdmap.get_pg_pool(np->write_tier);
+ pg_pool_t *nop = pending_inc.get_new_pool(np->write_tier, op);
+ nop->set_last_force_op_resend(pending_inc.epoch);
+ }
+ np->clear_read_tier();
+ np->clear_write_tier();
+ np->set_last_force_op_resend(pending_inc.epoch);
+ ss << "there is now (or already was) no overlay for '" << poolstr << "'";
+ wait_for_commit(op, new Monitor::C_Command(mon, op, 0, ss.str(),
+ get_last_committed() + 1));
+ return true;
+ } else if (prefix == "osd tier cache-mode") {
+ err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
+ if (err == -EAGAIN)
+ goto wait;
+ if (err)
+ goto reply_no_propose;
+ string poolstr;
+ cmd_getval(cmdmap, "pool", poolstr);
+ int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
+ if (pool_id < 0) {
+ ss << "unrecognized pool '" << poolstr << "'";
+ err = -ENOENT;
+ goto reply_no_propose;
+ }
+ const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
+ ceph_assert(p);
+ if (!p->is_tier()) {
+ ss << "pool '" << poolstr << "' is not a tier";
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+ string modestr;
+ cmd_getval(cmdmap, "mode", modestr);
+ pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr);
+ if (int(mode) < 0) {
+ ss << "'" << modestr << "' is not a valid cache mode";
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+
+ bool sure = false;
+ cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
+
+ if (mode == pg_pool_t::CACHEMODE_FORWARD ||
+ mode == pg_pool_t::CACHEMODE_READFORWARD) {
+ ss << "'" << modestr << "' is no longer a supported cache mode";
+ err = -EPERM;
+ goto reply_no_propose;
+ }
+ if ((mode != pg_pool_t::CACHEMODE_WRITEBACK &&
+ mode != pg_pool_t::CACHEMODE_NONE &&
+ mode != pg_pool_t::CACHEMODE_PROXY &&
+ mode != pg_pool_t::CACHEMODE_READPROXY) &&
+ !sure) {
+ ss << "'" << modestr << "' is not a well-supported cache mode and may "
+ << "corrupt your data. pass --yes-i-really-mean-it to force.";
+ err = -EPERM;
+ goto reply_no_propose;
+ }
+
+ // pool already has this cache-mode set and there are no pending changes
+ if (p->cache_mode == mode &&
+ (pending_inc.new_pools.count(pool_id) == 0 ||
+ pending_inc.new_pools[pool_id].cache_mode == p->cache_mode)) {
+ ss << "set cache-mode for pool '" << poolstr << "'"
+ << " to " << pg_pool_t::get_cache_mode_name(mode);
+ err = 0;
+ goto reply_no_propose;
+ }
+
+ /* Mode description:
+ *
+ * none: No cache-mode defined
+ * forward: Forward all reads and writes to base pool [removed]
+ * writeback: Cache writes, promote reads from base pool
+ * readonly: Forward writes to base pool
+ * readforward: Writes are in writeback mode, Reads are in forward mode [removed]
+ * proxy: Proxy all reads and writes to base pool
+ * readproxy: Writes are in writeback mode, Reads are in proxy mode
+ *
+ * Hence, these are the allowed transitions:
+ *
+ * none -> any
+ * forward -> proxy || readforward || readproxy || writeback || any IF num_objects_dirty == 0
+ * proxy -> readproxy || writeback || any IF num_objects_dirty == 0
+ * readforward -> forward || proxy || readproxy || writeback || any IF num_objects_dirty == 0
+ * readproxy -> proxy || writeback || any IF num_objects_dirty == 0
+ * writeback -> readproxy || proxy
+ * readonly -> any
+ */
+
+ // We check if the transition is valid against the current pool mode, as
+ // it is the only committed state thus far. We will blantly squash
+ // whatever mode is on the pending state.
+
+ if (p->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK &&
+ (mode != pg_pool_t::CACHEMODE_PROXY &&
+ mode != pg_pool_t::CACHEMODE_READPROXY)) {
+ ss << "unable to set cache-mode '" << pg_pool_t::get_cache_mode_name(mode)
+ << "' on a '" << pg_pool_t::get_cache_mode_name(p->cache_mode)
+ << "' pool; only '"
+ << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_PROXY)
+ << "','"
+ << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_READPROXY)
+ << "' allowed.";
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+ if ((p->cache_mode == pg_pool_t::CACHEMODE_READFORWARD &&
+ (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
+ mode != pg_pool_t::CACHEMODE_PROXY &&
+ mode != pg_pool_t::CACHEMODE_READPROXY)) ||
+
+ (p->cache_mode == pg_pool_t::CACHEMODE_READPROXY &&
+ (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
+ mode != pg_pool_t::CACHEMODE_PROXY)) ||
+
+ (p->cache_mode == pg_pool_t::CACHEMODE_PROXY &&
+ (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
+ mode != pg_pool_t::CACHEMODE_READPROXY)) ||
+
+ (p->cache_mode == pg_pool_t::CACHEMODE_FORWARD &&
+ (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
+ mode != pg_pool_t::CACHEMODE_PROXY &&
+ mode != pg_pool_t::CACHEMODE_READPROXY))) {
+
+ const pool_stat_t* pstats =
+ mon.mgrstatmon()->get_pool_stat(pool_id);
+
+ if (pstats && pstats->stats.sum.num_objects_dirty > 0) {
+ ss << "unable to set cache-mode '"
+ << pg_pool_t::get_cache_mode_name(mode) << "' on pool '" << poolstr
+ << "': dirty objects found";
+ err = -EBUSY;
+ goto reply_no_propose;
+ }
+ }
+ // go
+ pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
+ np->cache_mode = mode;
+ // set this both when moving to and from cache_mode NONE. this is to
+ // capture legacy pools that were set up before this flag existed.
+ np->flags |= pg_pool_t::FLAG_INCOMPLETE_CLONES;
+ ss << "set cache-mode for pool '" << poolstr
+ << "' to " << pg_pool_t::get_cache_mode_name(mode);
+ if (mode == pg_pool_t::CACHEMODE_NONE) {
+ const pg_pool_t *base_pool = osdmap.get_pg_pool(np->tier_of);
+ ceph_assert(base_pool);
+ if (base_pool->read_tier == pool_id ||
+ base_pool->write_tier == pool_id)
+ ss <<" (WARNING: pool is still configured as read or write tier)";
+ }
+ wait_for_commit(op, new Monitor::C_Command(mon, op, 0, ss.str(),
+ get_last_committed() + 1));
+ return true;
+ } else if (prefix == "osd tier add-cache") {
+ err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
+ if (err == -EAGAIN)
+ goto wait;
+ if (err)
+ goto reply_no_propose;
+ string poolstr;
+ cmd_getval(cmdmap, "pool", poolstr);
+ int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
+ if (pool_id < 0) {
+ ss << "unrecognized pool '" << poolstr << "'";
+ err = -ENOENT;
+ goto reply_no_propose;
+ }
+ string tierpoolstr;
+ cmd_getval(cmdmap, "tierpool", tierpoolstr);
+ int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
+ if (tierpool_id < 0) {
+ ss << "unrecognized pool '" << tierpoolstr << "'";
+ err = -ENOENT;
+ goto reply_no_propose;
+ }
+ const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
+ ceph_assert(p);
+ const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
+ ceph_assert(tp);
+
+ if (!_check_become_tier(tierpool_id, tp, pool_id, p, &err, &ss)) {
+ goto reply_no_propose;
+ }
+
+ int64_t size = 0;
+ if (!cmd_getval(cmdmap, "size", size)) {
+ ss << "unable to parse 'size' value '"
+ << cmd_vartype_stringify(cmdmap.at("size")) << "'";
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+ // make sure new tier is empty
+ const pool_stat_t *pstats =
+ mon.mgrstatmon()->get_pool_stat(tierpool_id);
+ if (pstats && pstats->stats.sum.num_objects != 0) {
+ ss << "tier pool '" << tierpoolstr << "' is not empty";
+ err = -ENOTEMPTY;
+ goto reply_no_propose;
+ }
+ auto& modestr = g_conf().get_val<string>("osd_tier_default_cache_mode");
+ pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr);
+ if (int(mode) < 0) {
+ ss << "osd tier cache default mode '" << modestr << "' is not a valid cache mode";
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+ HitSet::Params hsp;
+ auto& cache_hit_set_type =
+ g_conf().get_val<string>("osd_tier_default_cache_hit_set_type");
+ if (cache_hit_set_type == "bloom") {
+ BloomHitSet::Params *bsp = new BloomHitSet::Params;
+ bsp->set_fpp(g_conf().get_val<double>("osd_pool_default_hit_set_bloom_fpp"));
+ hsp = HitSet::Params(bsp);
+ } else if (cache_hit_set_type == "explicit_hash") {
+ hsp = HitSet::Params(new ExplicitHashHitSet::Params);
+ } else if (cache_hit_set_type == "explicit_object") {
+ hsp = HitSet::Params(new ExplicitObjectHitSet::Params);
+ } else {
+ ss << "osd tier cache default hit set type '"
+ << cache_hit_set_type << "' is not a known type";
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+ // go
+ pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
+ pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
+ if (np->tiers.count(tierpool_id) || ntp->is_tier()) {
+ goto wait;
+ }
+ np->tiers.insert(tierpool_id);
+ np->read_tier = np->write_tier = tierpool_id;
+ np->set_snap_epoch(pending_inc.epoch); // tier will update to our snap info
+ np->set_last_force_op_resend(pending_inc.epoch);
+ ntp->set_last_force_op_resend(pending_inc.epoch);
+ ntp->tier_of = pool_id;
+ ntp->cache_mode = mode;
+ ntp->hit_set_count = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_count");
+ ntp->hit_set_period = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_period");
+ ntp->min_read_recency_for_promote = g_conf().get_val<uint64_t>("osd_tier_default_cache_min_read_recency_for_promote");
+ ntp->min_write_recency_for_promote = g_conf().get_val<uint64_t>("osd_tier_default_cache_min_write_recency_for_promote");
+ ntp->hit_set_grade_decay_rate = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_grade_decay_rate");
+ ntp->hit_set_search_last_n = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_search_last_n");
+ ntp->hit_set_params = hsp;
+ ntp->target_max_bytes = size;
+ ss << "pool '" << tierpoolstr << "' is now (or already was) a cache tier of '" << poolstr << "'";
+ wait_for_commit(op, new Monitor::C_Command(mon, op, 0, ss.str(),
+ get_last_committed() + 1));
+ return true;
+ } else if (prefix == "osd pool set-quota") {
+ string poolstr;
+ cmd_getval(cmdmap, "pool", poolstr);
+ int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
+ if (pool_id < 0) {
+ ss << "unrecognized pool '" << poolstr << "'";
+ err = -ENOENT;
+ goto reply_no_propose;
+ }
+
+ string field;
+ cmd_getval(cmdmap, "field", field);
+ if (field != "max_objects" && field != "max_bytes") {
+ ss << "unrecognized field '" << field << "'; should be 'max_bytes' or 'max_objects'";
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+
+ // val could contain unit designations, so we treat as a string
+ string val;
+ cmd_getval(cmdmap, "val", val);
+ string tss;
+ int64_t value;
+ if (field == "max_objects") {
+ value = strict_si_cast<uint64_t>(val, &tss);
+ } else if (field == "max_bytes") {
+ value = strict_iecstrtoll(val, &tss);
+ } else {
+ ceph_abort_msg("unrecognized option");
+ }
+ if (!tss.empty()) {
+ ss << "error parsing value '" << val << "': " << tss;
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+
+ pg_pool_t *pi = pending_inc.get_new_pool(pool_id, osdmap.get_pg_pool(pool_id));
+ if (field == "max_objects") {
+ pi->quota_max_objects = value;
+ } else if (field == "max_bytes") {
+ pi->quota_max_bytes = value;
+ } else {
+ ceph_abort_msg("unrecognized option");
+ }
+ ss << "set-quota " << field << " = " << value << " for pool " << poolstr;
+ rs = ss.str();
+ wait_for_commit(op, new Monitor::C_Command(mon, op, 0, rs,
+ get_last_committed() + 1));
+ return true;
+ } else if (prefix == "osd pool application enable" ||
+ prefix == "osd pool application disable" ||
+ prefix == "osd pool application set" ||
+ prefix == "osd pool application rm") {
+ err = prepare_command_pool_application(prefix, cmdmap, ss);
+ if (err == -EAGAIN) {
+ goto wait;
+ } else if (err < 0) {
+ goto reply_no_propose;
+ } else {
+ goto update;
+ }
+ } else if (prefix == "osd force-create-pg") {
+ pg_t pgid;
+ string pgidstr;
+ err = parse_pgid(cmdmap, ss, pgid, pgidstr);
+ if (err < 0)
+ goto reply_no_propose;
+ bool sure = false;
+ cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
+ if (!sure) {
+ ss << "This command will recreate a lost (as in data lost) PG with data in it, such "
+ << "that the cluster will give up ever trying to recover the lost data. Do this "
+ << "only if you are certain that all copies of the PG are in fact lost and you are "
+ << "willing to accept that the data is permanently destroyed. Pass "
+ << "--yes-i-really-mean-it to proceed.";
+ err = -EPERM;
+ goto reply_no_propose;
+ }
+ bool creating_now;
+ {
+ std::lock_guard<std::mutex> l(creating_pgs_lock);
+ auto emplaced = creating_pgs.pgs.emplace(
+ pgid,
+ creating_pgs_t::pg_create_info(osdmap.get_epoch(),
+ ceph_clock_now()));
+ creating_now = emplaced.second;
+ }
+ if (creating_now) {
+ ss << "pg " << pgidstr << " now creating, ok";
+ // set the pool's CREATING flag so that (1) the osd won't ignore our
+ // create message and (2) we won't propose any future pg_num changes
+ // until after the PG has been instantiated.
+ if (pending_inc.new_pools.count(pgid.pool()) == 0) {
+ pending_inc.new_pools[pgid.pool()] = *osdmap.get_pg_pool(pgid.pool());
+ }
+ pending_inc.new_pools[pgid.pool()].flags |= pg_pool_t::FLAG_CREATING;
+ err = 0;
+ goto update;
+ } else {
+ ss << "pg " << pgid << " already creating";
+ err = 0;
+ goto reply_no_propose;
+ }
+ } else if (prefix == "osd force_healthy_stretch_mode") {
+ bool sure = false;
+ cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
+ if (!sure) {
+ ss << "This command will require peering across multiple CRUSH buckets "
+ "(probably two data centers or availability zones?) and may result in PGs "
+ "going inactive until backfilling is complete. Pass --yes-i-really-mean-it to proceed.";
+ err = -EPERM;
+ goto reply_no_propose;
+ }
+ try_end_recovery_stretch_mode(true);
+ ss << "Triggering healthy stretch mode";
+ err = 0;
+ goto reply_no_propose;
+ } else if (prefix == "osd force_recovery_stretch_mode") {
+ bool sure = false;
+ cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
+ if (!sure) {
+ ss << "This command will increase pool sizes to try and spread them "
+ "across multiple CRUSH buckets (probably two data centers or "
+ "availability zones?) and should have happened automatically"
+ "Pass --yes-i-really-mean-it to proceed.";
+ err = -EPERM;
+ goto reply_no_propose;
+ }
+ mon.go_recovery_stretch_mode();
+ ss << "Triggering recovery stretch mode";
+ err = 0;
+ goto reply_no_propose;
+ } else if (prefix == "osd set-allow-crimson") {
+
+ bool sure = false;
+ cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
+
+ bool experimental_enabled =
+ g_ceph_context->check_experimental_feature_enabled("crimson");
+ if (!sure || !experimental_enabled) {
+ ss << "This command will allow usage of crimson-osd osd daemons. "
+ << "crimson-osd is not considered stable and will likely cause "
+ << "crashes or data corruption. At this time, crimson-osd is mainly "
+ << "useful for performance evaluation, testing, and development. "
+ << "If you are sure, add --yes-i-really-mean-it and add 'crimson' to "
+ << "the experimental features config. This setting is irrevocable.";
+ err = -EPERM;
+ goto reply_no_propose;
+ }
+
+ err = 0;
+ if (osdmap.get_allow_crimson()) {
+ goto reply_no_propose;
+ } else {
+ pending_inc.set_allow_crimson();
+ goto update;
+ }
+ } else {
+ err = -EINVAL;
+ }
+
+ reply_no_propose:
+ getline(ss, rs);
+ if (err < 0 && rs.length() == 0)
+ rs = cpp_strerror(err);
+ mon.reply_command(op, err, rs, rdata, get_last_committed());
+ return false; /* nothing to propose */
+
+ update:
+ getline(ss, rs);
+ wait_for_commit(op, new Monitor::C_Command(mon, op, 0, rs,
+ get_last_committed() + 1));
+ return true;
+
+ wait:
+ // XXX
+ // Some osd commands split changes across two epochs.
+ // It seems this is mostly for crush rule changes. It doesn't need
+ // to be this way but it's a bit of work to fix that. For now,
+ // trigger a proposal by returning true and then retry the command
+ // to complete the operation.
+ wait_for_finished_proposal(op, new C_RetryMessage(this, op));
+ return true;
+}
+
+// Check whether the sender of an MPoolOp is allowed to perform it.
+//
+// Unmanaged-snap create/delete ops are vetted by
+// is_unmanaged_snap_op_permitted(); every other op requires the session to
+// be capable of MON_CAP_W on "osd". An op without a session is refused too.
+//
+// Returns true if the op was refused (an -EPERM reply has already been sent
+// through _pool_op_reply), false if the caller may keep processing it.
+bool OSDMonitor::enforce_pool_op_caps(MonOpRequestRef op)
+{
+  op->mark_osdmon_event(__func__);
+
+  auto req = op->get_req<MPoolOp>();
+  MonSession *sess = op->get_session();
+
+  if (sess != nullptr) {
+    const bool unmanaged_snap_op =
+      req->op == POOL_OP_CREATE_UNMANAGED_SNAP ||
+      req->op == POOL_OP_DELETE_UNMANAGED_SNAP;
+
+    if (unmanaged_snap_op) {
+      // the pool name is only handed over when the pool exists in the
+      // current osdmap; otherwise the check receives a null name
+      const std::string *pool_name = nullptr;
+      if (osdmap.get_pg_pool(req->pool) != nullptr) {
+        pool_name = &osdmap.get_pool_name(req->pool);
+      }
+
+      if (is_unmanaged_snap_op_permitted(cct, mon.key_server,
+                                         sess->entity_name, sess->caps,
+                                         sess->get_peer_socket_addr(),
+                                         pool_name)) {
+        return false;
+      }
+      dout(0) << "got unmanaged-snap pool op from entity with insufficient "
+              << "privileges. message: " << *req << std::endl
+              << "caps: " << sess->caps << dendl;
+    } else {
+      if (sess->is_capable("osd", MON_CAP_W)) {
+        return false;
+      }
+      dout(0) << "got pool op from entity with insufficient privileges. "
+              << "message: " << *req << std::endl
+              << "caps: " << sess->caps << dendl;
+    }
+  }
+
+  // shared denial exit: missing session or insufficient caps
+  _pool_op_reply(op, -EPERM, osdmap.get_epoch());
+  return true;
+}
+
+// First-pass handling of an MPoolOp against the committed osdmap.
+//
+// Replies immediately (and returns true) when the sender lacks caps, the
+// fsid does not match, the pool is missing, the requested snap mode
+// conflicts with the pool's mode, or the requested state already holds.
+// POOL_OP_CREATE is delegated to preprocess_pool_op_create().
+//
+// Returns false when no reply was sent here and the op still needs further
+// handling (presumably by prepare_pool_op -- confirm against the dispatcher).
+bool OSDMonitor::preprocess_pool_op(MonOpRequestRef op)
+{
+  op->mark_osdmon_event(__func__);
+  auto req = op->get_req<MPoolOp>();
+
+  // reply with `code` at the current osdmap epoch; yields true so that
+  // call sites can simply `return answer(...)`
+  auto answer = [&](int code) {
+    _pool_op_reply(op, code, osdmap.get_epoch());
+    return true;
+  };
+
+  if (enforce_pool_op_caps(op)) {
+    // already replied with -EPERM
+    return true;
+  }
+
+  if (req->fsid != mon.monmap->fsid) {
+    dout(0) << __func__ << " drop message on fsid " << req->fsid
+            << " != " << mon.monmap->fsid << " for " << *req << dendl;
+    return answer(-EINVAL);
+  }
+
+  if (req->op == POOL_OP_CREATE) {
+    return preprocess_pool_op_create(op);
+  }
+
+  const pg_pool_t *pi = osdmap.get_pg_pool(req->pool);
+  if (!pi) {
+    dout(10) << "attempt to operate on non-existent pool id " << req->pool << dendl;
+    // deleting a pool that is not there counts as success
+    return answer(req->op == POOL_OP_DELETE ? 0 : -ENOENT);
+  }
+
+  // check if the snap and snapname exist
+  const bool snap_exists = pi->snap_exists(req->name.c_str());
+
+  switch (req->op) {
+  case POOL_OP_CREATE_SNAP:
+    if (pi->is_unmanaged_snaps_mode() || pi->is_tier()) {
+      return answer(-EINVAL);
+    }
+    if (snap_exists) {
+      return answer(0);
+    }
+    return false;
+
+  case POOL_OP_CREATE_UNMANAGED_SNAP:
+    if (pi->is_pool_snaps_mode()) {
+      return answer(-EINVAL);
+    }
+    return false;
+
+  case POOL_OP_DELETE_SNAP:
+    if (pi->is_unmanaged_snaps_mode()) {
+      return answer(-EINVAL);
+    }
+    if (!snap_exists) {
+      return answer(0);
+    }
+    return false;
+
+  case POOL_OP_DELETE_UNMANAGED_SNAP:
+    if (pi->is_pool_snaps_mode()) {
+      return answer(-EINVAL);
+    }
+    if (_is_removed_snap(req->pool, req->snapid)) {
+      return answer(0);
+    }
+    return false;
+
+  case POOL_OP_DELETE:
+    if (osdmap.lookup_pg_pool_name(req->name.c_str()) >= 0) {
+      return answer(0);
+    }
+    return false;
+
+  case POOL_OP_AUID_CHANGE:
+    return false;
+
+  default:
+    ceph_abort();
+    break;
+  }
+
+  return false;
+}
+
+// Report whether `snap` of `pool` is already gone as far as committed state
+// is concerned.
+//
+// True when the pool is not in the osdmap, when the snap sits in the
+// osdmap's removed_snaps_queue, or when lookup_purged_snap() returns 0 for
+// it. The checks run in that order and stop at the first hit; each hit is
+// logged at level 10.
+bool OSDMonitor::_is_removed_snap(int64_t pool, snapid_t snap)
+{
+  bool removed = true;
+  snapid_t range_begin, range_end;
+
+  if (!osdmap.have_pg_pool(pool)) {
+    dout(10) << __func__ << " pool " << pool << " snap " << snap
+             << " - pool dne" << dendl;
+  } else if (osdmap.in_removed_snaps_queue(pool, snap)) {
+    dout(10) << __func__ << " pool " << pool << " snap " << snap
+             << " - in osdmap removed_snaps_queue" << dendl;
+  } else if (lookup_purged_snap(pool, snap, &range_begin, &range_end) == 0) {
+    dout(10) << __func__ << " pool " << pool << " snap " << snap
+             << " - purged, [" << range_begin << "," << range_end << ")" << dendl;
+  } else {
+    removed = false;
+  }
+
+  return removed;
+}
+
+// Report whether `snap` of `pool` is slated for removal in pending_inc.
+//
+// True when the pool itself is listed in pending_inc.old_pools, or when
+// pending_inc.in_new_removed_snaps() holds for the snap. Each hit is logged
+// at level 10.
+bool OSDMonitor::_is_pending_removed_snap(int64_t pool, snapid_t snap)
+{
+  bool pending_removal = true;
+
+  if (pending_inc.old_pools.count(pool) != 0) {
+    dout(10) << __func__ << " pool " << pool << " snap " << snap
+             << " - pool pending deletion" << dendl;
+  } else if (pending_inc.in_new_removed_snaps(pool, snap)) {
+    dout(10) << __func__ << " pool " << pool << " snap " << snap
+             << " - in pending new_removed_snaps" << dendl;
+  } else {
+    pending_removal = false;
+  }
+
+  return pending_removal;
+}
+
+// Preprocess a POOL_OP_CREATE request.
+//
+// If a pool with the requested name is already in the osdmap, reply with 0
+// at once and return true. Otherwise return false without replying.
+bool OSDMonitor::preprocess_pool_op_create(MonOpRequestRef op)
+{
+  op->mark_osdmon_event(__func__);
+  auto req = op->get_req<MPoolOp>();
+
+  if (osdmap.lookup_pg_pool_name(req->name.c_str()) < 0) {
+    // no pool by that name yet
+    return false;
+  }
+
+  _pool_op_reply(op, 0, osdmap.get_epoch());
+  return true;
+}
+
+// Apply an MPoolOp to the pending incremental.
+//
+// POOL_OP_CREATE and POOL_OP_DELETE are delegated to their own prepare
+// functions. For the snapshot ops the request is first validated against
+// the committed pool; a failed check (or a request that is already
+// satisfied) is answered immediately through _pool_op_reply and the function
+// returns false. Otherwise the op is applied to the projected pool (the
+// pending copy if there is one, else the committed one), and a C_PoolOp
+// carrying the result code and any reply data is queued with
+// wait_for_finished_proposal; the function then returns true.
+bool OSDMonitor::prepare_pool_op(MonOpRequestRef op)
+{
+  op->mark_osdmon_event(__func__);
+  auto m = op->get_req<MPoolOp>();
+  dout(10) << "prepare_pool_op " << *m << dendl;
+
+  if (m->op == POOL_OP_CREATE) {
+    return prepare_pool_op_create(op);
+  }
+  if (m->op == POOL_OP_DELETE) {
+    return prepare_pool_op_delete(op);
+  }
+
+  // answer right away with `code`; yields false so that call sites can
+  // simply `return refuse(...)`
+  auto refuse = [&](int code) {
+    _pool_op_reply(op, code, osdmap.get_epoch());
+    return false;
+  };
+
+  if (!osdmap.have_pg_pool(m->pool)) {
+    return refuse(-ENOENT);
+  }
+
+  const pg_pool_t *committed = osdmap.get_pg_pool(m->pool);
+
+  const bool create_snap = (m->op == POOL_OP_CREATE_SNAP);
+  const bool delete_snap = (m->op == POOL_OP_DELETE_SNAP);
+  const bool create_unmanaged = (m->op == POOL_OP_CREATE_UNMANAGED_SNAP);
+  const bool delete_unmanaged = (m->op == POOL_OP_DELETE_UNMANAGED_SNAP);
+  const bool pool_snap_op = create_snap || delete_snap;
+  const bool unmanaged_snap_op = create_unmanaged || delete_unmanaged;
+
+  if (create_snap || create_unmanaged) {
+    const auto& fsmap = mon.mdsmon()->get_fsmap();
+    if (fsmap.pool_in_use(m->pool)) {
+      dout(20) << "monitor-managed snapshots have been disabled for pools "
+                  " attached to an fs - pool:" << m->pool << dendl;
+      return refuse(-EOPNOTSUPP);
+    }
+  }
+
+  // validate against the committed pool
+  if (pool_snap_op) {
+    if (create_snap && committed->is_tier()) {
+      return refuse(-EINVAL);
+    }
+    if (committed->is_unmanaged_snaps_mode()) {
+      return refuse(-EINVAL);
+    }
+    const bool snap_exists = committed->snap_exists(m->name.c_str());
+    if ((create_snap && snap_exists) || (delete_snap && !snap_exists)) {
+      // requested state already holds; report success without a proposal
+      return refuse(0);
+    }
+  } else if (unmanaged_snap_op) {
+    // we won't allow removal of an unmanaged snapshot from a pool
+    // not in unmanaged snaps mode.
+    if (delete_unmanaged && !committed->is_unmanaged_snaps_mode()) {
+      return refuse(-ENOTSUP);
+    }
+    // but we will allow creating an unmanaged snapshot on any pool
+    // as long as it is not in 'pool' snaps mode.
+    if (committed->is_pool_snaps_mode()) {
+      return refuse(-EINVAL);
+    }
+  }
+
+  // projected pool info: the pending copy wins over the committed one
+  auto pending = pending_inc.new_pools.find(m->pool);
+  pg_pool_t pp =
+    (pending != pending_inc.new_pools.end()) ? pending->second : *committed;
+
+  bufferlist reply_data;
+  int ret = 0;
+
+  // pool snaps vs unmanaged snaps are mutually exclusive
+  if ((pool_snap_op && pp.is_unmanaged_snaps_mode()) ||
+      (unmanaged_snap_op && pp.is_pool_snaps_mode())) {
+    ret = -EINVAL;
+  } else {
+    bool changed = false;
+
+    switch (m->op) {
+    case POOL_OP_CREATE_SNAP:
+      if (!pp.snap_exists(m->name.c_str())) {
+        pp.add_snap(m->name.c_str(), ceph_clock_now());
+        dout(10) << "create snap in pool " << m->pool << " " << m->name
+                 << " seq " << pp.get_snap_epoch() << dendl;
+        changed = true;
+      }
+      break;
+
+    case POOL_OP_DELETE_SNAP:
+      if (remove_pool_snap(m->name, pp, m->pool)) {
+        changed = true;
+      }
+      break;
+
+    case POOL_OP_CREATE_UNMANAGED_SNAP:
+      {
+        uint64_t snapid = pp.add_unmanaged_snap(
+          osdmap.require_osd_release < ceph_release_t::octopus);
+        // the newly allocated snap id is handed back in the reply payload
+        encode(snapid, reply_data);
+        changed = true;
+      }
+      break;
+
+    case POOL_OP_DELETE_UNMANAGED_SNAP:
+      if (!_is_removed_snap(m->pool, m->snapid) &&
+          !_is_pending_removed_snap(m->pool, m->snapid)) {
+        if (m->snapid > pp.get_snap_seq()) {
+          return refuse(-ENOENT);
+        }
+        pp.remove_unmanaged_snap(
+          m->snapid,
+          osdmap.require_osd_release < ceph_release_t::octopus);
+        pending_inc.new_removed_snaps[m->pool].insert(m->snapid);
+        changed = true;
+      }
+      break;
+
+    case POOL_OP_AUID_CHANGE:
+      return refuse(-EOPNOTSUPP);
+
+    default:
+      ceph_abort();
+      break;
+    }
+
+    if (changed) {
+      pp.set_snap_epoch(pending_inc.epoch);
+      pending_inc.new_pools[m->pool] = pp;
+    }
+  }
+
+  wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, ret, pending_inc.epoch, &reply_data));
+  return true;
+}
+
+/**
+ * Prepare a POOL_OP_CREATE request.
+ *
+ * Runs prepare_new_pool() for the request and queues a C_PoolOp that carries
+ * that call's result together with the pending epoch; the callback is run
+ * once the current proposal has finished.
+ *
+ * @param op the pool-create request being processed
+ * @return always true
+ */
+bool OSDMonitor::prepare_pool_op_create(MonOpRequestRef op)
+{
+  op->mark_osdmon_event(__func__);
+  const int result = prepare_new_pool(op);
+  auto *on_finish =
+    new OSDMonitor::C_PoolOp(this, op, result, pending_inc.epoch);
+  wait_for_finished_proposal(op, on_finish);
+  return true;
+}
+
+/**
+ * Decide whether a pool may be deleted.
+ *
+ * The checks run in a fixed order and the first one that fails wins:
+ * CephFS usage (pending FSMap), being a tier, having tiers, the global
+ * mon_allow_pool_delete option, and the per-pool nodelete flag.
+ *
+ * @param pool_id id of the pool, used to look up its name
+ * @param pool    the pool description to evaluate
+ * @param ss      receives the refusal reason, or the "removed" message
+ * @return 0 if removal is allowed, -EBUSY or -EPERM otherwise
+ */
+int OSDMonitor::_check_remove_pool(int64_t pool_id, const pg_pool_t& pool,
+                                   ostream *ss)
+{
+  const string& name = osdmap.get_pool_name(pool_id);
+  ostream& out = *ss;
+
+  // A pool referenced by a filesystem must never be deleted.
+  const FSMap& fsmap = mon.mdsmon()->get_pending_fsmap();
+  if (fsmap.pool_in_use(pool_id)) {
+    out << "pool '" << name << "' is in use by CephFS";
+    return -EBUSY;
+  }
+
+  // Tiering relationships have to be dismantled first.
+  if (pool.tier_of >= 0) {
+    out << "pool '" << name << "' is a tier of '"
+        << osdmap.get_pool_name(pool.tier_of) << "'";
+    return -EBUSY;
+  }
+  if (!pool.tiers.empty()) {
+    out << "pool '" << name << "' has tiers";
+    for (const auto tier_id : pool.tiers) {
+      out << " " << osdmap.get_pool_name(tier_id);
+    }
+    return -EBUSY;
+  }
+
+  // Administrative safety switches: cluster-wide first, then per-pool.
+  if (!g_conf()->mon_allow_pool_delete) {
+    out << "pool deletion is disabled; you must first set the mon_allow_pool_delete config option to true before you can destroy a pool";
+    return -EPERM;
+  }
+  if (pool.has_flag(pg_pool_t::FLAG_NODELETE)) {
+    out << "pool deletion is disabled; you must unset nodelete flag for the pool first";
+    return -EPERM;
+  }
+
+  out << "pool '" << name << "' removed";
+  return 0;
+}
+
+/**
+ * Check whether tier_pool may be attached as a tier of base_pool.
+ *
+ * Refuses crimson pools, pools used by CephFS, and any arrangement that
+ * would create more than one level of tiering. If the tier relationship
+ * already exists, the call is treated as an idempotent success.
+ *
+ * @param tier_pool_id id of the would-be tier pool
+ * @param tier_pool    description of the would-be tier pool
+ * @param base_pool_id id of the would-be base pool
+ * @param base_pool    description of the would-be base pool
+ * @param err          set to 0, -EINVAL or -EBUSY
+ * @param ss           receives a human-readable explanation
+ * @return
+ * True if the operation should proceed, false if the caller should stop
+ * here (stopping is not necessarily an error; *err is 0 when the tier is
+ * already in place)
+ */
+bool OSDMonitor::_check_become_tier(
+    const int64_t tier_pool_id, const pg_pool_t *tier_pool,
+    const int64_t base_pool_id, const pg_pool_t *base_pool,
+    int *err,
+    ostream *ss) const
+{
+  const std::string &tier_name = osdmap.get_pool_name(tier_pool_id);
+  const std::string &base_name = osdmap.get_pool_name(base_pool_id);
+  ostream &out = *ss;
+
+  // Tiering is rejected outright for crimson pools on either side.
+  if (tier_pool->is_crimson()) {
+    out << "pool '" << tier_name << "' is a crimson pool, tiering "
+        << "features are not supported";
+    *err = -EINVAL;
+    return false;
+  }
+  if (base_pool->is_crimson()) {
+    out << "pool '" << base_name << "' is a crimson pool, tiering "
+        << "features are not supported";
+    *err = -EINVAL;
+    return false;
+  }
+
+  // A pool that a filesystem uses directly cannot become a tier.
+  const FSMap &fsmap = mon.mdsmon()->get_pending_fsmap();
+  if (fsmap.pool_in_use(tier_pool_id)) {
+    out << "pool '" << tier_name << "' is in use by CephFS";
+    *err = -EBUSY;
+    return false;
+  }
+
+  // Already a tier of this base: nothing to do, report success.
+  if (base_pool->tiers.count(tier_pool_id)) {
+    ceph_assert(tier_pool->tier_of == base_pool_id);
+    *err = 0;
+    out << "pool '" << tier_name << "' is now (or already was) a tier of '"
+        << base_name << "'";
+    return false;
+  }
+
+  // The base must not itself be a tier ...
+  if (base_pool->is_tier()) {
+    out << "pool '" << base_name << "' is already a tier of '"
+        << osdmap.get_pool_name(base_pool->tier_of) << "', "
+        << "multiple tiers are not yet supported.";
+    *err = -EINVAL;
+    return false;
+  }
+
+  // ... and the tier must not have tiers of its own ...
+  if (tier_pool->has_tiers()) {
+    out << "pool '" << tier_name << "' has following tier(s) already:";
+    for (const auto existing_tier : tier_pool->tiers) {
+      out << "'" << osdmap.get_pool_name(existing_tier) << "',";
+    }
+    out << " multiple tiers are not yet supported.";
+    *err = -EINVAL;
+    return false;
+  }
+
+  // ... nor already belong to some other base pool.
+  if (tier_pool->is_tier()) {
+    out << "tier pool '" << tier_name << "' is already a tier of '"
+        << osdmap.get_pool_name(tier_pool->tier_of) << "'";
+    *err = -EINVAL;
+    return false;
+  }
+
+  *err = 0;
+  return true;
+}
+
+
+/**
+ * Check whether a tier may be detached from base_pool.
+ *
+ * Only CephFS-related restrictions are applied: when the base pool is in
+ * use by a filesystem, removal is refused if the base is erasure coded
+ * without EC overwrites, or if the tier is still a writeback cache.
+ *
+ * @param base_pool_id id of the base pool
+ * @param base_pool    description of the base pool
+ * @param tier_pool    description of the tier being removed; may be null,
+ *                     in which case the writeback check is skipped
+ * @param err          set to 0 or -EBUSY
+ * @param ss           receives a human-readable explanation on refusal
+ * @return
+ * True if the operation should proceed, false if the caller should stop
+ */
+bool OSDMonitor::_check_remove_tier(
+    const int64_t base_pool_id, const pg_pool_t *base_pool,
+    const pg_pool_t *tier_pool,
+    int *err, ostream *ss) const
+{
+  const std::string &base_name = osdmap.get_pool_name(base_pool_id);
+
+  // Pools that no filesystem uses have no further restrictions.
+  const FSMap &fsmap = mon.mdsmon()->get_pending_fsmap();
+  if (!fsmap.pool_in_use(base_pool_id)) {
+    *err = 0;
+    return true;
+  }
+
+  // An EC base without overwrite support is only usable by CephFS through
+  // its replicated tier, so that tier must stay.
+  const bool ec_without_overwrites =
+    base_pool->is_erasure() && !base_pool->allows_ecoverwrites();
+  if (ec_without_overwrites) {
+    *ss << "pool '" << base_name <<
+      "' does not allow EC overwrites and is in use by CephFS"
+      " via its tier";
+    *err = -EBUSY;
+    return false;
+  }
+
+  const bool tier_is_writeback =
+    tier_pool && tier_pool->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK;
+  if (tier_is_writeback) {
+    *ss << "pool '" << base_name << "' is in use by CephFS, and this "
+      "tier is still in use as a writeback cache. Change the cache "
+      "mode and flush the cache before removing it";
+    *err = -EBUSY;
+    return false;
+  }
+
+  *err = 0;
+  return true;
+}
+
+/**
+ * Stage the removal of a pool in pending_inc.
+ *
+ * After validating the committed (and, if present, the pending) pool
+ * description with _check_remove_pool(), this either fakes the deletion by
+ * renaming the pool (mon_fake_pool_delete, unless no_fake is set) or marks
+ * the pool as removed and cleans up everything keyed by its PGs: pg_temp,
+ * primary_temp, pg_upmap, pg_upmap_items (committed and pending) and the
+ * pool's crush choose_args.
+ *
+ * @param pool    id of the pool to remove
+ * @param ss      receives the message produced by _check_remove_pool(), or
+ *                a "does not exist" message
+ * @param no_fake when true, mon_fake_pool_delete is ignored
+ * @return 0 on success (including "already pending removal"), -ENOENT if
+ *         the pool is not in the committed map, -EAGAIN if the pending
+ *         pool state fails the check (caller should retry), or the
+ *         negative error from _check_remove_pool()
+ */
+int OSDMonitor::_prepare_remove_pool(
+  int64_t pool, ostream *ss, bool no_fake)
+{
+  dout(10) << __func__ << " " << pool << dendl;
+  const pg_pool_t *committed = osdmap.get_pg_pool(pool);
+  if (!committed) {
+    // Guard against dereferencing a missing pool: the delete path reaches
+    // this function without a have_pg_pool() check of its own.
+    // NOTE(review): assumes get_pg_pool() returns nullptr for unknown ids.
+    *ss << "pool " << pool << " does not exist";
+    return -ENOENT;
+  }
+  const int r = _check_remove_pool(pool, *committed, ss);
+  if (r < 0)
+    return r;
+
+  auto pending_pool = pending_inc.new_pools.find(pool);
+  if (pending_pool != pending_inc.new_pools.end()) {
+    // if there is a problem with the pending info, wait and retry
+    // this op.
+    if (_check_remove_pool(pool, pending_pool->second, ss) < 0)
+      return -EAGAIN;
+  }
+
+  if (pending_inc.old_pools.count(pool)) {
+    dout(10) << __func__ << " " << pool << " already pending removal"
+	     << dendl;
+    return 0;
+  }
+
+  if (g_conf()->mon_fake_pool_delete && !no_fake) {
+    // Fake deletion only renames the pool so it can be recovered later.
+    string old_name = osdmap.get_pool_name(pool);
+    string new_name = old_name + "." + stringify(pool) + ".DELETED";
+    dout(1) << __func__ << " faking pool deletion: renaming " << pool << " "
+	    << old_name << " -> " << new_name << dendl;
+    pending_inc.new_pool_names[pool] = new_name;
+    return 0;
+  }
+
+  // remove
+  pending_inc.old_pools.insert(pool);
+
+  // remove any pg_temp mappings for this pool
+  for (auto t = osdmap.pg_temp->begin();
+       t != osdmap.pg_temp->end();
+       ++t) {
+    if (t->first.pool() == pool) {
+      dout(10) << __func__ << " " << pool << " removing obsolete pg_temp "
+	       << t->first << dendl;
+      // touching the entry and clearing it stages an empty mapping
+      pending_inc.new_pg_temp[t->first].clear();
+    }
+  }
+  // remove any primary_temp mappings for this pool
+  for (auto t = osdmap.primary_temp->begin();
+       t != osdmap.primary_temp->end();
+       ++t) {
+    if (t->first.pool() == pool) {
+      dout(10) << __func__ << " " << pool
+	       << " removing obsolete primary_temp " << t->first << dendl;
+      pending_inc.new_primary_temp[t->first] = -1;
+    }
+  }
+  // remove any pg_upmap mappings for this pool
+  for (const auto& u : osdmap.pg_upmap) {
+    if (u.first.pool() == pool) {
+      dout(10) << __func__ << " " << pool
+	       << " removing obsolete pg_upmap "
+	       << u.first << dendl;
+      pending_inc.old_pg_upmap.insert(u.first);
+    }
+  }
+  // remove any pending pg_upmap mappings for this pool
+  {
+    auto it = pending_inc.new_pg_upmap.begin();
+    while (it != pending_inc.new_pg_upmap.end()) {
+      if (it->first.pool() == pool) {
+	dout(10) << __func__ << " " << pool
+		 << " removing pending pg_upmap "
+		 << it->first << dendl;
+	it = pending_inc.new_pg_upmap.erase(it);
+      } else {
+	++it;
+      }
+    }
+  }
+  // remove any pg_upmap_items mappings for this pool
+  for (const auto& u : osdmap.pg_upmap_items) {
+    if (u.first.pool() == pool) {
+      dout(10) << __func__ << " " << pool
+	       << " removing obsolete pg_upmap_items " << u.first
+	       << dendl;
+      pending_inc.old_pg_upmap_items.insert(u.first);
+    }
+  }
+  // remove any pending pg_upmap_items mappings for this pool
+  {
+    auto it = pending_inc.new_pg_upmap_items.begin();
+    while (it != pending_inc.new_pg_upmap_items.end()) {
+      if (it->first.pool() == pool) {
+	dout(10) << __func__ << " " << pool
+		 << " removing pending pg_upmap_items "
+		 << it->first << dendl;
+	it = pending_inc.new_pg_upmap_items.erase(it);
+      } else {
+	++it;
+      }
+    }
+  }
+
+  // remove any choose_args for this pool
+  CrushWrapper newcrush = _get_pending_crush();
+  if (newcrush.have_choose_args(pool)) {
+    dout(10) << __func__ << " removing choose_args for pool " << pool << dendl;
+    newcrush.rm_choose_args(pool);
+    // replace the staged crush map with the re-encoded, trimmed one
+    pending_inc.crush.clear();
+    newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+  }
+  return 0;
+}
+
+/**
+ * Stage a pool rename in pending_inc.
+ *
+ * Only pending state is consulted here: the rename is refused if the pool
+ * is already queued for removal, or if another pool has a pending rename
+ * to the same name.
+ *
+ * @param pool    id of the pool to rename
+ * @param newname the requested name
+ * @return 0 on success, -ENOENT if the pool is pending removal, -EEXIST if
+ *         a different pool is already being renamed to newname
+ */
+int OSDMonitor::_prepare_rename_pool(int64_t pool, string newname)
+{
+  dout(10) << "_prepare_rename_pool " << pool << dendl;
+  if (pending_inc.old_pools.count(pool) > 0) {
+    dout(10) << "_prepare_rename_pool " << pool << " pending removal" << dendl;
+    return -ENOENT;
+  }
+
+  // Renaming a pool to the name it is already pending under is allowed.
+  for (const auto& pending : pending_inc.new_pool_names) {
+    const bool other_pool = pending.first != pool;
+    if (other_pool && pending.second == newname) {
+      return -EEXIST;
+    }
+  }
+
+  pending_inc.new_pool_names[pool] = newname;
+  return 0;
+}
+
+/**
+ * Prepare a POOL_OP_DELETE request.
+ *
+ * Delegates to _prepare_remove_pool(). An -EAGAIN result queues the message
+ * for a retry after the current proposal; any other result is delivered to
+ * the client through a C_PoolOp once the proposal finishes.
+ *
+ * @param op the pool-delete request being processed
+ * @return always true
+ */
+bool OSDMonitor::prepare_pool_op_delete(MonOpRequestRef op)
+{
+  op->mark_osdmon_event(__func__);
+  auto req = op->get_req<MPoolOp>();
+  ostringstream reason;
+  const int rc = _prepare_remove_pool(req->pool, &reason, false);
+
+  Context *on_finish = nullptr;
+  if (rc == -EAGAIN) {
+    on_finish = new C_RetryMessage(this, op);
+  } else {
+    if (rc < 0)
+      dout(10) << __func__ << " got " << rc << " " << reason.str() << dendl;
+    on_finish = new OSDMonitor::C_PoolOp(this, op, rc, pending_inc.epoch);
+  }
+  wait_for_finished_proposal(op, on_finish);
+  return true;
+}
+
+/**
+ * Send an MPoolOpReply for a pool operation.
+ *
+ * @param op    the request being answered; its fsid and tid are echoed back
+ * @param ret   result code placed in the reply
+ * @param epoch epoch placed in the reply
+ * @param blp   optional payload handed to the reply constructor
+ */
+void OSDMonitor::_pool_op_reply(MonOpRequestRef op,
+                                int ret, epoch_t epoch, bufferlist *blp)
+{
+  op->mark_osdmon_event(__func__);
+  dout(20) << "_pool_op_reply " << ret << dendl;
+
+  auto req = op->get_req<MPoolOp>();
+  mon.send_reply(op,
+                 new MPoolOpReply(req->fsid, req->get_tid(),
+                                  ret, epoch, get_last_committed(), blp));
+}
+
+/**
+ * Rescale pool recovery_priority values into the supported range.
+ *
+ * The first pass finds the largest and smallest recovery_priority set on
+ * any pool (both bounds start at 0). If either lies outside
+ * [OSD_POOL_PRIORITY_MIN, OSD_POOL_PRIORITY_MAX], the second pass scales
+ * the offending side proportionally and stages the modified pools in
+ * pending_inc. A priority that scales to 0 is unset.
+ */
+void OSDMonitor::convert_pool_priorities(void)
+{
+  const pool_opts_t::key_t key =
+    pool_opts_t::get_opt_desc("recovery_priority").key;
+
+  // Pass 1: determine the extremes currently in use.
+  int64_t highest = 0;
+  int64_t lowest = 0;
+  for (const auto &entry : osdmap.get_pools()) {
+    const auto &existing = entry.second;
+    if (!existing.opts.is_set(key))
+      continue;
+    int64_t prio = 0;
+    existing.opts.get(key, &prio);
+    if (prio > highest)
+      highest = prio;
+    if (prio < lowest)
+      lowest = prio;
+  }
+
+  const bool above_max = highest > OSD_POOL_PRIORITY_MAX;
+  const bool below_min = lowest < OSD_POOL_PRIORITY_MIN;
+  if (!above_max && !below_min) {
+    dout(20) << __func__ << " nothing to fix" << dendl;
+    return;
+  }
+
+  // Pass 2: at least one priority is out of range, so rescale that side.
+  for (const auto &entry : osdmap.get_pools()) {
+    const auto pool_id = entry.first;
+
+    // prio stays 0 when the option is not set, which skips the pool below
+    int64_t prio = 0;
+    entry.second.opts.get(key, &prio);
+
+    int64_t scaled;
+    if (prio > 0 && above_max) { // Likely scenario
+      // Scaled priority range 0 to OSD_POOL_PRIORITY_MAX
+      scaled = static_cast<float>(prio) / highest * OSD_POOL_PRIORITY_MAX;
+    } else if (prio < 0 && below_min) {
+      // Scaled priority range OSD_POOL_PRIORITY_MIN to 0
+      scaled = static_cast<float>(prio) / lowest * OSD_POOL_PRIORITY_MIN;
+    } else {
+      continue;
+    }
+
+    pg_pool_t updated = entry.second;
+    if (scaled == 0) {
+      updated.opts.unset(key);
+    } else {
+      updated.opts.set(key, static_cast<int64_t>(scaled));
+    }
+    dout(10) << __func__ << " pool " << pool_id
+	     << " recovery_priority adjusted "
+	     << prio << " to " << scaled << dendl;
+    updated.last_change = pending_inc.epoch;
+    pending_inc.new_pools[pool_id] = updated;
+  }
+}
+
+/**
+ * Validate every pool for stretch mode and collect their pending copies.
+ *
+ * All pools must be replicated, and each must either have the default
+ * size/min_size or already use the requested crush rule. Each accepted
+ * pool's pending_inc entry is added to *pools.
+ *
+ * @param ss             receives the reason on failure
+ * @param okay           set to false on entry and to true only on success
+ * @param errcode        set on failure (-EINVAL, or the negative value
+ *                       returned by the crush rule lookup)
+ * @param pools          output: pending pool entries to be modified later
+ * @param new_crush_rule name of the crush rule stretch pools will use
+ */
+void OSDMonitor::try_enable_stretch_mode_pools(stringstream& ss, bool *okay,
+					       int *errcode,
+					       set<pg_pool_t*>* pools,
+					       const string& new_crush_rule)
+{
+  dout(20) << __func__ << dendl;
+  *okay = false;
+  int new_crush_rule_result = osdmap.crush->get_rule_id(new_crush_rule);
+  if (new_crush_rule_result < 0) {
+    // report the rule *name* the caller asked for, not the error code
+    ss << "unrecognized crush rule " << new_crush_rule;
+    *errcode = new_crush_rule_result;
+    return;
+  }
+  __u8 new_rule = static_cast<__u8>(new_crush_rule_result);
+
+  // The defaults do not change while we iterate; look them up once.
+  const uint8_t default_size =
+    g_conf().get_val<uint64_t>("osd_pool_default_size");
+  const auto default_min_size =
+    g_conf().get_osd_pool_default_min_size(default_size);
+
+  for (const auto& pooli : osdmap.pools) {
+    int64_t poolid = pooli.first;
+    const pg_pool_t *p = &pooli.second;
+    if (!p->is_replicated()) {
+      ss << "stretched pools must be replicated; '" << osdmap.pool_name[poolid] << "' is erasure-coded";
+      *errcode = -EINVAL;
+      return;
+    }
+    // Non-default sizing is tolerated only for pools already on the new rule.
+    if ((p->get_size() != default_size ||
+	 (p->get_min_size() != default_min_size)) &&
+	(p->get_crush_rule() != new_rule)) {
+      ss << "we currently require stretch mode pools start out with the"
+	" default size/min_size, which '" << osdmap.pool_name[poolid] << "' does not";
+      *errcode = -EINVAL;
+      return;
+    }
+    pg_pool_t *pp = pending_inc.get_new_pool(poolid, p);
+    // TODO: The part where we unconditionally copy the pools into pending_inc is bad
+    // the attempt may fail and then we have these pool updates...but they won't do anything
+    // if there is a failure, so if it's hard to change the interface, no need to bother
+    pools->insert(pp);
+  }
+  *okay = true;
+}
+
+/**
+ * Validate (and optionally commit) enabling stretch mode.
+ *
+ * Checks that dividing_bucket is a valid crush type with exactly two
+ * instances of equal weight, that new_crush_rule exists in the pending
+ * crush map, and that bucket_count is 2. With commit set, every pool in
+ * `pools` is switched to the new rule and stretch sizing, and the stretch
+ * mode fields of pending_inc are filled in.
+ *
+ * Each failure branch asserts !commit: committing is expected to happen
+ * only after an identical dry run has succeeded.
+ *
+ * @param ss              receives the reason on failure
+ * @param okay            set to false on entry and to true only on success
+ * @param errcode         set on failure
+ * @param commit          false for a dry run, true to stage the changes
+ * @param dividing_bucket crush bucket type name that separates the sites
+ * @param bucket_count    number of sites; must be 2
+ * @param pools           pending pool entries to modify when committing
+ * @param new_crush_rule  name of the crush rule stretch pools will use
+ */
+void OSDMonitor::try_enable_stretch_mode(stringstream& ss, bool *okay,
+					 int *errcode, bool commit,
+					 const string& dividing_bucket,
+					 uint32_t bucket_count,
+					 const set<pg_pool_t*>& pools,
+					 const string& new_crush_rule)
+{
+  dout(20) << __func__ << dendl;
+  *okay = false;
+  CrushWrapper crush = _get_pending_crush();
+  int dividing_id = -1;
+  if (auto type_id = crush.get_validated_type_id(dividing_bucket);
+      !type_id.has_value()) {
+    ss << dividing_bucket << " is not a valid crush bucket type";
+    *errcode = -ENOENT;
+    ceph_assert(!commit);
+    return;
+  } else {
+    dividing_id = *type_id;
+  }
+  vector<int> subtrees;
+  crush.get_subtree_of_type(dividing_id, &subtrees);
+  if (subtrees.size() != 2) {
+    // keep the count and the bucket type name separated in the message
+    ss << "there are " << subtrees.size() << " " << dividing_bucket
+       << "'s in the cluster but stretch mode currently only works with 2!";
+    *errcode = -EINVAL;
+    ceph_assert(!commit || subtrees.size() == 2);
+    return;
+  }
+
+  int new_crush_rule_result = crush.get_rule_id(new_crush_rule);
+  if (new_crush_rule_result < 0) {
+    ss << "unrecognized crush rule " << new_crush_rule;
+    *errcode = new_crush_rule_result;
+    ceph_assert(!commit || (new_crush_rule_result > 0));
+    return;
+  }
+  __u8 new_rule = static_cast<__u8>(new_crush_rule_result);
+
+  int weight1 = crush.get_item_weight(subtrees[0]);
+  int weight2 = crush.get_item_weight(subtrees[1]);
+  if (weight1 != weight2) {
+    // TODO: I'm really not sure this is a good idea?
+    ss << "the 2 " << dividing_bucket
+       << " instances in the cluster have differing weights "
+       << weight1 << " and " << weight2
+       <<" but stretch mode currently requires they be the same!";
+    *errcode = -EINVAL;
+    ceph_assert(!commit || (weight1 == weight2));
+    return;
+  }
+  if (bucket_count != 2) {
+    ss << "currently we only support 2-site stretch clusters!";
+    *errcode = -EINVAL;
+    ceph_assert(!commit || bucket_count == 2);
+    return;
+  }
+  // TODO: check CRUSH rules for pools so that we are appropriately divided
+  if (commit) {
+    // The size options are the same for every pool; read them once.
+    const auto stretch_size =
+      g_conf().get_val<uint64_t>("mon_stretch_pool_size");
+    const auto stretch_min_size =
+      g_conf().get_val<uint64_t>("mon_stretch_pool_min_size");
+    for (auto pool : pools) {
+      pool->crush_rule = new_rule;
+      pool->peering_crush_bucket_count = bucket_count;
+      pool->peering_crush_bucket_target = bucket_count;
+      pool->peering_crush_bucket_barrier = dividing_id;
+      pool->peering_crush_mandatory_member = CRUSH_ITEM_NONE;
+      pool->size = stretch_size;
+      pool->min_size = stretch_min_size;
+    }
+    pending_inc.change_stretch_mode = true;
+    pending_inc.stretch_mode_enabled = true;
+    pending_inc.new_stretch_bucket_count = bucket_count;
+    pending_inc.new_degraded_stretch_mode = 0;
+    pending_inc.new_stretch_mode_bucket = dividing_id;
+  }
+  *okay = true;
+}
+
+/**
+ * Determine which of the reportedly dead monitor zones are also down
+ * according to the OSD map.
+ *
+ * For each named crush bucket whose whole subtree is down, the bucket id
+ * is added to *really_down_buckets and the associated monitor names are
+ * added to *really_down_mons.
+ *
+ * @param dead_buckets        crush bucket name -> names of the mons in it
+ * @param really_down_buckets output: ids of buckets whose subtree is down
+ * @param really_down_mons    output: mons belonging to those buckets
+ * @return true if at least one bucket was found to be down
+ */
+bool OSDMonitor::check_for_dead_crush_zones(const map<string,set<string>>& dead_buckets,
+					    set<int> *really_down_buckets,
+					    set<string> *really_down_mons)
+{
+  dout(20) << __func__ << " with dead mon zones " << dead_buckets << dendl;
+  ceph_assert(is_readable());
+  if (dead_buckets.empty()) return false;
+  set<int> down_cache;  // shared across subtree_is_down() calls
+  bool really_down = false;
+  // iterate by reference: each entry holds a string and a set<string>
+  for (const auto& dbi : dead_buckets) {
+    const string& bucket_name = dbi.first;
+    ceph_assert(osdmap.crush->name_exists(bucket_name));
+    int bucket_id = osdmap.crush->get_item_id(bucket_name);
+    dout(20) << "Checking " << bucket_name << " id " << bucket_id
+	     << " to see if OSDs are also down" << dendl;
+    bool subtree_down = osdmap.subtree_is_down(bucket_id, &down_cache);
+    if (subtree_down) {
+      dout(20) << "subtree is down!" << dendl;
+      really_down = true;
+      really_down_buckets->insert(bucket_id);
+      really_down_mons->insert(dbi.second.begin(), dbi.second.end());
+    }
+  }
+  dout(10) << "We determined CRUSH buckets " << *really_down_buckets
+	   << " and mons " << *really_down_mons << " are really down" << dendl;
+  return really_down;
+}
+
+/**
+ * Switch the cluster into degraded stretch mode and propose the change.
+ *
+ * Only the two-site case is supported: exactly one site must remain, which
+ * both assertions below enforce. Every stretch pool (nonzero
+ * peering_crush_bucket_count) is updated to peer within the surviving site
+ * only and gets half its current min_size.
+ *
+ * @param dead_buckets ids of the crush buckets that are down
+ * @param live_zones   names of the crush buckets still alive (exactly one)
+ */
+void OSDMonitor::trigger_degraded_stretch_mode(const set<int>& dead_buckets,
+					       const set<string>& live_zones)
+{
+  dout(20) << __func__ << dendl;
+  stretch_recovery_triggered.set_from_double(0); // reset this; we can't go clean now!
+  // update the general OSDMap changes
+  pending_inc.change_stretch_mode = true;
+  pending_inc.stretch_mode_enabled = osdmap.stretch_mode_enabled;
+  pending_inc.new_stretch_bucket_count = osdmap.stretch_bucket_count;
+  int new_site_count = osdmap.stretch_bucket_count - dead_buckets.size();
+  ceph_assert(new_site_count == 1); // stretch count 2!
+  pending_inc.new_degraded_stretch_mode = new_site_count;
+  pending_inc.new_recovering_stretch_mode = 0;
+  pending_inc.new_stretch_mode_bucket = osdmap.stretch_mode_bucket;
+
+  // and then apply them to all the pg_pool_ts
+  ceph_assert(live_zones.size() == 1); // only support 2 zones now
+  const string& remaining_site_name = *(live_zones.begin());
+  ceph_assert(osdmap.crush->name_exists(remaining_site_name));
+  int remaining_site = osdmap.crush->get_item_id(remaining_site_name);
+  // iterate by reference so each pg_pool_t is not copied just to be inspected
+  for (const auto& pgi : osdmap.pools) {
+    if (pgi.second.peering_crush_bucket_count) {
+      pg_pool_t& newp = *pending_inc.get_new_pool(pgi.first, &pgi.second);
+      newp.peering_crush_bucket_count = new_site_count;
+      newp.peering_crush_mandatory_member = remaining_site;
+      newp.min_size = pgi.second.min_size / 2; // only support 2 zones now
+      newp.set_last_force_op_resend(pending_inc.epoch);
+    }
+  }
+  propose_pending();
+}
+
+/**
+ * Mark the cluster as recovering from degraded stretch mode and propose
+ * the change.
+ *
+ * The existing stretch settings are carried over from the committed map;
+ * only new_recovering_stretch_mode is turned on. Every stretch pool
+ * (nonzero peering_crush_bucket_count) gets its last_force_op_resend set
+ * to the pending epoch.
+ */
+void OSDMonitor::trigger_recovery_stretch_mode()
+{
+  dout(20) << __func__ << dendl;
+  stretch_recovery_triggered.set_from_double(0); // reset this so we don't go full-active prematurely
+  pending_inc.change_stretch_mode = true;
+  pending_inc.stretch_mode_enabled = osdmap.stretch_mode_enabled;
+  pending_inc.new_stretch_bucket_count = osdmap.stretch_bucket_count;
+  pending_inc.new_degraded_stretch_mode = osdmap.degraded_stretch_mode;
+  pending_inc.new_recovering_stretch_mode = 1;
+  pending_inc.new_stretch_mode_bucket = osdmap.stretch_mode_bucket;
+
+  // iterate by reference so each pg_pool_t is not copied just to be inspected
+  for (const auto& pgi : osdmap.pools) {
+    if (pgi.second.peering_crush_bucket_count) {
+      pg_pool_t& newp = *pending_inc.get_new_pool(pgi.first, &pgi.second);
+      newp.set_last_force_op_resend(pending_inc.epoch);
+    }
+  }
+  propose_pending();
+}
+
+/// Entering degraded stretch mode: clear the recovery timestamp.
+void OSDMonitor::set_degraded_stretch_mode()
+{
+  stretch_recovery_triggered.set_from_double(0.0);
+}
+
+/// Entering recovery stretch mode: record the start time, but only if no
+/// start time is recorded yet.
+void OSDMonitor::set_recovery_stretch_mode()
+{
+  if (!stretch_recovery_triggered.is_zero()) {
+    return;  // keep the timestamp that is already set
+  }
+  stretch_recovery_triggered = ceph_clock_now();
+}
+
+/// Back to healthy stretch mode: clear the recovery timestamp.
+void OSDMonitor::set_healthy_stretch_mode()
+{
+  stretch_recovery_triggered.set_from_double(0.0);
+}
+
+/// Called when a new PG digest arrives: if a stretch recovery start time
+/// is recorded, check (without forcing) whether recovery can end.
+void OSDMonitor::notify_new_pg_digest()
+{
+  dout(20) << __func__ << dendl;
+  if (stretch_recovery_triggered.is_zero()) {
+    return;
+  }
+  try_end_recovery_stretch_mode(false);
+}
+
+/// Completion that re-runs OSDMonitor::try_end_recovery_stretch_mode() with
+/// the same `force` value once whatever it was queued on becomes ready.
+struct CMonExitRecovery : public Context {
+  OSDMonitor *m;   ///< monitor service to call back into
+  bool force;      ///< forwarded unchanged to try_end_recovery_stretch_mode()
+  CMonExitRecovery(OSDMonitor *mon, bool f) : m(mon), force(f) {}
+  // `override` makes the compiler verify this really implements
+  // Context::finish; the result code r is not used.
+  void finish(int r) override {
+    m->try_end_recovery_stretch_mode(force);
+  }
+};
+
+/**
+ * Leave recovery stretch mode if conditions allow it.
+ *
+ * Acts only on the leader while the monitor is both in degraded and in
+ * recovering stretch mode. If this service or the mgrstat service is not
+ * readable yet, a CMonExitRecovery is queued to retry later. Unless forced,
+ * recovery must have been running for longer than
+ * mon_stretch_recovery_min_wait and the PG digest must report no degraded,
+ * inactive or unknown PGs.
+ *
+ * @param force skip the wait-time and PG-state requirements
+ */
+void OSDMonitor::try_end_recovery_stretch_mode(bool force)
+{
+  dout(20) << __func__ << dendl;
+  const bool applicable = mon.is_leader() &&
+    mon.is_degraded_stretch_mode() &&
+    mon.is_recovering_stretch_mode();
+  if (!applicable) {
+    return;
+  }
+  if (!is_readable()) {
+    wait_for_readable_ctx(new CMonExitRecovery(this, force));
+    return;
+  }
+
+  if (!osdmap.recovering_stretch_mode) {
+    return;
+  }
+  if (!force) {
+    // Without force, recovery must have started and the minimum wait
+    // must have elapsed since then.
+    if (stretch_recovery_triggered.is_zero()) {
+      return;
+    }
+    const double min_wait =
+      g_conf().get_val<double>("mon_stretch_recovery_min_wait");
+    if (!(ceph_clock_now() - min_wait > stretch_recovery_triggered)) {
+      return;
+    }
+  }
+
+  auto *statmon = mon.mgrstatmon();
+  if (!statmon->is_readable()) {
+    statmon->wait_for_readable_ctx(new CMonExitRecovery(this, force));
+    return;
+  }
+
+  const PGMapDigest& digest = statmon->get_digest();
+  double misplaced;
+  double degraded;
+  double inactive;
+  double unknown;
+  digest.get_recovery_stats(&misplaced, &degraded, &inactive, &unknown);
+
+  // misplaced PGs do not block leaving recovery mode
+  const bool all_clean =
+    degraded == 0.0 && inactive == 0.0 && unknown == 0.0;
+  if (force || all_clean) {
+    // we can exit degraded stretch mode!
+    mon.trigger_healthy_stretch_mode();
+  }
+}
+
+/**
+ * Return the cluster to healthy stretch mode and propose the change.
+ *
+ * Requires the service to be writeable. Degraded and recovering flags are
+ * cleared, and every stretch pool (nonzero peering_crush_bucket_count) is
+ * restored to the full bucket count, no mandatory member, and the
+ * configured mon_stretch_pool_min_size.
+ */
+void OSDMonitor::trigger_healthy_stretch_mode()
+{
+  ceph_assert(is_writeable());
+  stretch_recovery_triggered.set_from_double(0);
+  pending_inc.change_stretch_mode = true;
+  pending_inc.stretch_mode_enabled = osdmap.stretch_mode_enabled;
+  pending_inc.new_stretch_bucket_count = osdmap.stretch_bucket_count;
+  pending_inc.new_degraded_stretch_mode = 0; // turn off degraded mode...
+  pending_inc.new_recovering_stretch_mode = 0; //...and recovering mode!
+  pending_inc.new_stretch_mode_bucket = osdmap.stretch_mode_bucket;
+  // iterate by reference so each pg_pool_t is not copied just to be inspected
+  for (const auto& pgi : osdmap.pools) {
+    if (pgi.second.peering_crush_bucket_count) {
+      pg_pool_t& newp = *pending_inc.get_new_pool(pgi.first, &pgi.second);
+      newp.peering_crush_bucket_count = osdmap.stretch_bucket_count;
+      newp.peering_crush_mandatory_member = CRUSH_ITEM_NONE;
+      newp.min_size = g_conf().get_val<uint64_t>("mon_stretch_pool_min_size");
+      newp.set_last_force_op_resend(pending_inc.epoch);
+    }
+  }
+  propose_pending();
+}
diff --git a/src/mon/OSDMonitor.h b/src/mon/OSDMonitor.h
new file mode 100644
index 000000000..f522f2b4c
--- /dev/null
+++ b/src/mon/OSDMonitor.h
@@ -0,0 +1,883 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
+ *
+ * Author: Loic Dachary <loic@dachary.org>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+/* Object Store Device (OSD) Monitor
+ */
+
+#ifndef CEPH_OSDMONITOR_H
+#define CEPH_OSDMONITOR_H
+
+#include <map>
+#include <set>
+#include <utility>
+#include <sstream>
+
+#include "include/types.h"
+#include "include/encoding.h"
+#include "common/simple_cache.hpp"
+#include "common/PriorityCache.h"
+#include "msg/Messenger.h"
+
+#include "osd/OSDMap.h"
+#include "osd/OSDMapMapping.h"
+
+#include "CreatingPGs.h"
+#include "PaxosService.h"
+
+#include "erasure-code/ErasureCodeInterface.h"
+#include "mon/MonOpRequest.h"
+#include <boost/functional/hash.hpp>
+
+class Monitor;
+class PGMap;
+struct MonSession;
+class MOSDMap;
+
+
+/// information about a particular peer's failure reports for one osd
+///
+/// No destructor is declared on purpose: a user-provided destructor would
+/// suppress the implicit move operations and force copies of `op`.
+struct failure_reporter_t {
+  utime_t failed_since;     ///< when they think it failed
+  MonOpRequestRef op;       ///< failure op request
+
+  failure_reporter_t() = default;
+  /// @param s  time since which the reporter considers the osd failed
+  /// @param op the failure report request; moved into the new object
+  failure_reporter_t(utime_t s, MonOpRequestRef op)
+    : failed_since(s), op(std::move(op)) {}
+};
+
+/// information about all failure reports for one osd
+struct failure_info_t {
+  std::map<int, failure_reporter_t> reporters;  ///< reporter -> failed_since etc
+  /// most recent failed_since; a default-constructed utime_t means the
+  /// value must be recomputed from `reporters`
+  utime_t max_failed_since;
+
+  failure_info_t() {}
+
+  /// Return the latest failed_since over all reporters, recomputing the
+  /// cached value first if it was invalidated and reporters remain.
+  utime_t get_failed_since() {
+    const bool stale = (max_failed_since == utime_t()) && !reporters.empty();
+    if (stale) {
+      // the old max must have canceled; recalculate.
+      for (const auto& entry : reporters) {
+        const utime_t& reported = entry.second.failed_since;
+        if (reported > max_failed_since) {
+          max_failed_since = reported;
+        }
+      }
+    }
+    return max_failed_since;
+  }
+
+  /// Record (or replace) the report from `who`.
+  /// The cached maximum is raised only for a first-time reporter and only
+  /// while the cache is valid; otherwise it is left untouched.
+  void add_report(int who, utime_t failed_since, MonOpRequestRef op) {
+    const auto outcome =
+      reporters.insert_or_assign(who, failure_reporter_t{failed_since, op});
+    const bool first_report_from_who = outcome.second;
+    if (!first_report_from_who) {
+      return;
+    }
+    if (max_failed_since == utime_t()) {
+      return;  // cache is invalid; get_failed_since() will rebuild it
+    }
+    if (max_failed_since < failed_since) {
+      max_failed_since = failed_since;
+    }
+  }
+
+  /// Move every still-held report request to the back of `ls`, leaving the
+  /// reporters themselves in place with an empty op.
+  void take_report_messages(std::list<MonOpRequestRef>& ls) {
+    for (auto& entry : reporters) {
+      MonOpRequestRef& held = entry.second.op;
+      if (held) {
+        ls.push_back(held);
+        held.reset();
+      }
+    }
+  }
+
+  /// Drop the report from `who` and invalidate the cached maximum.
+  void cancel_report(int who) {
+    reporters.erase(who);
+    max_failed_since = utime_t();
+  }
+};
+
+
+/// Bookkeeping of reported last_epoch_clean values, grouped per pool.
+/// NOTE(review): all member functions are only declared here; their exact
+/// semantics live in the implementation file.
+class LastEpochClean {
+  /// Per-pool record.
+  struct Lec {
+    std::vector<epoch_t> epoch_by_pg;  ///< one epoch slot per PG
+    ps_t next_missing{0};
+    /// starts at the largest representable epoch
+    epoch_t floor{std::numeric_limits<epoch_t>::max()};
+    void report(unsigned pg_num, ps_t pg, epoch_t last_epoch_clean);
+  };
+  std::map<uint64_t, Lec> report_by_pool;  ///< pool id -> per-pool record
+public:
+  void report(unsigned pg_num, const pg_t& pg, epoch_t last_epoch_clean);
+  void remove_pool(uint64_t pool);
+  epoch_t get_lower_bound(const OSDMap& latest) const;
+
+  void dump(Formatter *f) const;
+};
+
+
+/// Set of pinned osdmap versions, with versioned encoding.
+struct osdmap_manifest_t {
+  // all the maps we have pinned -- i.e., won't be removed unless
+  // they are inside a trim interval.
+  std::set<version_t> pinned;
+
+  osdmap_manifest_t() {}
+
+  /// @return the highest pinned version, or 0 if nothing is pinned
+  version_t get_last_pinned() const
+  {
+    auto it = pinned.crbegin();
+    if (it == pinned.crend()) {
+      return 0;
+    }
+    return *it;
+  }
+
+  /// @return the lowest pinned version, or 0 if nothing is pinned
+  version_t get_first_pinned() const
+  {
+    auto it = pinned.cbegin();
+    if (it == pinned.cend()) {
+      return 0;
+    }
+    return *it;
+  }
+
+  /// @return true if exactly version v is pinned
+  bool is_pinned(version_t v) const
+  {
+    return pinned.find(v) != pinned.end();
+  }
+
+  /// Add v to the pinned set (no effect if already present).
+  void pin(version_t v)
+  {
+    pinned.insert(v);
+  }
+
+  /// Find the pinned version closest to v from below.
+  ///
+  /// @return v itself if pinned; otherwise the largest pinned version
+  ///         below v. Returns 0 if no pinned version is <= v, and also
+  ///         if v is greater than every pinned version (lower_bound()
+  ///         yields end() in that case).
+  version_t get_lower_closest_pinned(version_t v) const {
+    auto p = pinned.lower_bound(v);
+    if (p == pinned.cend()) {
+      return 0;
+    } else if (*p > v) {
+      // first element >= v overshoots; step back if there is a predecessor
+      if (p == pinned.cbegin()) {
+	return 0;
+      }
+      --p;
+    }
+    return *p;
+  }
+
+  void encode(ceph::buffer::list& bl) const
+  {
+    ENCODE_START(1, 1, bl);
+    encode(pinned, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(ceph::buffer::list::const_iterator& bl)
+  {
+    DECODE_START(1, bl);
+    decode(pinned, bl);
+    DECODE_FINISH(bl);
+  }
+
+  /// Convenience overload: decode from the beginning of bl.
+  void decode(ceph::buffer::list& bl) {
+    auto p = bl.cbegin();
+    decode(p);
+  }
+
+  /// Dump first/last pinned version and the full pinned list.
+  /// Read-only, hence const (callable on const manifests as well).
+  void dump(ceph::Formatter *f) const {
+    f->dump_unsigned("first_pinned", get_first_pinned());
+    f->dump_unsigned("last_pinned", get_last_pinned());
+    f->open_array_section("pinned_maps");
+    for (const auto& epoch : pinned) {
+      f->dump_unsigned("epoch", epoch);
+    }
+    f->close_section();
+  }
+};
+WRITE_CLASS_ENCODER(osdmap_manifest_t);
+
+class OSDMonitor : public PaxosService,
+ public md_config_obs_t {
+ CephContext *cct;
+
+public:
+ OSDMap osdmap;
+
+ // config observer
+ const char** get_tracked_conf_keys() const override;
+ void handle_conf_change(const ConfigProxy& conf,
+ const std::set<std::string> &changed) override;
+ // [leader]
+ OSDMap::Incremental pending_inc;
+ std::map<int, ceph::buffer::list> pending_metadata;
+ std::set<int> pending_metadata_rm;
+ std::map<int, failure_info_t> failure_info;
+ std::map<int,utime_t> down_pending_out; // osd down -> out
+ bool priority_convert = false;
+ std::shared_ptr<PriorityCache::PriCache> rocksdb_binned_kv_cache = nullptr;
+ std::shared_ptr<PriorityCache::Manager> pcm = nullptr;
+ ceph::mutex balancer_lock = ceph::make_mutex("OSDMonitor::balancer_lock");
+
+ std::map<int,double> osd_weight;
+
+ using osdmap_key_t = std::pair<version_t, uint64_t>;
+ using osdmap_cache_t = SimpleLRU<osdmap_key_t,
+ ceph::buffer::list,
+ std::less<osdmap_key_t>,
+ boost::hash<osdmap_key_t>>;
+ osdmap_cache_t inc_osd_cache;
+ osdmap_cache_t full_osd_cache;
+
+ bool has_osdmap_manifest;
+ osdmap_manifest_t osdmap_manifest;
+
+ bool check_failures(utime_t now);
+ bool check_failure(utime_t now, int target_osd, failure_info_t& fi);
+ utime_t get_grace_time(utime_t now, int target_osd, failure_info_t& fi) const;
+ bool is_failure_stale(utime_t now, failure_info_t& fi) const;
+ void force_failure(int target_osd, int by);
+
+ bool _have_pending_crush();
+ CrushWrapper &_get_stable_crush();
+ CrushWrapper _get_pending_crush();
+
+ enum FastReadType {
+ FAST_READ_OFF,
+ FAST_READ_ON,
+ FAST_READ_DEFAULT
+ };
+
+ /// ParallelPGMapper job that checks batches of PGs for pg_upmap entries
+ /// needing cancellation or remapping and records the fixes in pending_inc.
+ struct CleanUpmapJob : public ParallelPGMapper::Job {
+   CephContext *cct;
+   const OSDMap& osdmap;
+   OSDMap::Incremental& pending_inc;
+   // guards pending_inc against concurrent modification while
+   // the results of a check are being applied
+   ceph::mutex pending_inc_lock =
+     ceph::make_mutex("CleanUpmapJob::pending_inc_lock");
+
+   CleanUpmapJob(CephContext *cct, const OSDMap& om, OSDMap::Incremental& pi)
+     : ParallelPGMapper::Job(&om),
+       cct(cct),
+       osdmap(om),
+       pending_inc(pi) {}
+
+   /// Check one batch of PGs; apply any findings under pending_inc_lock.
+   void process(const std::vector<pg_t>& to_check) override {
+     std::vector<pg_t> cancel_list;
+     std::map<pg_t, mempool::osdmap::vector<std::pair<int,int>>> remap_list;
+     osdmap.check_pg_upmaps(cct, to_check, &cancel_list, &remap_list);
+     // don't bother taking lock if nothing changes
+     if (cancel_list.empty() && remap_list.empty()) {
+       return;
+     }
+     std::lock_guard l(pending_inc_lock);
+     osdmap.clean_pg_upmaps(cct, &pending_inc, cancel_list, remap_list);
+   }
+
+   // the range-based and completion hooks are intentionally no-ops
+   void process(int64_t poolid, unsigned ps_begin, unsigned ps_end) override {}
+   void complete() override {}
+ }; // public as this will need to be accessible from TestTestOSDMap.cc
+
+ // svc
+public:
+ void create_initial() override;
+ void get_store_prefixes(std::set<std::string>& s) const override;
+
+private:
+ void update_from_paxos(bool *need_bootstrap) override;
+ void create_pending() override; // prepare a new pending
+ void encode_pending(MonitorDBStore::TransactionRef t) override;
+ void on_active() override;
+ void on_restart() override;
+ void on_shutdown() override;
+
+ /* osdmap full map prune */
+ void load_osdmap_manifest();
+ bool should_prune() const;
+ void _prune_update_trimmed(
+ MonitorDBStore::TransactionRef tx,
+ version_t first);
+ void prune_init(osdmap_manifest_t& manifest);
+ bool _prune_sanitize_options() const;
+ bool is_prune_enabled() const;
+ bool is_prune_supported() const;
+ bool do_prune(MonitorDBStore::TransactionRef tx);
+
+ // Priority cache control
+ uint32_t mon_osd_cache_size = 0; ///< Number of cached OSDMaps
+ uint64_t rocksdb_cache_size = 0; ///< Cache for kv Db
+ double cache_kv_ratio = 0; ///< Cache ratio dedicated to kv
+ double cache_inc_ratio = 0; ///< Cache ratio dedicated to inc
+ double cache_full_ratio = 0; ///< Cache ratio dedicated to full
+ uint64_t mon_memory_base = 0; ///< Mon base memory for cache autotuning
+ double mon_memory_fragmentation = 0; ///< Expected memory fragmentation
+ uint64_t mon_memory_target = 0; ///< Mon target memory for cache autotuning
+ uint64_t mon_memory_min = 0; ///< Min memory to cache osdmaps
+ bool mon_memory_autotune = false; ///< Cache auto tune setting
+ int register_cache_with_pcm();
+ int _set_cache_sizes();
+ int _set_cache_ratios();
+ void _set_new_cache_sizes();
+ void _set_cache_autotuning();
+ int _update_mon_cache_settings();
+
+ friend struct OSDMemCache;
+ friend struct IncCache;
+ friend struct FullCache;
+
+ /**
+ * we haven't delegated full version stashing to paxosservice for some time
+ * now, making this function useless in current context.
+ */
+ void encode_full(MonitorDBStore::TransactionRef t) override { }
+ /**
+ * do not let paxosservice periodically stash full osdmaps, or we will break our
+ * locally-managed full maps. (update_from_paxos loads the latest and writes them
+ * out going forward from there, but if we just synced that may mean we skip some.)
+ */
+ bool should_stash_full() override {
+ return false;
+ }
+
+ /**
+ * hook into trim to include the oldest full map in the trim transaction
+ *
+ * This ensures that anyone post-sync will have enough to rebuild their
+ * full osdmaps.
+ */
+ void encode_trim_extra(MonitorDBStore::TransactionRef tx, version_t first) override;
+
+ void update_msgr_features();
+ /**
+ * check if the cluster supports the features required by the
+ * given crush map. Outputs the daemons which don't support it
+ * to the stringstream.
+ *
+ * @returns true if the map is passable, false otherwise
+ */
+ bool validate_crush_against_features(const CrushWrapper *newcrush,
+ std::ostream &ss);
+ void check_osdmap_subs();
+ void share_map_with_random_osd();
+
+ ceph::mutex prime_pg_temp_lock =
+ ceph::make_mutex("OSDMonitor::prime_pg_temp_lock");
+ /**
+  * Mapper job that hands every PG of a pool seed range to
+  * OSDMonitor::prime_pg_temp(), using the OSDMap the job was built with.
+  */
+ struct PrimeTempJob : public ParallelPGMapper::Job {
+   OSDMonitor *osdmon;
+
+   PrimeTempJob(const OSDMap& om, OSDMonitor *m)
+     : ParallelPGMapper::Job(&om),
+       osdmon(m)
+   {}
+
+   /// Visit each placement seed in [ps_begin, ps_end) of @p pool.
+   void process(int64_t pool, unsigned ps_begin, unsigned ps_end) override {
+     unsigned seed = ps_begin;
+     while (seed < ps_end) {
+       osdmon->prime_pg_temp(*osdmap, pg_t(seed, pool));
+       ++seed;
+     }
+   }
+
+   /// Explicit PG lists are not used by this job.
+   void process(const std::vector<pg_t>& pgs) override {}
+
+   /// Nothing to do on completion.
+   void complete() override {}
+ };
+ void maybe_prime_pg_temp();
+ void prime_pg_temp(const OSDMap& next, pg_t pgid);
+
+ ParallelPGMapper mapper; ///< for background pg work
+ OSDMapMapping mapping; ///< pg <-> osd mappings
+ std::unique_ptr<ParallelPGMapper::Job> mapping_job; ///< background mapping job
+ void start_mapping();
+
+ void update_logger();
+
+ void handle_query(PaxosServiceMessage *m);
+ bool preprocess_query(MonOpRequestRef op) override; // true if processed.
+ bool prepare_update(MonOpRequestRef op) override;
+ bool should_propose(double &delay) override;
+
+ version_t get_trim_to() const override;
+
+ bool can_mark_down(int o);
+ bool can_mark_up(int o);
+ bool can_mark_out(int o);
+ bool can_mark_in(int o);
+
+ // ...
+ MOSDMap *build_latest_full(uint64_t features);
+ MOSDMap *build_incremental(epoch_t first, epoch_t last, uint64_t features);
+ void send_full(MonOpRequestRef op);
+ void send_incremental(MonOpRequestRef op, epoch_t first);
+
+ bool remove_pool_snap(std::string_view snapname,
+ pg_pool_t &pp, int64_t pool);
+
+public:
+ /**
+ * Make sure the existing (up) OSDs support the given features
+ * @return 0 on success, or an error code if any OSDs are missing features.
+ * @param ss Filled in with an explanation of failure, if any
+ */
+ int check_cluster_features(uint64_t features, std::stringstream &ss);
+ // @param req an optional op request, if the osdmaps are replies to it. so
+ // @c Monitor::send_reply() can mark_event with it.
+ void send_incremental(epoch_t first, MonSession *session, bool onetime,
+ MonOpRequestRef req = MonOpRequestRef());
+
+private:
+ void print_utilization(std::ostream &out, ceph::Formatter *f, bool tree) const;
+
+ bool check_source(MonOpRequestRef op, uuid_d fsid);
+
+ bool preprocess_get_osdmap(MonOpRequestRef op);
+
+ bool preprocess_mark_me_down(MonOpRequestRef op);
+
+ friend class C_AckMarkedDown;
+ bool preprocess_failure(MonOpRequestRef op);
+ bool prepare_failure(MonOpRequestRef op);
+ bool prepare_mark_me_down(MonOpRequestRef op);
+ void process_failures();
+ void take_all_failures(std::list<MonOpRequestRef>& ls);
+
+ bool preprocess_mark_me_dead(MonOpRequestRef op);
+ bool prepare_mark_me_dead(MonOpRequestRef op);
+
+ bool preprocess_full(MonOpRequestRef op);
+ bool prepare_full(MonOpRequestRef op);
+
+ bool preprocess_boot(MonOpRequestRef op);
+ bool prepare_boot(MonOpRequestRef op);
+ void _booted(MonOpRequestRef op, bool logit);
+
+ void update_up_thru(int from, epoch_t up_thru);
+ bool preprocess_alive(MonOpRequestRef op);
+ bool prepare_alive(MonOpRequestRef op);
+ void _reply_map(MonOpRequestRef op, epoch_t e);
+
+ bool preprocess_pgtemp(MonOpRequestRef op);
+ bool prepare_pgtemp(MonOpRequestRef op);
+
+ bool preprocess_pg_created(MonOpRequestRef op);
+ bool prepare_pg_created(MonOpRequestRef op);
+
+ bool preprocess_pg_ready_to_merge(MonOpRequestRef op);
+ bool prepare_pg_ready_to_merge(MonOpRequestRef op);
+
+ int _check_remove_pool(int64_t pool_id, const pg_pool_t &pool, std::ostream *ss);
+ bool _check_become_tier(
+ int64_t tier_pool_id, const pg_pool_t *tier_pool,
+ int64_t base_pool_id, const pg_pool_t *base_pool,
+ int *err, std::ostream *ss) const;
+ bool _check_remove_tier(
+ int64_t base_pool_id, const pg_pool_t *base_pool, const pg_pool_t *tier_pool,
+ int *err, std::ostream *ss) const;
+
+ int _prepare_remove_pool(int64_t pool, std::ostream *ss, bool no_fake);
+ int _prepare_rename_pool(int64_t pool, std::string newname);
+
+ bool enforce_pool_op_caps(MonOpRequestRef op);
+ bool preprocess_pool_op (MonOpRequestRef op);
+ bool preprocess_pool_op_create (MonOpRequestRef op);
+ bool prepare_pool_op (MonOpRequestRef op);
+ bool prepare_pool_op_create (MonOpRequestRef op);
+ bool prepare_pool_op_delete(MonOpRequestRef op);
+ int crush_rename_bucket(const std::string& srcname,
+ const std::string& dstname,
+ std::ostream *ss);
+ void check_legacy_ec_plugin(const std::string& plugin,
+ const std::string& profile) const;
+ int normalize_profile(const std::string& profilename,
+ ceph::ErasureCodeProfile &profile,
+ bool force,
+ std::ostream *ss);
+ int crush_rule_create_erasure(const std::string &name,
+ const std::string &profile,
+ int *rule,
+ std::ostream *ss);
+ int get_crush_rule(const std::string &rule_name,
+ int *crush_rule,
+ std::ostream *ss);
+ int get_erasure_code(const std::string &erasure_code_profile,
+ ceph::ErasureCodeInterfaceRef *erasure_code,
+ std::ostream *ss) const;
+ int prepare_pool_crush_rule(const unsigned pool_type,
+ const std::string &erasure_code_profile,
+ const std::string &rule_name,
+ int *crush_rule,
+ std::ostream *ss);
+ bool erasure_code_profile_in_use(
+ const mempool::osdmap::map<int64_t, pg_pool_t> &pools,
+ const std::string &profile,
+ std::ostream *ss);
+ int parse_erasure_code_profile(const std::vector<std::string> &erasure_code_profile,
+ std::map<std::string,std::string> *erasure_code_profile_map,
+ std::ostream *ss);
+ int prepare_pool_size(const unsigned pool_type,
+ const std::string &erasure_code_profile,
+ uint8_t repl_size,
+ unsigned *size, unsigned *min_size,
+ std::ostream *ss);
+ int prepare_pool_stripe_width(const unsigned pool_type,
+ const std::string &erasure_code_profile,
+ unsigned *stripe_width,
+ std::ostream *ss);
+ uint32_t get_osd_num_by_crush(int crush_rule);
+ int check_pg_num(int64_t pool, int pg_num, int size, int crush_rule, std::ostream* ss);
+ int prepare_new_pool(std::string& name,
+ int crush_rule,
+ const std::string &crush_rule_name,
+ unsigned pg_num, unsigned pgp_num,
+ unsigned pg_num_min,
+ unsigned pg_num_max,
+ uint64_t repl_size,
+ const uint64_t target_size_bytes,
+ const float target_size_ratio,
+ const std::string &erasure_code_profile,
+ const unsigned pool_type,
+ const uint64_t expected_num_objects,
+ FastReadType fast_read,
+ std::string pg_autoscale_mode,
+ bool bulk,
+ bool crimson,
+ std::ostream *ss);
+ int prepare_new_pool(MonOpRequestRef op);
+
+ void set_pool_flags(int64_t pool_id, uint64_t flags);
+ void clear_pool_flags(int64_t pool_id, uint64_t flags);
+ bool update_pools_status();
+
+ bool _is_removed_snap(int64_t pool_id, snapid_t snapid);
+ bool _is_pending_removed_snap(int64_t pool_id, snapid_t snapid);
+
+ std::string make_purged_snap_epoch_key(epoch_t epoch);
+ std::string make_purged_snap_key(int64_t pool, snapid_t snap);
+ std::string make_purged_snap_key_value(int64_t pool, snapid_t snap, snapid_t num,
+ epoch_t epoch, ceph::buffer::list *v);
+
+ bool try_prune_purged_snaps();
+ int lookup_purged_snap(int64_t pool, snapid_t snap,
+ snapid_t *begin, snapid_t *end);
+
+ void insert_purged_snap_update(
+ int64_t pool,
+ snapid_t start, snapid_t end,
+ epoch_t epoch,
+ MonitorDBStore::TransactionRef t);
+
+ bool prepare_set_flag(MonOpRequestRef op, int flag);
+ bool prepare_unset_flag(MonOpRequestRef op, int flag);
+
+ void _pool_op_reply(MonOpRequestRef op,
+ int ret, epoch_t epoch, ceph::buffer::list *blp=NULL);
+
+ /**
+  * Completion for a boot op.
+  *
+  * r >= 0       -> OSDMonitor::_booted(op, logit)
+  * -ECANCELED   -> nothing
+  * -EAGAIN      -> the op is dispatched again
+  * anything else aborts.
+  */
+ struct C_Booted : public C_MonOp {
+   OSDMonitor *cmon;
+   bool logit;
+
+   C_Booted(OSDMonitor *cm, MonOpRequestRef op_, bool l=true) :
+     C_MonOp(op_), cmon(cm), logit(l) {}
+
+   void _finish(int r) override {
+     if (r >= 0) {
+       cmon->_booted(op, logit);
+       return;
+     }
+     switch (r) {
+     case -ECANCELED:
+       break;
+     case -EAGAIN:
+       cmon->dispatch(op);
+       break;
+     default:
+       ceph_abort_msg("bad C_Booted return value");
+     }
+   }
+ };
+
+ /**
+  * Completion that answers an op via OSDMonitor::_reply_map(op, e).
+  *
+  * r >= 0       -> _reply_map(op, e)
+  * -ECANCELED   -> nothing
+  * -EAGAIN      -> the op is dispatched again
+  * anything else aborts.
+  */
+ struct C_ReplyMap : public C_MonOp {
+   OSDMonitor *osdmon;
+   epoch_t e;
+
+   C_ReplyMap(OSDMonitor *o, MonOpRequestRef op_, epoch_t ee)
+     : C_MonOp(op_), osdmon(o), e(ee) {}
+
+   void _finish(int r) override {
+     if (r >= 0) {
+       osdmon->_reply_map(op, e);
+       return;
+     }
+     switch (r) {
+     case -ECANCELED:
+       break;
+     case -EAGAIN:
+       osdmon->dispatch(op);
+       break;
+     default:
+       ceph_abort_msg("bad C_ReplyMap return value");
+     }
+   }
+ };
+ /**
+  * Completion that sends a pool-op reply carrying a stored reply code,
+  * epoch and (optionally) a copy of the caller's reply payload.
+  *
+  * r >= 0       -> _pool_op_reply(op, replyCode, epoch, &reply_data)
+  * -ECANCELED   -> nothing
+  * -EAGAIN      -> the op is dispatched again
+  * anything else aborts.
+  */
+ struct C_PoolOp : public C_MonOp {
+   OSDMonitor *osdmon;
+   int replyCode;
+   int epoch;
+   ceph::buffer::list reply_data;
+
+   C_PoolOp(OSDMonitor * osd, MonOpRequestRef op_, int rc, int e, ceph::buffer::list *rd=NULL) :
+     C_MonOp(op_), osdmon(osd), replyCode(rc), epoch(e) {
+     // The payload is copied so the context does not depend on *rd later.
+     if (rd) {
+       reply_data = *rd;
+     }
+   }
+
+   void _finish(int r) override {
+     if (r >= 0) {
+       osdmon->_pool_op_reply(op, replyCode, epoch, &reply_data);
+       return;
+     }
+     switch (r) {
+     case -ECANCELED:
+       break;
+     case -EAGAIN:
+       osdmon->dispatch(op);
+       break;
+     default:
+       ceph_abort_msg("bad C_PoolOp return value");
+     }
+   }
+ };
+
+ bool preprocess_remove_snaps(MonOpRequestRef op);
+ bool prepare_remove_snaps(MonOpRequestRef op);
+
+ bool preprocess_get_purged_snaps(MonOpRequestRef op);
+
+ int load_metadata(int osd, std::map<std::string, std::string>& m,
+ std::ostream *err);
+ void count_metadata(const std::string& field, ceph::Formatter *f);
+
+ void reencode_incremental_map(ceph::buffer::list& bl, uint64_t features);
+ void reencode_full_map(ceph::buffer::list& bl, uint64_t features);
+public:
+ void count_metadata(const std::string& field, std::map<std::string,int> *out);
+ void get_versions(std::map<std::string, std::list<std::string>> &versions);
+protected:
+ int get_osd_objectstore_type(int osd, std::string *type);
+ bool is_pool_currently_all_bluestore(int64_t pool_id, const pg_pool_t &pool,
+ std::ostream *err);
+
+ // when we last received PG stats from each osd and the osd's osd_beacon_report_interval
+ std::map<int, std::pair<utime_t, int>> last_osd_report;
+ // TODO: use last_osd_report to store the osd report epochs, once we don't
+ // need to upgrade from pre-luminous releases.
+ std::map<int,epoch_t> osd_epochs;
+ LastEpochClean last_epoch_clean;
+ bool preprocess_beacon(MonOpRequestRef op);
+ bool prepare_beacon(MonOpRequestRef op);
+ epoch_t get_min_last_epoch_clean() const;
+
+ friend class C_UpdateCreatingPGs;
+ std::map<int, std::map<epoch_t, std::set<spg_t>>> creating_pgs_by_osd_epoch;
+ std::vector<pg_t> pending_created_pgs;
+ // the epoch when the pg mapping was calculated
+ epoch_t creating_pgs_epoch = 0;
+ creating_pgs_t creating_pgs;
+ mutable std::mutex creating_pgs_lock;
+
+ creating_pgs_t update_pending_pgs(const OSDMap::Incremental& inc,
+ const OSDMap& nextmap);
+ unsigned scan_for_creating_pgs(
+ const mempool::osdmap::map<int64_t,pg_pool_t>& pools,
+ const mempool::osdmap::set<int64_t>& removed_pools,
+ utime_t modified,
+ creating_pgs_t* creating_pgs) const;
+ std::pair<int32_t, pg_t> get_parent_pg(pg_t pgid) const;
+ void update_creating_pgs();
+ void check_pg_creates_subs();
+ epoch_t send_pg_creates(int osd, Connection *con, epoch_t next) const;
+
+ int32_t _allocate_osd_id(int32_t* existing_id);
+
+ int get_grace_interval_threshold();
+ bool grace_interval_threshold_exceeded(int last_failed);
+ void set_default_laggy_params(int target_osd);
+
+ int parse_pgid(const cmdmap_t& cmdmap, std::stringstream &ss,
+ pg_t &pgid, std::optional<std::string> pgidstr = std::nullopt);
+public:
+ OSDMonitor(CephContext *cct, Monitor &mn, Paxos &p, const std::string& service_name);
+
+ void tick() override; // check state, take actions
+
+ bool preprocess_command(MonOpRequestRef op);
+ bool prepare_command(MonOpRequestRef op);
+ bool prepare_command_impl(MonOpRequestRef op, const cmdmap_t& cmdmap);
+
+ int validate_osd_create(
+ const int32_t id,
+ const uuid_d& uuid,
+ const bool check_osd_exists,
+ int32_t* existing_id,
+ std::stringstream& ss);
+ int prepare_command_osd_create(
+ const int32_t id,
+ const uuid_d& uuid,
+ int32_t* existing_id,
+ std::stringstream& ss);
+ void do_osd_create(const int32_t id, const uuid_d& uuid,
+ const std::string& device_class,
+ int32_t* new_id);
+ int prepare_command_osd_purge(MonOpRequestRef op, int32_t id, std::stringstream& ss);
+ int prepare_command_osd_destroy(MonOpRequestRef op, int32_t id, std::stringstream& ss);
+ int _prepare_command_osd_crush_remove(
+ CrushWrapper &newcrush,
+ int32_t id,
+ int32_t ancestor,
+ bool has_ancestor,
+ bool unlink_only);
+ void do_osd_crush_remove(CrushWrapper& newcrush);
+ int prepare_command_osd_crush_remove(
+ CrushWrapper &newcrush,
+ int32_t id,
+ int32_t ancestor,
+ bool has_ancestor,
+ bool unlink_only);
+ int prepare_command_osd_remove(int32_t id);
+ int prepare_command_osd_new(
+ MonOpRequestRef op,
+ const cmdmap_t& cmdmap,
+ const std::map<std::string,std::string>& secrets,
+ std::stringstream &ss,
+ ceph::Formatter *f);
+
+ int prepare_command_pool_set(const cmdmap_t& cmdmap,
+ std::stringstream& ss);
+
+ int prepare_command_pool_application(const std::string &prefix,
+ const cmdmap_t& cmdmap,
+ std::stringstream& ss);
+ int preprocess_command_pool_application(const std::string &prefix,
+ const cmdmap_t& cmdmap,
+ std::stringstream& ss,
+ bool *modified);
+ int _command_pool_application(const std::string &prefix,
+ const cmdmap_t& cmdmap,
+ std::stringstream& ss,
+ bool *modified,
+ bool preparing);
+
+ bool handle_osd_timeouts(const utime_t &now,
+ std::map<int, std::pair<utime_t, int>> &last_osd_report);
+
+ void send_latest(MonOpRequestRef op, epoch_t start=0);
+ // Marks the op with an osdmon event named after this function, then
+ // forwards to send_incremental(op, start). `start` defaults to 0.
+ void send_latest_now_nodelete(MonOpRequestRef op, epoch_t start=0) {
+ op->mark_osdmon_event(__func__);
+ send_incremental(op, start);
+ }
+
+ int get_version(version_t ver, ceph::buffer::list& bl) override;
+ int get_version(version_t ver, uint64_t feature, ceph::buffer::list& bl);
+
+ int get_version_full(version_t ver, uint64_t feature, ceph::buffer::list& bl);
+ int get_version_full(version_t ver, ceph::buffer::list& bl) override;
+ int get_inc(version_t ver, OSDMap::Incremental& inc);
+ int get_full_from_pinned_map(version_t ver, ceph::buffer::list& bl);
+
+ epoch_t blocklist(const entity_addrvec_t& av, utime_t until);
+ epoch_t blocklist(entity_addr_t a, utime_t until);
+
+ void dump_info(ceph::Formatter *f);
+ int dump_osd_metadata(int osd, ceph::Formatter *f, std::ostream *err);
+ void print_nodes(ceph::Formatter *f);
+
+ void check_osdmap_sub(Subscription *sub);
+ void check_pg_creates_sub(Subscription *sub);
+
+ void do_application_enable(int64_t pool_id, const std::string &app_name,
+ const std::string &app_key="",
+ const std::string &app_value="",
+ bool force=false);
+ void do_set_pool_opt(int64_t pool_id, pool_opts_t::key_t opt,
+ pool_opts_t::value_t);
+
+ /// Stage @p flag in pending_inc unless the current osdmap already has it.
+ void add_flag(int flag) {
+   if (osdmap.flags & flag) {
+     return;  // already set in the current map
+   }
+   // A negative new_flags is replaced by the current map's flags before
+   // the bit is OR-ed in (presumably "no pending flag change yet" -- confirm).
+   if (pending_inc.new_flags < 0) {
+     pending_inc.new_flags = osdmap.flags;
+   }
+   pending_inc.new_flags |= flag;
+ }
+
+ /// Stage removal of @p flag in pending_inc if the current osdmap has it.
+ void remove_flag(int flag) {
+   if (!(osdmap.flags & flag)) {
+     return;  // not set in the current map
+   }
+   // A negative new_flags is replaced by the current map's flags before
+   // the bit is cleared (presumably "no pending flag change yet" -- confirm).
+   if (pending_inc.new_flags < 0) {
+     pending_inc.new_flags = osdmap.flags;
+   }
+   pending_inc.new_flags &= ~flag;
+ }
+ void convert_pool_priorities(void);
+ /**
+ * Find the pools which are requested to be put into stretch mode,
+ * validate that they are allowed to be in stretch mode (eg, are replicated)
+ * and place copies of them in the pools set.
+ * This does not make any changes to the pools or state; it's just
+ * a safety-check-and-collect function.
+ */
+ void try_enable_stretch_mode_pools(std::stringstream& ss, bool *okay,
+ int *errcode,
+ std::set<pg_pool_t*>* pools,
+ const std::string& new_crush_rule);
+ /**
+ * Check validity of inputs and OSD/CRUSH state to
+ * engage stretch mode. Designed to be used with
+ * MonmapMonitor::try_enable_stretch_mode() where we call both twice,
+ * first with commit=false to validate.
+ * @param ss: a stringstream to write errors into
+ * @param okay: Filled to true if okay, false if validation fails
+ * @param errcode: filled with -errno if there's a problem
+ * @param commit: true if we should commit the change, false if just testing
+ * @param dividing_bucket: the bucket type (eg 'dc') that divides the cluster
+ * @param bucket_count: The number of buckets required in peering.
+ * Currently must be 2.
+ * @param pools: The pg_pool_ts which are being set to stretch mode (obtained
+ * from try_enable_stretch_mode_pools()).
+ * @param new_crush_rule: The crush rule to set the pools to.
+ */
+ void try_enable_stretch_mode(std::stringstream& ss, bool *okay,
+ int *errcode, bool commit,
+ const std::string& dividing_bucket,
+ uint32_t bucket_count,
+ const std::set<pg_pool_t*>& pools,
+ const std::string& new_crush_rule);
+ /**
+ * Check the input dead_buckets mapping (buckets->dead monitors) to see
+ * if the OSDs are also down. If so, fill in really_down_buckets and
+ * really_down_mons and return true; else return false.
+ */
+ bool check_for_dead_crush_zones(const std::map<std::string,std::set<std::string>>& dead_buckets,
+ std::set<int> *really_down_buckets,
+ std::set<std::string> *really_down_mons);
+ /**
+ * Set degraded mode in the OSDMap, adding the given dead buckets to the dead set
+ * and using the live_zones (should presently be size 1)
+ */
+ void trigger_degraded_stretch_mode(const std::set<int>& dead_buckets,
+ const std::set<std::string>& live_zones);
+ /**
+ * This is just to maintain stretch_recovery_triggered; below
+ */
+ void set_degraded_stretch_mode();
+ /**
+ * Set recovery stretch mode in the OSDMap, resetting pool size back to normal
+ */
+ void trigger_recovery_stretch_mode();
+ /**
+ * This is just to maintain stretch_recovery_triggered; below
+ */
+ void set_recovery_stretch_mode();
+ /**
+ * This is just to maintain stretch_recovery_triggered; below
+ */
+ void set_healthy_stretch_mode();
+ /**
+ * Tells the OSD there's a new pg digest, in case it's interested.
+ * (It's interested when in recovering stretch mode.)
+ */
+ void notify_new_pg_digest();
+ /**
+ * Check if we can exit recovery stretch mode and go back to normal.
+ * @param force If true, we will force the exit through once it is legal,
+ * without regard to the reported PG status.
+ */
+ void try_end_recovery_stretch_mode(bool force);
+ /**
+ * Sets the osdmap and pg_pool_t values back to healthy stretch mode status.
+ */
+ void trigger_healthy_stretch_mode();
+ /**
+ * Obtain the crush rule being used for stretch pools.
+ * Note that right now this is heuristic and simply selects the
+ * most-used rule on replicated stretch pools.
+ * @return the crush rule ID, or a negative errno
+ */
+ int get_replicated_stretch_crush_rule();
+private:
+ utime_t stretch_recovery_triggered; // what time we committed a switch to recovery mode
+};
+
+#endif
diff --git a/src/mon/PGMap.cc b/src/mon/PGMap.cc
new file mode 100644
index 000000000..16fa71c84
--- /dev/null
+++ b/src/mon/PGMap.cc
@@ -0,0 +1,4101 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <boost/algorithm/string.hpp>
+
+#include "include/rados.h"
+#include "PGMap.h"
+
+#define dout_subsys ceph_subsys_mon
+#include "common/debug.h"
+#include "common/Clock.h"
+#include "common/Formatter.h"
+#include "global/global_context.h"
+#include "include/ceph_features.h"
+#include "include/stringify.h"
+
+#include "osd/osd_types.h"
+#include "osd/OSDMap.h"
+#include <boost/range/adaptor/reversed.hpp>
+
+#define dout_context g_ceph_context
+
+using std::list;
+using std::make_pair;
+using std::map;
+using std::pair;
+using std::ostream;
+using std::ostringstream;
+using std::set;
+using std::string;
+using std::stringstream;
+using std::vector;
+
+using ceph::bufferlist;
+using ceph::fixed_u_to_string;
+using ceph::common::cmd_getval;
+using ceph::common::cmd_getval_or;
+using ceph::common::cmd_putval;
+
+MEMPOOL_DEFINE_OBJECT_FACTORY(PGMapDigest, pgmap_digest, pgmap);
+MEMPOOL_DEFINE_OBJECT_FACTORY(PGMap, pgmap, pgmap);
+MEMPOOL_DEFINE_OBJECT_FACTORY(PGMap::Incremental, pgmap_inc, pgmap);
+
+
+// ---------------------
+// PGMapDigest
+
+// Serialize this digest into bl.
+// The struct is written as version 4 with compat version 1, and the caller
+// must offer the SERVER_NAUTILUS feature (asserted below).
+// Fields are written in a fixed order that PGMapDigest::decode() reads back
+// in the same order; keep the two functions in step.
+void PGMapDigest::encode(bufferlist& bl, uint64_t features) const
+{
+ // NOTE: see PGMap::encode_digest
+ uint8_t v = 4;
+ assert(HAVE_FEATURE(features, SERVER_NAUTILUS));
+ ENCODE_START(v, 1, bl);
+ encode(num_pg, bl);
+ encode(num_pg_active, bl);
+ encode(num_pg_unknown, bl);
+ encode(num_osd, bl);
+ // the stat sums are encoded with the feature bits passed through
+ encode(pg_pool_sum, bl, features);
+ encode(pg_sum, bl, features);
+ encode(osd_sum, bl, features);
+ encode(num_pg_by_state, bl);
+ encode(num_pg_by_osd, bl);
+ encode(num_pg_by_pool, bl);
+ encode(osd_last_seq, bl);
+ encode(per_pool_sum_delta, bl, features);
+ encode(per_pool_sum_deltas_stamps, bl);
+ encode(pg_sum_delta, bl, features);
+ encode(stamp_delta, bl);
+ encode(avail_space_by_rule, bl);
+ encode(purged_snaps, bl);
+ encode(osd_sum_by_class, bl, features);
+ ENCODE_FINISH(bl);
+}
+
+// Deserialize a digest from p.
+// Fields are read in exactly the order PGMapDigest::encode() writes them.
+// Encodings with struct_v < 4 are not handled: the assert below rejects them.
+void PGMapDigest::decode(bufferlist::const_iterator& p)
+{
+ DECODE_START(4, p);
+ assert(struct_v >= 4);
+ decode(num_pg, p);
+ decode(num_pg_active, p);
+ decode(num_pg_unknown, p);
+ decode(num_osd, p);
+ decode(pg_pool_sum, p);
+ decode(pg_sum, p);
+ decode(osd_sum, p);
+ decode(num_pg_by_state, p);
+ decode(num_pg_by_osd, p);
+ decode(num_pg_by_pool, p);
+ decode(osd_last_seq, p);
+ decode(per_pool_sum_delta, p);
+ decode(per_pool_sum_deltas_stamps, p);
+ decode(pg_sum_delta, p);
+ decode(stamp_delta, p);
+ decode(avail_space_by_rule, p);
+ decode(purged_snaps, p);
+ decode(osd_sum_by_class, p);
+ DECODE_FINISH(p);
+}
+
+// Dump the digest to a Formatter: scalar counters first, then the summed
+// stats, then one section each for per-class OSD sums, per-pool stats,
+// per-OSD sequence numbers, PG counts by state, PG counts by OSD and the
+// purged snap intervals per pool.
+void PGMapDigest::dump(ceph::Formatter *f) const
+{
+  f->dump_unsigned("num_pg", num_pg);
+  f->dump_unsigned("num_pg_active", num_pg_active);
+  f->dump_unsigned("num_pg_unknown", num_pg_unknown);
+  f->dump_unsigned("num_osd", num_osd);
+  f->dump_object("pool_sum", pg_sum);
+  f->dump_object("osd_sum", osd_sum);
+
+  // per device class totals, keyed by class name
+  f->open_object_section("osd_sum_by_class");
+  for (const auto& [class_name, class_sum] : osd_sum_by_class) {
+    f->dump_object(class_name.c_str(), class_sum);
+  }
+  f->close_section();
+
+  // per pool stats; num_pg is only emitted when the pool has an entry
+  // in num_pg_by_pool
+  f->open_array_section("pool_stats");
+  for (const auto& [poolid, pool_sum] : pg_pool_sum) {
+    f->open_object_section("pool_stat");
+    f->dump_int("poolid", poolid);
+    const auto pg_count = num_pg_by_pool.find(poolid);
+    if (pg_count != num_pg_by_pool.end()) {
+      f->dump_unsigned("num_pg", pg_count->second);
+    }
+    pool_sum.dump(f);
+    f->close_section();
+  }
+  f->close_section();
+
+  f->open_array_section("osd_stats");
+  // TODO: this isn't really correct since we can dump non-existent OSDs
+  // I dunno what osd_last_seq is set to in that case...
+  int osd_index = 0;
+  for (const auto& seq : osd_last_seq) {
+    f->open_object_section("osd_stat");
+    f->dump_int("osd", osd_index);
+    f->dump_unsigned("seq", seq);
+    f->close_section();
+    ++osd_index;
+  }
+  f->close_section();
+
+  f->open_array_section("num_pg_by_state");
+  for (const auto& [state, count] : num_pg_by_state) {
+    f->open_object_section("count");
+    f->dump_string("state", pg_state_string(state));
+    f->dump_unsigned("num", count);
+    f->close_section();
+  }
+  f->close_section();
+
+  f->open_array_section("num_pg_by_osd");
+  for (const auto& [osd, counts] : num_pg_by_osd) {
+    f->open_object_section("count");
+    f->dump_unsigned("osd", osd);
+    f->dump_unsigned("num_primary_pg", counts.primary);
+    f->dump_unsigned("num_acting_pg", counts.acting);
+    f->dump_unsigned("num_up_not_acting_pg", counts.up_not_acting);
+    f->close_section();
+  }
+  f->close_section();
+
+  f->open_array_section("purged_snaps");
+  for (const auto& [pool, intervals] : purged_snaps) {
+    f->open_object_section("pool");
+    f->dump_int("pool", pool);
+    f->open_object_section("purged_snaps");
+    for (auto iv = intervals.begin(); iv != intervals.end(); ++iv) {
+      f->open_object_section("interval");
+      f->dump_stream("start") << iv.get_start();
+      f->dump_stream("length") << iv.get_len();
+      f->close_section();
+    }
+    f->close_section();
+    f->close_section();
+  }
+  f->close_section();
+}
+
+// Append a single default-constructed digest to ls; the list receives the
+// raw pointer.
+void PGMapDigest::generate_test_instances(list<PGMapDigest*>& ls)
+{
+  PGMapDigest *instance = new PGMapDigest;
+  ls.push_back(instance);
+}
+
+// Format a value with two fixed decimals; anything below 0.01 (including
+// negative values) is rendered as the bare string "0".
+inline std::string percentify(const float& a) {
+  if (a < 0.01) {
+    return "0";
+  }
+  std::stringstream formatted;
+  formatted << std::fixed << std::setprecision(2) << a;
+  return formatted.str();
+}
+
+// Emit a multi-line PG/usage/io summary.
+// When f is non-null everything goes to the Formatter; otherwise plain text
+// is written to *out. out is dereferenced unconditionally on the text path,
+// so it must be non-null whenever f is null.
+void PGMapDigest::print_summary(ceph::Formatter *f, ostream *out) const
+{
+ if (f)
+ f->open_array_section("pgs_by_state");
+
+ // list is descending numeric order (by count)
+ // (the multimap sorts ascending by count; it is walked in reverse below)
+ std::multimap<int,uint64_t> state_by_count; // count -> state
+ for (auto p = num_pg_by_state.begin();
+ p != num_pg_by_state.end();
+ ++p) {
+ state_by_count.insert(make_pair(p->second, p->first));
+ }
+ if (f) {
+ for (auto p = state_by_count.rbegin();
+ p != state_by_count.rend();
+ ++p)
+ {
+ f->open_object_section("pgs_by_state_element");
+ f->dump_string("state_name", pg_state_string(p->second));
+ f->dump_unsigned("count", p->first);
+ f->close_section();
+ }
+ }
+ if (f)
+ f->close_section();
+
+ // totals: structured fields for the formatter, fixed text lines otherwise
+ if (f) {
+ f->dump_unsigned("num_pgs", num_pg);
+ f->dump_unsigned("num_pools", pg_pool_sum.size());
+ f->dump_unsigned("num_objects", pg_sum.stats.sum.num_objects);
+ f->dump_unsigned("data_bytes", pg_sum.stats.sum.num_bytes);
+ f->dump_unsigned("bytes_used", osd_sum.statfs.get_used_raw());
+ f->dump_unsigned("bytes_avail", osd_sum.statfs.available);
+ f->dump_unsigned("bytes_total", osd_sum.statfs.total);
+ } else {
+ *out << " pools: " << pg_pool_sum.size() << " pools, "
+ << num_pg << " pgs\n";
+ *out << " objects: " << si_u_t(pg_sum.stats.sum.num_objects) << " objects, "
+ << byte_u_t(pg_sum.stats.sum.num_bytes) << "\n";
+ *out << " usage: "
+ << byte_u_t(osd_sum.statfs.get_used_raw()) << " used, "
+ << byte_u_t(osd_sum.statfs.available) << " / "
+ << byte_u_t(osd_sum.statfs.total) << " avail\n";
+ *out << " pgs: ";
+ }
+
+ // pad becomes true once a text line has been written after the "pgs:"
+ // label; later text lines then get a leading pad string first
+ bool pad = false;
+
+ // share of PGs in unknown state (ratio for the formatter, percent as text)
+ if (num_pg_unknown > 0) {
+ float p = (float)num_pg_unknown / (float)num_pg;
+ if (f) {
+ f->dump_float("unknown_pgs_ratio", p);
+ } else {
+ char b[20];
+ snprintf(b, sizeof(b), "%.3lf", p * 100.0);
+ *out << b << "% pgs unknown\n";
+ pad = true;
+ }
+ }
+
+ // inactive = everything that is neither active nor unknown
+ int num_pg_inactive = num_pg - num_pg_active - num_pg_unknown;
+ if (num_pg_inactive > 0) {
+ float p = (float)num_pg_inactive / (float)num_pg;
+ if (f) {
+ f->dump_float("inactive_pgs_ratio", p);
+ } else {
+ if (pad) {
+ *out << " ";
+ }
+ char b[20];
+ snprintf(b, sizeof(b), "%.3f", p * 100.0);
+ *out << b << "% pgs not active\n";
+ pad = true;
+ }
+ }
+
+ // overall_recovery_summary() is handed both f and the string list; the
+ // collected strings are only printed here on the text path
+ list<string> sl;
+ overall_recovery_summary(f, &sl);
+ if (!f && !sl.empty()) {
+ for (auto p = sl.begin(); p != sl.end(); ++p) {
+ if (pad) {
+ *out << " ";
+ }
+ *out << *p << "\n";
+ pad = true;
+ }
+ }
+ sl.clear();
+
+ // text path: one line per state, counts left-aligned to the widest count
+ if (!f) {
+ unsigned max_width = 1;
+ for (auto p = state_by_count.rbegin(); p != state_by_count.rend(); ++p)
+ {
+ std::stringstream ss;
+ ss << p->first;
+ max_width = std::max<size_t>(ss.str().size(), max_width);
+ }
+
+ for (auto p = state_by_count.rbegin(); p != state_by_count.rend(); ++p)
+ {
+ if (pad) {
+ *out << " ";
+ }
+ pad = true;
+ out->setf(std::ios::left);
+ *out << std::setw(max_width) << p->first
+ << " " << pg_state_string(p->second) << "\n";
+ out->unsetf(std::ios::left);
+ }
+ }
+
+ // the three rate helpers are always called with f; their text output is
+ // captured in string streams and printed below only on the text path
+ ostringstream ss_rec_io;
+ overall_recovery_rate_summary(f, &ss_rec_io);
+ ostringstream ss_client_io;
+ overall_client_io_rate_summary(f, &ss_client_io);
+ ostringstream ss_cache_io;
+ overall_cache_io_rate_summary(f, &ss_cache_io);
+
+ // the "io:" header is only printed when at least one rate string is non-empty
+ if (!f && (ss_client_io.str().length() || ss_rec_io.str().length()
+ || ss_cache_io.str().length())) {
+ *out << "\n \n";
+ *out << " io:\n";
+ }
+
+ if (!f && ss_client_io.str().length())
+ *out << " client: " << ss_client_io.str() << "\n";
+ if (!f && ss_rec_io.str().length())
+ *out << " recovery: " << ss_rec_io.str() << "\n";
+ if (!f && ss_cache_io.str().length())
+ *out << " cache: " << ss_cache_io.str() << "\n";
+}
+
+// Emit a single-line summary of PG states, usage, client io rates and
+// recovery progress.
+// f and out are each optional here: every write is guarded by its own
+// null check, so either, both or neither may be supplied.
+void PGMapDigest::print_oneline_summary(ceph::Formatter *f, ostream *out) const
+{
+ std::stringstream ss;
+
+ // the comma separated "<count> <state>" text is built even when only f is
+ // given; it is consumed below only if out is set
+ if (f)
+ f->open_array_section("num_pg_by_state");
+ for (auto p = num_pg_by_state.begin();
+ p != num_pg_by_state.end();
+ ++p) {
+ if (f) {
+ f->open_object_section("state");
+ f->dump_string("name", pg_state_string(p->first));
+ f->dump_unsigned("num", p->second);
+ f->close_section();
+ }
+ if (p != num_pg_by_state.begin())
+ ss << ", ";
+ ss << p->second << " " << pg_state_string(p->first);
+ }
+ if (f)
+ f->close_section();
+
+ string states = ss.str();
+ if (out)
+ *out << num_pg << " pgs: "
+ << states << "; "
+ << byte_u_t(pg_sum.stats.sum.num_bytes) << " data, "
+ << byte_u_t(osd_sum.statfs.get_used()) << " used, "
+ << byte_u_t(osd_sum.statfs.available) << " / "
+ << byte_u_t(osd_sum.statfs.total) << " avail";
+ if (f) {
+ f->dump_unsigned("num_pgs", num_pg);
+ f->dump_unsigned("num_bytes", pg_sum.stats.sum.num_bytes);
+ f->dump_int("total_bytes", osd_sum.statfs.total);
+ f->dump_int("total_avail_bytes", osd_sum.statfs.available);
+ f->dump_int("total_used_bytes", osd_sum.statfs.get_used());
+ f->dump_int("total_used_raw_bytes", osd_sum.statfs.get_used_raw());
+ }
+
+ // make non-negative; we can get negative values if osds send
+ // uncommitted stats and then "go backward" or if they are just
+ // buggy/wrong.
+ pool_stat_t pos_delta = pg_sum_delta;
+ pos_delta.floor(0);
+ // rates are only reported when the delta contains reads or writes
+ // NOTE(review): the divisions below do not guard against stamp_delta
+ // being zero -- confirm it is always non-zero when num_rd/num_wr are set
+ if (pos_delta.stats.sum.num_rd ||
+ pos_delta.stats.sum.num_wr) {
+ if (out)
+ *out << "; ";
+ if (pos_delta.stats.sum.num_rd) {
+ // << 10 turns the kb counter into bytes before dividing by the interval
+ int64_t rd = (pos_delta.stats.sum.num_rd_kb << 10) / (double)stamp_delta;
+ if (out)
+ *out << byte_u_t(rd) << "/s rd, ";
+ if (f)
+ f->dump_unsigned("read_bytes_sec", rd);
+ }
+ if (pos_delta.stats.sum.num_wr) {
+ int64_t wr = (pos_delta.stats.sum.num_wr_kb << 10) / (double)stamp_delta;
+ if (out)
+ *out << byte_u_t(wr) << "/s wr, ";
+ if (f)
+ f->dump_unsigned("write_bytes_sec", wr);
+ }
+ // combined read + write operations per interval
+ int64_t iops = (pos_delta.stats.sum.num_rd + pos_delta.stats.sum.num_wr) / (double)stamp_delta;
+ if (out)
+ *out << si_u_t(iops) << " op/s";
+ if (f)
+ f->dump_unsigned("io_sec", iops);
+ }
+
+ // recovery strings and rate are appended to the text line, "; " separated
+ list<string> sl;
+ overall_recovery_summary(f, &sl);
+ if (out)
+ for (auto p = sl.begin(); p != sl.end(); ++p)
+ *out << "; " << *p;
+ std::stringstream ssr;
+ overall_recovery_rate_summary(f, &ssr);
+ if (out && ssr.str().length())
+ *out << "; " << ssr.str() << " recovering";
+}
+
+// Compute four ratios from the digest and store them through the output
+// pointers:
+//  - degraded / misplaced objects relative to num_object_copies
+//    (0 when the numerator is 0 or there are no object copies)
+//  - inactive / unknown PGs relative to num_pg (0 when num_pg is not > 0)
+void PGMapDigest::get_recovery_stats(
+    double *misplaced_ratio,
+    double *degraded_ratio,
+    double *inactive_pgs_ratio,
+    double *unknown_pgs_ratio) const
+{
+  const auto& totals = pg_sum.stats.sum;
+  const bool have_copies = totals.num_object_copies > 0;
+
+  *degraded_ratio = (totals.num_objects_degraded && have_copies)
+    ? (double)totals.num_objects_degraded / (double)totals.num_object_copies
+    : 0;
+
+  *misplaced_ratio = (totals.num_objects_misplaced && have_copies)
+    ? (double)totals.num_objects_misplaced / (double)totals.num_object_copies
+    : 0;
+
+  if (!(num_pg > 0)) {
+    *inactive_pgs_ratio = 0;
+    *unknown_pgs_ratio = 0;
+    return;
+  }
+
+  // inactive = everything that is neither active nor unknown
+  int num_pg_inactive = num_pg - num_pg_active - num_pg_unknown;
+  *inactive_pgs_ratio = (double)num_pg_inactive / (double)num_pg;
+  *unknown_pgs_ratio = (double)num_pg_unknown / (double)num_pg;
+}
+
+void PGMapDigest::recovery_summary(ceph::Formatter *f, list<string> *psl,
+                                   const pool_stat_t& pool_sum) const
+{
+  // Summarize degraded / misplaced / unfound object counts of pool_sum.
+  // With a formatter, each non-empty category is dumped as
+  // <kind>_objects, <kind>_total and <kind>_ratio; without one, a
+  // human-readable line per category is appended to *psl.
+  const auto& s = pool_sum.stats.sum;
+
+  if (s.num_objects_degraded && s.num_object_copies > 0) {
+    const double pct = (double)s.num_objects_degraded /
+      (double)s.num_object_copies * (double)100.0;
+    if (f) {
+      f->dump_unsigned("degraded_objects", s.num_objects_degraded);
+      f->dump_unsigned("degraded_total", s.num_object_copies);
+      f->dump_float("degraded_ratio", pct / 100.0);
+    } else {
+      char pct_str[20];
+      snprintf(pct_str, sizeof(pct_str), "%.3lf", pct);
+      ostringstream line;
+      line << s.num_objects_degraded
+           << "/" << s.num_object_copies << " objects degraded (" << pct_str << "%)";
+      psl->push_back(line.str());
+    }
+  }
+
+  if (s.num_objects_misplaced && s.num_object_copies > 0) {
+    const double pct = (double)s.num_objects_misplaced /
+      (double)s.num_object_copies * (double)100.0;
+    if (f) {
+      f->dump_unsigned("misplaced_objects", s.num_objects_misplaced);
+      f->dump_unsigned("misplaced_total", s.num_object_copies);
+      f->dump_float("misplaced_ratio", pct / 100.0);
+    } else {
+      char pct_str[20];
+      snprintf(pct_str, sizeof(pct_str), "%.3lf", pct);
+      ostringstream line;
+      line << s.num_objects_misplaced
+           << "/" << s.num_object_copies << " objects misplaced (" << pct_str << "%)";
+      psl->push_back(line.str());
+    }
+  }
+
+  // Unfound objects are measured against the object count, not copies.
+  if (s.num_objects_unfound && s.num_objects) {
+    const double pct = (double)s.num_objects_unfound /
+      (double)s.num_objects * (double)100.0;
+    if (f) {
+      f->dump_unsigned("unfound_objects", s.num_objects_unfound);
+      f->dump_unsigned("unfound_total", s.num_objects);
+      f->dump_float("unfound_ratio", pct / 100.0);
+    } else {
+      char pct_str[20];
+      snprintf(pct_str, sizeof(pct_str), "%.3lf", pct);
+      ostringstream line;
+      line << s.num_objects_unfound
+           << "/" << s.num_objects << " objects unfound (" << pct_str << "%)";
+      psl->push_back(line.str());
+    }
+  }
+}
+
+void PGMapDigest::recovery_rate_summary(ceph::Formatter *f, ostream *out,
+                                        const pool_stat_t& delta_sum,
+                                        utime_t delta_stamp) const
+{
+  // Deltas can be negative if osds send uncommitted stats and then
+  // "go backward", or if they are simply buggy/wrong; clamp at zero
+  // before deriving any rate.
+  pool_stat_t pos_delta = delta_sum;
+  pos_delta.floor(0);
+  const auto& rec = pos_delta.stats.sum;
+
+  // Nothing recovered during the interval: emit nothing at all.
+  if (!rec.num_objects_recovered &&
+      !rec.num_bytes_recovered &&
+      !rec.num_keys_recovered) {
+    return;
+  }
+
+  const double secs = (double)delta_stamp;
+  int64_t objps = rec.num_objects_recovered / secs;
+  int64_t bps = rec.num_bytes_recovered / secs;
+  int64_t kps = rec.num_keys_recovered / secs;
+
+  if (f) {
+    f->dump_int("recovering_objects_per_sec", objps);
+    f->dump_int("recovering_bytes_per_sec", bps);
+    f->dump_int("recovering_keys_per_sec", kps);
+    f->dump_int("num_objects_recovered", rec.num_objects_recovered);
+    f->dump_int("num_bytes_recovered", rec.num_bytes_recovered);
+    f->dump_int("num_keys_recovered", rec.num_keys_recovered);
+    return;
+  }
+
+  // Plain text: bytes/s, then keys/s (only if any keys), then objects/s.
+  *out << byte_u_t(bps) << "/s";
+  if (rec.num_keys_recovered)
+    *out << ", " << si_u_t(kps) << " keys/s";
+  *out << ", " << si_u_t(objps) << " objects/s";
+}
+
+void PGMapDigest::overall_recovery_rate_summary(ceph::Formatter *f, ostream *out) const
+{ // Cluster-wide variant: recovery rate from pg_sum_delta over stamp_delta.
+ recovery_rate_summary(f, out, pg_sum_delta, stamp_delta);
+}
+
+void PGMapDigest::overall_recovery_summary(ceph::Formatter *f, list<string> *psl) const
+{ // Cluster-wide variant: summarize the aggregate pg_sum.
+ recovery_summary(f, psl, pg_sum);
+}
+
+void PGMapDigest::pool_recovery_rate_summary(ceph::Formatter *f, ostream *out,
+                                             uint64_t poolid) const
+{
+  // Per-pool recovery rate.  Pools without a recorded delta produce no
+  // output; a pool that has a delta must also have a delta timestamp.
+  const auto delta_it = per_pool_sum_delta.find(poolid);
+  if (delta_it != per_pool_sum_delta.end()) {
+    const auto stamp_it = per_pool_sum_deltas_stamps.find(delta_it->first);
+    ceph_assert(stamp_it != per_pool_sum_deltas_stamps.end());
+    recovery_rate_summary(f, out, delta_it->second.first, stamp_it->second);
+  }
+}
+
+void PGMapDigest::pool_recovery_summary(ceph::Formatter *f, list<string> *psl,
+                                        uint64_t poolid) const
+{
+  // Per-pool recovery summary; a pool with no entry in pg_pool_sum
+  // produces no output.
+  const auto found = pg_pool_sum.find(poolid);
+  if (found != pg_pool_sum.end()) {
+    recovery_summary(f, psl, found->second);
+  }
+}
+
+void PGMapDigest::client_io_rate_summary(ceph::Formatter *f, ostream *out,
+                                         const pool_stat_t& delta_sum,
+                                         utime_t delta_stamp) const
+{
+  // Client read/write throughput and op rates for the given delta.
+  // Negative deltas are clamped to zero first; if there were neither
+  // reads nor writes, nothing is emitted.
+  pool_stat_t pos_delta = delta_sum;
+  pos_delta.floor(0);
+  const auto& io = pos_delta.stats.sum;
+
+  if (!io.num_rd && !io.num_wr)
+    return;
+
+  const double secs = (double)delta_stamp;
+
+  if (io.num_rd) {
+    // counters are kept in KB; convert to bytes before dividing
+    int64_t rd = (io.num_rd_kb << 10) / secs;
+    if (f)
+      f->dump_int("read_bytes_sec", rd);
+    else
+      *out << byte_u_t(rd) << "/s rd, ";
+  }
+  if (io.num_wr) {
+    int64_t wr = (io.num_wr_kb << 10) / secs;
+    if (f)
+      f->dump_int("write_bytes_sec", wr);
+    else
+      *out << byte_u_t(wr) << "/s wr, ";
+  }
+
+  int64_t iops_rd = io.num_rd / secs;
+  int64_t iops_wr = io.num_wr / secs;
+  if (f) {
+    f->dump_int("read_op_per_sec", iops_rd);
+    f->dump_int("write_op_per_sec", iops_wr);
+  } else {
+    *out << si_u_t(iops_rd) << " op/s rd, " << si_u_t(iops_wr) << " op/s wr";
+  }
+}
+
+void PGMapDigest::overall_client_io_rate_summary(ceph::Formatter *f, ostream *out) const
+{ // Cluster-wide variant: client I/O rate from pg_sum_delta over stamp_delta.
+ client_io_rate_summary(f, out, pg_sum_delta, stamp_delta);
+}
+
+void PGMapDigest::pool_client_io_rate_summary(ceph::Formatter *f, ostream *out,
+                                              uint64_t poolid) const
+{
+  // Per-pool client I/O rate.  Pools without a recorded delta produce no
+  // output; a pool that has a delta must also have a delta timestamp.
+  const auto delta_it = per_pool_sum_delta.find(poolid);
+  if (delta_it != per_pool_sum_delta.end()) {
+    const auto stamp_it = per_pool_sum_deltas_stamps.find(delta_it->first);
+    ceph_assert(stamp_it != per_pool_sum_deltas_stamps.end());
+    client_io_rate_summary(f, out, delta_it->second.first, stamp_it->second);
+  }
+}
+
+void PGMapDigest::cache_io_rate_summary(ceph::Formatter *f, ostream *out,
+                                        const pool_stat_t& delta_sum,
+                                        utime_t delta_stamp) const
+{
+  // Cache-tier activity for the given delta: flush/evict throughput,
+  // promote op rate and the flush/evict mode PG counters.  Only non-zero
+  // items are emitted.  In text mode the items are joined with ", ".
+  pool_stat_t pos_delta = delta_sum;
+  pos_delta.floor(0);
+  const auto& c = pos_delta.stats.sum;
+  const double secs = (double)delta_stamp;
+  bool have_output = false;
+
+  // Text mode only: write the separator if something was already
+  // printed, remember that output now exists, and hand back the stream.
+  auto next_item = [&]() -> ostream& {
+    if (have_output)
+      *out << ", ";
+    have_output = true;
+    return *out;
+  };
+
+  if (c.num_flush) {
+    int64_t flush = (c.num_flush_kb << 10) / secs;
+    if (f)
+      f->dump_int("flush_bytes_sec", flush);
+    else
+      next_item() << byte_u_t(flush) << "/s flush";
+  }
+  if (c.num_evict) {
+    int64_t evict = (c.num_evict_kb << 10) / secs;
+    if (f)
+      f->dump_int("evict_bytes_sec", evict);
+    else
+      next_item() << byte_u_t(evict) << "/s evict";
+  }
+  if (c.num_promote) {
+    int64_t promote = c.num_promote / secs;
+    if (f)
+      f->dump_int("promote_op_per_sec", promote);
+    else
+      next_item() << si_u_t(promote) << " op/s promote";
+  }
+  if (c.num_flush_mode_low) {
+    if (f)
+      f->dump_int("num_flush_mode_low", c.num_flush_mode_low);
+    else
+      next_item() << si_u_t(c.num_flush_mode_low) << " PGs flushing";
+  }
+  if (c.num_flush_mode_high) {
+    if (f)
+      f->dump_int("num_flush_mode_high", c.num_flush_mode_high);
+    else
+      next_item() << si_u_t(c.num_flush_mode_high) << " PGs flushing (high)";
+  }
+  if (c.num_evict_mode_some) {
+    if (f)
+      f->dump_int("num_evict_mode_some", c.num_evict_mode_some);
+    else
+      next_item() << si_u_t(c.num_evict_mode_some) << " PGs evicting";
+  }
+  if (c.num_evict_mode_full) {
+    if (f)
+      f->dump_int("num_evict_mode_full", c.num_evict_mode_full);
+    else
+      next_item() << si_u_t(c.num_evict_mode_full) << " PGs evicting (full)";
+  }
+}
+
+void PGMapDigest::overall_cache_io_rate_summary(ceph::Formatter *f, ostream *out) const
+{ // Cluster-wide variant: cache I/O rate from pg_sum_delta over stamp_delta.
+ cache_io_rate_summary(f, out, pg_sum_delta, stamp_delta);
+}
+
+void PGMapDigest::pool_cache_io_rate_summary(ceph::Formatter *f, ostream *out,
+                                             uint64_t poolid) const
+{
+  // Per-pool cache I/O rate.  Pools without a recorded delta produce no
+  // output; a pool that has a delta must also have a delta timestamp.
+  const auto delta_it = per_pool_sum_delta.find(poolid);
+  if (delta_it != per_pool_sum_delta.end()) {
+    const auto stamp_it = per_pool_sum_deltas_stamps.find(delta_it->first);
+    ceph_assert(stamp_it != per_pool_sum_deltas_stamps.end());
+    cache_io_rate_summary(f, out, delta_it->second.first, stamp_it->second);
+  }
+}
+
+ceph_statfs PGMapDigest::get_statfs(OSDMap &osdmap,
+                                    std::optional<int64_t> data_pool) const
+{
+  // Build a statfs reply.  When data_pool names a pool that has stats in
+  // pg_pool_sum, the figures are scoped to that pool; in every other
+  // case they describe the whole cluster.
+  ceph_statfs statfs;
+
+  if (data_pool) {
+    const auto pool_it = pg_pool_sum.find(*data_pool);
+    if (pool_it != pg_pool_sum.end()) {
+      const object_stat_sum_t& sum = pool_it->second.stats.sum;
+      // byte counts are converted to KB by shifting
+      statfs.kb_used = (sum.num_bytes >> 10);
+      statfs.kb_avail = get_pool_free_space(osdmap, *data_pool) >> 10;
+      statfs.num_objects = sum.num_objects;
+      statfs.kb = statfs.kb_used + statfs.kb_avail;
+      return statfs;
+    }
+  }
+
+  // cluster-wide figures; these are in KB.
+  statfs.kb = osd_sum.statfs.kb();
+  statfs.kb_used = osd_sum.statfs.kb_used_raw();
+  statfs.kb_avail = osd_sum.statfs.kb_avail();
+  statfs.num_objects = pg_sum.stats.sum.num_objects;
+  return statfs;
+}
+
+void PGMapDigest::dump_pool_stats_full(
+  const OSDMap &osd_map,
+  stringstream *ss,
+  ceph::Formatter *f,
+  bool verbose) const
+{
+  // Emit per-pool usage statistics, either as a "pools" array through
+  // the formatter or as a text table written to *ss.  Only pools that
+  // are present in the OSDMap *and* have an entry in pg_pool_sum are
+  // listed.
+  TextTable tbl;
+
+  if (f) {
+    f->open_array_section("pools");
+  } else {
+    const auto right = [&tbl](const char *heading) {
+      tbl.define_column(heading, TextTable::RIGHT, TextTable::RIGHT);
+    };
+    tbl.define_column("POOL", TextTable::LEFT, TextTable::LEFT);
+    right("ID");
+    right("PGS");
+    right("STORED");
+    if (verbose) {
+      right("(DATA)");
+      right("(OMAP)");
+    }
+    right("OBJECTS");
+    right("USED");
+    if (verbose) {
+      right("(DATA)");
+      right("(OMAP)");
+    }
+    right("%USED");
+    right("MAX AVAIL");
+
+    if (verbose) {
+      right("QUOTA OBJECTS");
+      right("QUOTA BYTES");
+      right("DIRTY");
+      right("USED COMPR");
+      right("UNDER COMPR");
+    }
+  }
+
+  // Available space is a property of the crush rule, so compute it once
+  // per rule and reuse it for every pool sharing that rule.
+  map<int,uint64_t> avail_by_rule;
+  for (const auto& entry : osd_map.get_pools()) {
+    const int64_t pool_id = entry.first;
+    if (pool_id < 0)
+      continue;
+    const auto stat_it = pg_pool_sum.find(pool_id);
+    if (stat_it == pg_pool_sum.end())
+      continue;
+    const pool_stat_t &stat = stat_it->second;
+
+    const string& pool_name = osd_map.get_pool_name(pool_id);
+    auto pool_pg_num = osd_map.get_pg_num(pool_id);
+    const pg_pool_t *pool = osd_map.get_pg_pool(pool_id);
+    const int ruleno = pool->get_crush_rule();
+
+    int64_t avail;
+    const auto cached = avail_by_rule.find(ruleno);
+    if (cached != avail_by_rule.end()) {
+      avail = cached->second;
+    } else {
+      // FIXME: we don't guarantee avail_space_by_rule is up-to-date before this function is invoked
+      avail = get_rule_avail(ruleno);
+      if (avail < 0)
+        avail = 0;
+      avail_by_rule[ruleno] = avail;
+    }
+
+    if (f) {
+      f->open_object_section("pool");
+      f->dump_string("name", pool_name);
+      f->dump_int("id", pool_id);
+      f->open_object_section("stats");
+    } else {
+      tbl << pool_name
+          << pool_id
+          << pool_pg_num;
+    }
+
+    float raw_used_rate = osd_map.pool_raw_used_rate(pool_id);
+    bool per_pool = use_per_pool_stats();
+    bool per_pool_omap = use_per_pool_omap_stats();
+    dump_object_stat_sum(tbl, f, stat, avail, raw_used_rate, verbose, per_pool,
+                         per_pool_omap, pool);
+
+    if (f) {
+      f->close_section();  // stats
+      f->close_section();  // pool
+    } else {
+      tbl << TextTable::endrow;
+    }
+  }
+
+  if (f) {
+    f->close_section();
+    return;
+  }
+  ceph_assert(ss != nullptr);
+  *ss << "--- POOLS ---\n";
+  *ss << tbl;
+}
+
+void PGMapDigest::dump_cluster_stats(stringstream *ss,
+                                     ceph::Formatter *f,
+                                     bool verbose) const
+{
+  // Emit raw storage usage for the whole cluster and per device class:
+  // as "stats" / "stats_by_class" objects through the formatter, or as
+  // a "RAW STORAGE" text table written to *ss.  `verbose` is accepted
+  // but not consulted here.
+  if (!f) {
+    ceph_assert(ss != nullptr);
+    TextTable tbl;
+    tbl.define_column("CLASS", TextTable::LEFT, TextTable::LEFT);
+    tbl.define_column("SIZE", TextTable::RIGHT, TextTable::RIGHT);
+    tbl.define_column("AVAIL", TextTable::RIGHT, TextTable::RIGHT);
+    tbl.define_column("USED", TextTable::RIGHT, TextTable::RIGHT);
+    tbl.define_column("RAW USED", TextTable::RIGHT, TextTable::RIGHT);
+    tbl.define_column("%RAW USED", TextTable::RIGHT, TextTable::RIGHT);
+
+    // One table row: label followed by the five usage columns.
+    const auto add_row = [&tbl](const auto& label, const auto& fs) {
+      tbl << label;
+      tbl << stringify(byte_u_t(fs.total))
+          << stringify(byte_u_t(fs.available))
+          << stringify(byte_u_t(fs.get_used()))
+          << stringify(byte_u_t(fs.get_used_raw()))
+          << percentify(fs.get_used_raw_ratio()*100.0)
+          << TextTable::endrow;
+    };
+
+    for (auto& by_class : osd_sum_by_class)
+      add_row(by_class.first, by_class.second.statfs);
+    add_row("TOTAL", osd_sum.statfs);
+
+    *ss << "--- RAW STORAGE ---\n";
+    *ss << tbl;
+    return;
+  }
+
+  // The five usage fields shared by the total and the per-class objects.
+  const auto dump_usage = [f](const auto& fs) {
+    f->dump_int("total_bytes", fs.total);
+    f->dump_int("total_avail_bytes", fs.available);
+    f->dump_int("total_used_bytes", fs.get_used());
+    f->dump_int("total_used_raw_bytes", fs.get_used_raw());
+    f->dump_float("total_used_raw_ratio", fs.get_used_raw_ratio());
+  };
+
+  f->open_object_section("stats");
+  dump_usage(osd_sum.statfs);
+  f->dump_unsigned("num_osds", osd_sum.num_osds);
+  f->dump_unsigned("num_per_pool_osds", osd_sum.num_per_pool_osds);
+  f->dump_unsigned("num_per_pool_omap_osds", osd_sum.num_per_pool_omap_osds);
+  f->close_section();
+
+  f->open_object_section("stats_by_class");
+  for (auto& by_class : osd_sum_by_class) {
+    f->open_object_section(by_class.first.c_str());
+    dump_usage(by_class.second.statfs);
+    f->close_section();
+  }
+  f->close_section();
+}
+
+void PGMapDigest::dump_object_stat_sum(
+ TextTable &tbl, ceph::Formatter *f,
+ const pool_stat_t &pool_stat, uint64_t avail,
+ float raw_used_rate, bool verbose, bool per_pool, bool per_pool_omap,
+ const pg_pool_t *pool)
+{ /* Emit one pool's usage figures, either as formatter fields (f set) or
+   * as cells appended to tbl (f null).  The text cells are appended in
+   * the order: stored, [data, omap], objects, used, [data, omap], %used,
+   * max avail, [quota objects, quota bytes, dirty, compressed used,
+   * compressed original]; bracketed groups only when verbose.
+   * `avail` is the raw available space; `pool` is dereferenced only in
+   * the verbose branches. */
+ const object_stat_sum_t &sum = pool_stat.stats.sum;
+ const store_statfs_t statfs = pool_stat.store_stats;
+
+ if (sum.num_object_copies > 0) { // scale the rate by the fraction of copies that are not degraded
+ raw_used_rate *= (float)(sum.num_object_copies - sum.num_objects_degraded) / sum.num_object_copies;
+ }
+
+ uint64_t used_data_bytes = pool_stat.get_allocated_data_bytes(per_pool);
+ uint64_t used_omap_bytes = pool_stat.get_allocated_omap_bytes(per_pool_omap);
+ uint64_t used_bytes = used_data_bytes + used_omap_bytes;
+
+ float used = 0.0; // fraction used: used_bytes / (used_bytes + avail)
+ // note avail passed in is raw_avail, calc raw_used here.
+ if (avail) {
+ used = used_bytes;
+ used /= used + avail;
+ } else if (used_bytes) { // no space left but data present: report 100%
+ used = 1.0;
+ }
+ auto avail_res = raw_used_rate ? avail / raw_used_rate : 0; // guard against a zero rate
+ // an approximation for actually stored user data
+ auto stored_data_normalized = pool_stat.get_user_data_bytes(
+ raw_used_rate, per_pool);
+ auto stored_omap_normalized = pool_stat.get_user_omap_bytes(
+ raw_used_rate, per_pool_omap);
+ auto stored_normalized = stored_data_normalized + stored_omap_normalized;
+ // same, amplified by replication or EC
+ auto stored_raw = stored_normalized * raw_used_rate;
+ if (f) {
+ f->dump_int("stored", stored_normalized);
+ if (verbose) {
+ f->dump_int("stored_data", stored_data_normalized);
+ f->dump_int("stored_omap", stored_omap_normalized);
+ }
+ f->dump_int("objects", sum.num_objects);
+ f->dump_int("kb_used", shift_round_up(used_bytes, 10));
+ f->dump_int("bytes_used", used_bytes);
+ if (verbose) {
+ f->dump_int("data_bytes_used", used_data_bytes);
+ f->dump_int("omap_bytes_used", used_omap_bytes);
+ }
+ f->dump_float("percent_used", used); // a 0..1 fraction here; the text path multiplies by 100
+ f->dump_unsigned("max_avail", avail_res);
+ if (verbose) {
+ f->dump_int("quota_objects", pool->quota_max_objects);
+ f->dump_int("quota_bytes", pool->quota_max_bytes);
+ if (pool->is_tier()) { // the dirty count is only reported for tier pools
+ f->dump_int("dirty", sum.num_objects_dirty);
+ } else {
+ f->dump_int("dirty", 0);
+ }
+ f->dump_int("rd", sum.num_rd);
+ f->dump_int("rd_bytes", sum.num_rd_kb * 1024ull);
+ f->dump_int("wr", sum.num_wr);
+ f->dump_int("wr_bytes", sum.num_wr_kb * 1024ull);
+ f->dump_int("compress_bytes_used", statfs.data_compressed_allocated);
+ f->dump_int("compress_under_bytes", statfs.data_compressed_original);
+ // Stored by user amplified by replication
+ f->dump_int("stored_raw", stored_raw);
+ f->dump_unsigned("avail_raw", avail);
+ }
+ } else {
+ tbl << stringify(byte_u_t(stored_normalized));
+ if (verbose) {
+ tbl << stringify(byte_u_t(stored_data_normalized));
+ tbl << stringify(byte_u_t(stored_omap_normalized));
+ }
+ tbl << stringify(si_u_t(sum.num_objects));
+ tbl << stringify(byte_u_t(used_bytes));
+ if (verbose) {
+ tbl << stringify(byte_u_t(used_data_bytes));
+ tbl << stringify(byte_u_t(used_omap_bytes));
+ }
+ tbl << percentify(used*100);
+ tbl << stringify(byte_u_t(avail_res));
+ if (verbose) {
+ if (pool->quota_max_objects == 0) // a quota of 0 is shown as N/A
+ tbl << "N/A";
+ else
+ tbl << stringify(si_u_t(pool->quota_max_objects));
+ if (pool->quota_max_bytes == 0)
+ tbl << "N/A";
+ else
+ tbl << stringify(byte_u_t(pool->quota_max_bytes));
+ if (pool->is_tier()) {
+ tbl << stringify(si_u_t(sum.num_objects_dirty));
+ } else {
+ tbl << "N/A";
+ }
+ tbl << stringify(byte_u_t(statfs.data_compressed_allocated));
+ tbl << stringify(byte_u_t(statfs.data_compressed_original));
+ }
+ }
+}
+
+int64_t PGMapDigest::get_pool_free_space(const OSDMap &osd_map,
+                                         int64_t poolid) const
+{
+  // Estimate the free space usable by a pool: the raw space available
+  // to the pool's crush rule divided by the pool's raw-used rate.
+  // Returns 0 when the pool is unknown to osd_map, when the rule's
+  // available space is unknown (negative), or when the rate is zero.
+  const pg_pool_t *pool = osd_map.get_pg_pool(poolid);
+  if (!pool) {
+    // NOTE(review): get_pg_pool() is expected to hand back nullptr for a
+    // pool id that is not in the map (e.g. pool stats that outlive the
+    // pool).  Report "no free space" instead of dereferencing it.
+    return 0;
+  }
+
+  const int ruleno = pool->get_crush_rule();
+  int64_t avail = get_rule_avail(ruleno);
+  if (avail < 0)
+    avail = 0;
+
+  // Same zero-rate guard as dump_object_stat_sum(): dividing by a zero
+  // rate would yield inf, and converting that to int64_t is undefined.
+  const auto raw_used_rate = osd_map.pool_raw_used_rate(poolid);
+  return raw_used_rate ? avail / raw_used_rate : 0;
+}
+
+int64_t PGMap::get_rule_avail(const OSDMap& osdmap, int ruleno) const
+{
+  // Project how many bytes can still be written through a crush rule.
+  // For every weighted OSD of the rule, the OSD's remaining space (after
+  // reserving the part above the full ratio) is divided by its weight;
+  // the smallest projection wins.  Returns a negative crush error code,
+  // 0 if the rule maps to no OSDs, or -1 if no OSD had usable stats.
+  map<int,float> weights;
+  const int r = osdmap.crush->get_rule_weight_osd_map(ruleno, &weights);
+  if (r < 0)
+    return r;
+  if (weights.empty())
+    return 0;
+
+  const float fratio = osdmap.get_full_ratio();
+
+  int64_t lowest = -1;
+  for (const auto& w : weights) {
+    const int osd = w.first;
+    const float weight = w.second;
+
+    const auto stat_it = osd_stat.find(osd);
+    if (stat_it == osd_stat.end()) {
+      if (osdmap.is_up(osd)) {
+        // This is a level 4 rather than an error, because we might have
+        // only just started, and not received the first stats message yet.
+        dout(4) << "OSD " << osd << " is up, but has no stats" << dendl;
+      }
+      continue;
+    }
+
+    const auto& fs = stat_it->second.statfs;
+    if (fs.total == 0 || weight == 0) {
+      // osd must be out, hence its stats have been zeroed
+      // (unless we somehow managed to have a disk with size 0...)
+      //
+      // if osd weight is 0, no need to calculate a projection at all.
+      continue;
+    }
+
+    const double unusable = (double)fs.kb() * (1.0 - fratio);
+    double avail = std::max(0.0, (double)fs.kb_avail() - unusable);
+    avail *= 1024.0;
+    const int64_t proj = (int64_t)(avail / (double)weight);
+    if (lowest < 0 || proj < lowest)
+      lowest = proj;
+  }
+  return lowest;
+}
+
+void PGMap::get_rules_avail(const OSDMap& osdmap,
+                            std::map<int,int64_t> *avail_map) const
+{
+  // Rebuild *avail_map as crush rule -> projected available bytes, for
+  // every rule used by a pool that has stats in pg_pool_sum.  Each rule
+  // is evaluated at most once, however many pools share it.
+  avail_map->clear();
+  // Bind by const reference: iterating by value would copy every map
+  // entry (pool id plus the whole pool description) on each pass.
+  for (const auto& p : osdmap.get_pools()) {
+    const int64_t pool_id = p.first;
+    if ((pool_id < 0) || (pg_pool_sum.count(pool_id) == 0))
+      continue;
+    const pg_pool_t *pool = osdmap.get_pg_pool(pool_id);
+    const int ruleno = pool->get_crush_rule();
+    if (avail_map->count(ruleno) == 0)
+      (*avail_map)[ruleno] = get_rule_avail(osdmap, ruleno);
+  }
+}
+
+// ---------------------
+// PGMap
+
+void PGMap::Incremental::dump(ceph::Formatter *f) const
+{
+  // Dump this incremental: header fields first, then one array section
+  // each for pg stat updates, osd stat updates, pool statfs updates,
+  // osd stat removals and pg removals.
+  f->dump_unsigned("version", version);
+  f->dump_stream("stamp") << stamp;
+  f->dump_unsigned("osdmap_epoch", osdmap_epoch);
+  f->dump_unsigned("pg_scan_epoch", pg_scan);
+
+  f->open_array_section("pg_stat_updates");
+  for (const auto& upd : pg_stat_updates) {
+    f->open_object_section("pg_stat");
+    f->dump_stream("pgid") << upd.first;
+    upd.second.dump(f);
+    f->close_section();
+  }
+  f->close_section();
+
+  f->open_array_section("osd_stat_updates");
+  for (const auto& upd : osd_stat_updates) {
+    f->open_object_section("osd_stat");
+    f->dump_int("osd", upd.first);
+    upd.second.dump(f);
+    f->close_section();
+  }
+  f->close_section();
+
+  f->open_array_section("pool_statfs_updates");
+  for (const auto& upd : pool_statfs_updates) {
+    f->open_object_section("pool_statfs");
+    f->dump_stream("poolid/osd") << upd.first;
+    upd.second.dump(f);
+    f->close_section();
+  }
+  f->close_section();
+
+  f->open_array_section("osd_stat_removals");
+  for (const auto& osd : osd_stat_rm)
+    f->dump_int("osd", osd);
+  f->close_section();
+
+  f->open_array_section("pg_removals");
+  for (const auto& pgid : pg_remove)
+    f->dump_stream("pgid") << pgid;
+  f->close_section();
+}
+
+void PGMap::Incremental::generate_test_instances(list<PGMap::Incremental*>& o)
+{
+  // Append four heap-allocated sample incrementals to o, from a
+  // default-constructed one up to one that populates every update and
+  // removal container.
+  const auto append = [&o]() -> Incremental& {
+    o.push_back(new Incremental);
+    return *o.back();
+  };
+
+  // #0: default-constructed
+  append();
+
+  // #1: version and stamp only
+  {
+    Incremental& inc = append();
+    inc.version = 1;
+    inc.stamp = utime_t(123,345);
+  }
+
+  // #2: one pg stat update and one osd stat update
+  {
+    Incremental& inc = append();
+    inc.version = 2;
+    inc.pg_stat_updates[pg_t(1,2)] = pg_stat_t();
+    inc.osd_stat_updates[5] = osd_stat_t();
+  }
+
+  // #3: everything, including removals and a pool statfs update
+  {
+    Incremental& inc = append();
+    inc.version = 3;
+    inc.osdmap_epoch = 1;
+    inc.pg_scan = 2;
+    inc.pg_stat_updates[pg_t(4,5)] = pg_stat_t();
+    inc.osd_stat_updates[6] = osd_stat_t();
+    inc.pg_remove.insert(pg_t(1,2));
+    inc.osd_stat_rm.insert(5);
+    inc.pool_statfs_updates[std::make_pair(1234,4)] = store_statfs_t();
+  }
+}
+
+// --
+
+void PGMap::apply_incremental(CephContext *cct, const Incremental& inc)
+{ /* Apply one incremental to this map.  The incremental must be exactly
+   * one version ahead.  In order: pg stat updates, pool statfs updates,
+   * osd stat updates, pg removals and osd stat removals are folded into
+   * the per-pg/per-osd tables and running sums; then the smoothed
+   * cluster delta and the per-pool deltas are refreshed; finally pools
+   * that lost their last PG are dropped.  cct may be null, in which case
+   * a smoothing window of 1 is used and the debug line is skipped. */
+ ceph_assert(inc.version == version+1);
+ version++;
+
+ pool_stat_t pg_sum_old = pg_sum; // snapshots of the sums before any change, used for the deltas below
+ mempool::pgmap::unordered_map<int32_t, pool_stat_t> pg_pool_sum_old;
+ pg_pool_sum_old = pg_pool_sum;
+
+ for (auto p = inc.pg_stat_updates.begin();
+ p != inc.pg_stat_updates.end();
+ ++p) {
+ const pg_t &update_pg(p->first);
+ auto update_pool = update_pg.pool();
+ const pg_stat_t &update_stat(p->second);
+
+ auto pg_stat_iter = pg_stat.find(update_pg);
+ pool_stat_t &pool_sum_ref = pg_pool_sum[update_pool]; // creates the pool entry if it is missing
+ if (pg_stat_iter == pg_stat.end()) {
+ pg_stat.insert(make_pair(update_pg, update_stat));
+ } else { // known pg: take the old stats out of the sums before replacing them
+ stat_pg_sub(update_pg, pg_stat_iter->second);
+ pool_sum_ref.sub(pg_stat_iter->second);
+ pg_stat_iter->second = update_stat;
+ }
+ stat_pg_add(update_pg, update_stat);
+ pool_sum_ref.add(update_stat);
+ }
+
+ for (auto p = inc.pool_statfs_updates.begin();
+ p != inc.pool_statfs_updates.end();
+ ++p) {
+ auto update_pool = p->first.first;
+ auto update_osd = p->first.second;
+ auto& statfs_inc = p->second;
+
+ auto pool_statfs_iter =
+ pool_statfs.find(std::make_pair(update_pool, update_osd));
+ if (pg_pool_sum.count(update_pool)) { // updates for pools without a pg_pool_sum entry are ignored
+ pool_stat_t &pool_sum_ref = pg_pool_sum[update_pool];
+ if (pool_statfs_iter == pool_statfs.end()) {
+ pool_statfs.emplace(std::make_pair(update_pool, update_osd), statfs_inc);
+ } else {
+ pool_sum_ref.sub(pool_statfs_iter->second);
+ pool_statfs_iter->second = statfs_inc;
+ }
+ pool_sum_ref.add(statfs_inc);
+ }
+ }
+
+ for (auto p = inc.get_osd_stat_updates().begin();
+ p != inc.get_osd_stat_updates().end();
+ ++p) {
+ int osd = p->first;
+ const osd_stat_t &new_stats(p->second);
+
+ auto t = osd_stat.find(osd);
+ if (t == osd_stat.end()) {
+ osd_stat.insert(make_pair(osd, new_stats));
+ } else {
+ stat_osd_sub(t->first, t->second);
+ t->second = new_stats;
+ }
+ stat_osd_add(osd, new_stats);
+ }
+ set<int64_t> deleted_pools; // pools whose last pg is removed by this incremental
+ for (auto p = inc.pg_remove.begin();
+ p != inc.pg_remove.end();
+ ++p) {
+ const pg_t &removed_pg(*p);
+ auto s = pg_stat.find(removed_pg);
+ bool pool_erased = false;
+ if (s != pg_stat.end()) {
+ pool_erased = stat_pg_sub(removed_pg, s->second); // true when the pool's pg count dropped to zero
+
+ // decrease pool stats if pg was removed
+ auto pool_stats_it = pg_pool_sum.find(removed_pg.pool());
+ if (pool_stats_it != pg_pool_sum.end()) {
+ pool_stats_it->second.sub(s->second);
+ }
+
+ pg_stat.erase(s);
+ if (pool_erased) {
+ deleted_pools.insert(removed_pg.pool());
+ }
+ }
+ }
+
+ for (auto p = inc.get_osd_stat_rm().begin();
+ p != inc.get_osd_stat_rm().end();
+ ++p) {
+ auto t = osd_stat.find(*p);
+ if (t != osd_stat.end()) {
+ stat_osd_sub(t->first, t->second);
+ osd_stat.erase(t);
+ }
+ for (auto i = pool_statfs.begin(); i != pool_statfs.end();) { // drop every per-pool statfs entry of the removed osd
+ if (i->first.second == *p) {
+ pg_pool_sum[i->first.first].sub(i->second);
+ i = pool_statfs.erase(i);
+ } else {
+ ++i;
+ }
+ }
+ }
+
+ // skip calculating delta while sum was not synchronized
+ if (!stamp.is_zero() && !pg_sum_old.stats.sum.is_zero()) {
+ utime_t delta_t;
+ delta_t = inc.stamp;
+ delta_t -= stamp;
+ // calculate a delta, and keep a running sum over the most recent
+ // mon_stat_smooth_intervals deltas (a single one when cct is null).
+ pool_stat_t d = pg_sum;
+ d.stats.sub(pg_sum_old.stats);
+ pg_sum_deltas.push_back(make_pair(d, delta_t));
+ stamp_delta += delta_t;
+ pg_sum_delta.stats.add(d.stats);
+ auto smooth_intervals =
+ cct ? cct->_conf.get_val<uint64_t>("mon_stat_smooth_intervals") : 1;
+ while (pg_sum_deltas.size() > smooth_intervals) { // expire the oldest deltas beyond the window
+ pg_sum_delta.stats.sub(pg_sum_deltas.front().first.stats);
+ stamp_delta -= pg_sum_deltas.front().second;
+ pg_sum_deltas.pop_front();
+ }
+ }
+ stamp = inc.stamp;
+
+ update_pool_deltas(cct, inc.stamp, pg_pool_sum_old);
+
+ for (auto p : deleted_pools) {
+ if (cct)
+ dout(20) << " deleted pool " << p << dendl;
+ deleted_pool(p);
+ }
+
+ if (inc.osdmap_epoch) // a zero epoch leaves the previously recorded value untouched
+ last_osdmap_epoch = inc.osdmap_epoch;
+ if (inc.pg_scan)
+ last_pg_scan = inc.pg_scan;
+}
+
+void PGMap::calc_stats()
+{
+  // Recompute the derived counters and sums from scratch, using the
+  // primary tables pg_stat, pool_statfs and osd_stat.
+
+  // reset scalar counters
+  num_pg = 0;
+  num_pg_active = 0;
+  num_pg_unknown = 0;
+  num_osd = 0;
+
+  // reset aggregates and per-key tallies
+  pg_pool_sum.clear();
+  num_pg_by_pool.clear();
+  pg_by_osd.clear();
+  pg_sum = pool_stat_t();
+  osd_sum = osd_stat_t();
+  osd_sum_by_class.clear();
+  num_pg_by_state.clear();
+  num_pg_by_pool_state.clear();
+  num_pg_by_osd.clear();
+
+  for (const auto& entry : pg_stat) {
+    auto pgid = entry.first;
+    stat_pg_add(pgid, entry.second);
+    pg_pool_sum[pgid.pool()].add(entry.second);
+  }
+
+  // pool_statfs is keyed by (pool, osd); fold each entry into its pool
+  for (const auto& entry : pool_statfs) {
+    auto pool = entry.first.first;
+    pg_pool_sum[pool].add(entry.second);
+  }
+
+  for (const auto& entry : osd_stat) {
+    stat_osd_add(entry.first, entry.second);
+  }
+}
+
+void PGMap::stat_pg_add(const pg_t &pgid, const pg_stat_t &s,
+                        bool sameosds)
+{
+  // Account for one PG's stats in the running sums and tallies.  When
+  // sameosds is set, the per-OSD bookkeeping (blocked_by, pg_by_osd,
+  // num_pg_by_osd) is left untouched.
+  const auto pool = pgid.pool();
+  pg_sum.add(s);
+
+  ++num_pg;
+  ++num_pg_by_state[s.state];
+  ++num_pg_by_pool_state[pool][s.state];
+  ++num_pg_by_pool[pool];
+
+  // Track PGs that are being created and are not the result of a split.
+  const bool creating =
+    (s.state & PG_STATE_CREATING) && s.parent_split_bits == 0;
+  if (creating) {
+    creating_pgs.insert(pgid);
+    if (s.acting_primary >= 0) {
+      creating_pgs_by_osd_epoch[s.acting_primary][s.mapping_epoch].insert(pgid);
+    }
+  }
+
+  if (s.state & PG_STATE_ACTIVE)
+    ++num_pg_active;
+  // a state of 0 is what this map counts as "unknown"
+  if (s.state == 0)
+    ++num_pg_unknown;
+
+  if (sameosds)
+    return;
+
+  for (const auto& blocker : s.blocked_by)
+    ++blocked_by_sum[blocker];
+
+  for (const auto& osd : s.acting) {
+    pg_by_osd[osd].insert(pgid);
+    num_pg_by_osd[osd].acting++;
+  }
+  // OSDs in the up set that were not already recorded via the acting set
+  for (const auto& osd : s.up) {
+    auto& pgs = pg_by_osd[osd];
+    if (pgs.find(pgid) == pgs.end()) {
+      pgs.insert(pgid);
+      num_pg_by_osd[osd].up_not_acting++;
+    }
+  }
+
+  if (s.up_primary >= 0)
+    num_pg_by_osd[s.up_primary].primary++;
+}
+
+bool PGMap::stat_pg_sub(const pg_t &pgid, const pg_stat_t &s,
+ bool sameosds)
+{ /* Remove one PG's stats from the running sums and tallies; the inverse
+   * of stat_pg_add().  Tally entries that reach zero are erased from
+   * their maps.  Returns true when this was the last PG counted for the
+   * pool.  When sameosds is set, the per-OSD bookkeeping is left
+   * untouched. */
+ bool pool_erased = false;
+ pg_sum.sub(s);
+
+ num_pg--;
+ int end = --num_pg_by_state[s.state];
+ ceph_assert(end >= 0);
+ if (end == 0)
+ num_pg_by_state.erase(s.state);
+ if (--num_pg_by_pool_state[pgid.pool()][s.state] == 0) {
+ num_pg_by_pool_state[pgid.pool()].erase(s.state);
+ }
+ end = --num_pg_by_pool[pgid.pool()];
+ if (end == 0) { // no PGs left in this pool; the caller decides what to do about it
+ pool_erased = true;
+ }
+
+ if ((s.state & PG_STATE_CREATING) &&
+ s.parent_split_bits == 0) {
+ creating_pgs.erase(pgid);
+ if (s.acting_primary >= 0) {
+ map<epoch_t,set<pg_t> >& r = creating_pgs_by_osd_epoch[s.acting_primary];
+ r[s.mapping_epoch].erase(pgid);
+ if (r[s.mapping_epoch].empty()) // prune empty epoch buckets, then empty osd buckets
+ r.erase(s.mapping_epoch);
+ if (r.empty())
+ creating_pgs_by_osd_epoch.erase(s.acting_primary);
+ }
+ }
+
+ if (s.state & PG_STATE_ACTIVE) {
+ --num_pg_active;
+ }
+ if (s.state == 0) {
+ --num_pg_unknown;
+ }
+
+ if (sameosds)
+ return pool_erased;
+
+ for (auto p = s.blocked_by.begin();
+ p != s.blocked_by.end();
+ ++p) {
+ auto q = blocked_by_sum.find(*p);
+ ceph_assert(q != blocked_by_sum.end());
+ --q->second;
+ if (q->second == 0)
+ blocked_by_sum.erase(q);
+ }
+
+ set<int32_t> actingset; // remembered so the up loop below can skip OSDs that are also acting
+ for (auto p = s.acting.begin(); p != s.acting.end(); ++p) {
+ actingset.insert(*p);
+ auto& oset = pg_by_osd[*p];
+ oset.erase(pgid);
+ if (oset.empty())
+ pg_by_osd.erase(*p);
+ auto it = num_pg_by_osd.find(*p);
+ if (it != num_pg_by_osd.end() && it->second.acting > 0) // never decrement below zero
+ it->second.acting--;
+ }
+ for (auto p = s.up.begin(); p != s.up.end(); ++p) {
+ auto& oset = pg_by_osd[*p];
+ oset.erase(pgid);
+ if (oset.empty())
+ pg_by_osd.erase(*p);
+ if (actingset.count(*p)) // counted as acting above, not as up_not_acting
+ continue;
+ auto it = num_pg_by_osd.find(*p);
+ if (it != num_pg_by_osd.end() && it->second.up_not_acting > 0)
+ it->second.up_not_acting--;
+ }
+
+ if (s.up_primary >= 0) {
+ auto it = num_pg_by_osd.find(s.up_primary);
+ if (it != num_pg_by_osd.end() && it->second.primary > 0)
+ it->second.primary--;
+ }
+ return pool_erased;
+}
+
+void PGMap::calc_purged_snaps()
+{
+  // Rebuild purged_snaps: for each pool, the intersection of the
+  // purged_snaps of all its PGs.  A pool with at least one PG in state 0
+  // gets no entry at all, whatever its other PGs report.
+  purged_snaps.clear();
+  set<int64_t> unknown;
+  for (const auto& entry : pg_stat) {
+    const auto pool = entry.first.pool();
+    const auto& st = entry.second;
+
+    if (st.state == 0) {
+      // disqualify the pool, discarding anything gathered so far
+      unknown.insert(pool);
+      purged_snaps.erase(pool);
+      continue;
+    }
+    if (unknown.count(pool))
+      continue;
+
+    auto found = purged_snaps.find(pool);
+    if (found != purged_snaps.end()) {
+      found->second.intersection_of(st.purged_snaps);
+    } else {
+      // base case: first PG seen for this pool
+      purged_snaps[pool] = st.purged_snaps;
+    }
+  }
+}
+
+void PGMap::calc_osd_sum_by_class(const OSDMap& osdmap)
+{
+  // Rebuild osd_sum_by_class: OSD stats summed per crush device class.
+  // OSDs for which crush reports no class are left out.
+  osd_sum_by_class.clear();
+  for (const auto& entry : osd_stat) {
+    const char *class_name = osdmap.crush->get_item_class(entry.first);
+    if (!class_name)
+      continue;
+    osd_sum_by_class[class_name].add(entry.second);
+  }
+}
+
+void PGMap::stat_osd_add(int osd, const osd_stat_t &s)
+{
+  // Account for one OSD's stats and record its latest sequence number.
+  ++num_osd;
+  osd_sum.add(s);
+
+  // osd_last_seq is indexed by osd id; grow it on demand.
+  const int known = (int)osd_last_seq.size();
+  if (osd >= known)
+    osd_last_seq.resize(osd + 1);
+  osd_last_seq[osd] = s.seq;
+}
+
+// Reverse of stat_osd_add(): drop OSD `osd` with stats `s` from the count
+// and from osd_sum, and reset its recorded sequence number to 0.
+//
+// @param osd  OSD id; must index an existing osd_last_seq slot
+// @param s    stats being removed
+void PGMap::stat_osd_sub(int osd, const osd_stat_t &s)
+{
+  // Validate both bounds before touching any state: the index is signed, so
+  // checking only the upper bound would let a negative id through.
+  ceph_assert(osd >= 0 && osd < (int)osd_last_seq.size());
+  num_osd--;
+  osd_sum.sub(s);
+  osd_last_seq[osd] = 0;
+}
+
+// Refresh the fields that are derived from the OSDMap and the PG stats
+// (avail_space_by_rule, osd_sum_by_class, purged_snaps), then encode the
+// PGMapDigest part of this object into bl.
+void PGMap::encode_digest(const OSDMap& osdmap, bufferlist& bl, uint64_t features)
+{
+  // derived state first, so the digest is encoded from up-to-date values
+  get_rules_avail(osdmap, &avail_space_by_rule);
+  calc_osd_sum_by_class(osdmap);
+  calc_purged_snaps();
+
+  PGMapDigest::encode(bl, features);
+}
+
+void PGMap::encode(bufferlist &bl, uint64_t features) const // Serialize the persisted PGMap fields into bl; the field order here is the same order decode() reads them back in.
+{
+ ENCODE_START(8, 8, bl); // opens the encoding block (presumably struct version 8 / compat version 8 -- confirm against the macro)
+ encode(version, bl);
+ encode(pg_stat, bl);
+ encode(osd_stat, bl, features); // this member is encoded with the feature bits
+ encode(last_osdmap_epoch, bl);
+ encode(last_pg_scan, bl);
+ encode(stamp, bl);
+ encode(pool_statfs, bl, features); // this member is encoded with the feature bits
+ ENCODE_FINISH(bl);
+}
+
+void PGMap::decode(bufferlist::const_iterator &bl) // Read back the fields written by encode(), in the same order, then call calc_stats().
+{
+ DECODE_START(8, bl);
+ decode(version, bl);
+ decode(pg_stat, bl);
+ decode(osd_stat, bl);
+ decode(last_osdmap_epoch, bl);
+ decode(last_pg_scan, bl);
+ decode(stamp, bl);
+ decode(pool_statfs, bl);
+ DECODE_FINISH(bl);
+
+ calc_stats(); // not part of the encoding; presumably rebuilds the summaries derived from the decoded fields -- confirm
+}
+
+// Structured dump of the whole map: basic fields, full (non-brief) per-PG
+// stats, per-pool stats and per-OSD stats, in that order.
+//
+// @param f         formatter receiving the output
+// @param with_net  forwarded to the per-OSD stats dump
+void PGMap::dump(ceph::Formatter *f, bool with_net) const
+{
+  constexpr bool brief = false;
+
+  dump_basic(f);
+  dump_pg_stats(f, brief);
+  dump_pool_stats(f);
+  dump_osd_stats(f, with_net);
+}
+
+// Emit the map's scalar header fields, the overall PG and OSD stat sums,
+// and finally the PG stats delta section.
+void PGMap::dump_basic(ceph::Formatter *f) const
+{
+  f->dump_unsigned("version", version);
+  f->dump_stream("stamp") << stamp;
+  f->dump_unsigned("last_osdmap_epoch", last_osdmap_epoch);
+  f->dump_unsigned("last_pg_scan", last_pg_scan);
+
+  {
+    f->open_object_section("pg_stats_sum");
+    pg_sum.dump(f);
+    f->close_section();
+  }
+  {
+    f->open_object_section("osd_stats_sum");
+    osd_sum.dump(f);
+    f->close_section();
+  }
+
+  dump_delta(f);
+}
+
+// Emit the "pg_stats_delta" object: the aggregated PG stats delta followed
+// by the time span ("stamp_delta") it covers.
+void PGMap::dump_delta(ceph::Formatter *f) const
+{
+  f->open_object_section("pg_stats_delta");
+  {
+    pg_sum_delta.dump(f);
+    f->dump_stream("stamp_delta") << stamp_delta;
+  }
+  f->close_section();
+}
+
+// Emit one "pg_stat" object per PG inside a "pg_stats" array.
+//
+// @param f      formatter receiving the output
+// @param brief  use the PG's brief dump instead of the full one
+void PGMap::dump_pg_stats(ceph::Formatter *f, bool brief) const
+{
+  f->open_array_section("pg_stats");
+  for (const auto& [pgid, st] : pg_stat) {
+    f->open_object_section("pg_stat");
+    f->dump_stream("pgid") << pgid;
+    if (!brief) {
+      st.dump(f);
+    } else {
+      st.dump_brief(f);
+    }
+    f->close_section();
+  }
+  f->close_section();
+}
+
+// Emit a "pgs" object with one sub-object per PG, keyed by the stringified
+// pgid, carrying the byte counters, reported epoch and state string.
+void PGMap::dump_pg_progress(ceph::Formatter *f) const
+{
+  f->open_object_section("pgs");
+  for (const auto& [pgid, st] : pg_stat) {
+    const std::string section_name = stringify(pgid);
+    const auto& sum = st.stats.sum;
+
+    f->open_object_section(section_name.c_str());
+    f->dump_int("num_bytes_recovered", sum.num_bytes_recovered);
+    f->dump_int("num_bytes", sum.num_bytes);
+    f->dump_unsigned("reported_epoch", st.reported_epoch);
+    f->dump_string("state", pg_state_string(st.state));
+    f->close_section();
+  }
+  f->close_section();
+}
+
+// Emit one "pool_stat" object per entry of pg_pool_sum inside a
+// "pool_stats" array. "num_pg" is included only for pools that have an
+// entry in num_pg_by_pool.
+void PGMap::dump_pool_stats(ceph::Formatter *f) const
+{
+  f->open_array_section("pool_stats");
+  for (const auto& [poolid, pool_sum] : pg_pool_sum) {
+    f->open_object_section("pool_stat");
+    f->dump_int("poolid", poolid);
+    if (auto pg_count = num_pg_by_pool.find(poolid);
+        pg_count != num_pg_by_pool.end()) {
+      f->dump_unsigned("num_pg", pg_count->second);
+    }
+    pool_sum.dump(f);
+    f->close_section();
+  }
+  f->close_section();
+}
+
+// Emit two arrays: "osd_stats" with one object per OSD, and "pool_statfs"
+// with one object per (pool, osd) entry of pool_statfs.
+//
+// @param f         formatter receiving the output
+// @param with_net  forwarded to each OSD's stat dump
+void PGMap::dump_osd_stats(ceph::Formatter *f, bool with_net) const
+{
+  f->open_array_section("osd_stats");
+  for (const auto& [osd, st] : osd_stat) {
+    f->open_object_section("osd_stat");
+    f->dump_int("osd", osd);
+    st.dump(f, with_net);
+    f->close_section();
+  }
+  f->close_section();
+
+  f->open_array_section("pool_statfs");
+  for (const auto& [key, statfs] : pool_statfs) {
+    // key is a (pool id, osd id) pair
+    f->open_object_section("item");
+    f->dump_int("poolid", key.first);
+    f->dump_int("osd", key.second);
+    statfs.dump(f);
+    f->close_section();
+  }
+  f->close_section();
+}
+
+// Emit one "osd_ping_time" object per OSD (its id plus its ping time dump)
+// inside an "osd_ping_times" array.
+void PGMap::dump_osd_ping_times(ceph::Formatter *f) const
+{
+  f->open_array_section("osd_ping_times");
+  for (const auto& entry : osd_stat) {
+    f->open_object_section("osd_ping_time");
+    f->dump_int("osd", entry.first);
+    entry.second.dump_ping_time(f);
+    f->close_section();
+  }
+  f->close_section();
+}
+
+// note: dump_pg_stats_plain() is static
+//
+// Render the given PG stats as a text table on ss. In brief mode only the
+// pgid, state and the up/acting sets with their primaries are shown;
+// otherwise the full per-PG column set is printed.
+void PGMap::dump_pg_stats_plain(
+  ostream& ss,
+  const mempool::pgmap::unordered_map<pg_t, pg_stat_t>& pg_stats,
+  bool brief)
+{
+  struct column_t {
+    const char *heading;
+    bool left;  // column body is LEFT aligned when set, RIGHT otherwise
+  };
+  static const column_t brief_columns[] = {
+    {"PG_STAT", true},
+    {"STATE", false},
+    {"UP", false},
+    {"UP_PRIMARY", false},
+    {"ACTING", false},
+    {"ACTING_PRIMARY", false},
+  };
+  static const column_t full_columns[] = {
+    {"PG_STAT", true},
+    {"OBJECTS", false},
+    {"MISSING_ON_PRIMARY", false},
+    {"DEGRADED", false},
+    {"MISPLACED", false},
+    {"UNFOUND", false},
+    {"BYTES", false},
+    {"OMAP_BYTES*", false},
+    {"OMAP_KEYS*", false},
+    {"LOG", false},
+    {"LOG_DUPS", false},
+    {"DISK_LOG", false},
+    {"STATE", false},
+    {"STATE_STAMP", false},
+    {"VERSION", false},
+    {"REPORTED", false},
+    {"UP", false},
+    {"UP_PRIMARY", false},
+    {"ACTING", false},
+    {"ACTING_PRIMARY", false},
+    {"LAST_SCRUB", false},
+    {"SCRUB_STAMP", false},
+    {"LAST_DEEP_SCRUB", false},
+    {"DEEP_SCRUB_STAMP", false},
+    {"SNAPTRIMQ_LEN", false},
+    {"LAST_SCRUB_DURATION", false},
+    {"SCRUB_SCHEDULING", true},
+    {"OBJECTS_SCRUBBED", false},
+    {"OBJECTS_TRIMMED", false},
+  };
+
+  TextTable tab;
+  auto define = [&tab](const column_t& c) {
+    // headings are always LEFT aligned; only the body alignment varies
+    tab.define_column(c.heading, TextTable::LEFT,
+                      c.left ? TextTable::LEFT : TextTable::RIGHT);
+  };
+  if (brief) {
+    for (const auto& c : brief_columns)
+      define(c);
+  } else {
+    for (const auto& c : full_columns)
+      define(c);
+  }
+
+  for (const auto& [pg, st] : pg_stats) {
+    if (!brief) {
+      ostringstream reported;
+      reported << st.reported_epoch << ":" << st.reported_seq;
+      const auto& sum = st.stats.sum;
+
+      tab << pg
+          << sum.num_objects
+          << sum.num_objects_missing_on_primary
+          << sum.num_objects_degraded
+          << sum.num_objects_misplaced
+          << sum.num_objects_unfound
+          << sum.num_bytes
+          << sum.num_omap_bytes
+          << sum.num_omap_keys
+          << st.log_size
+          << st.log_dups_size
+          << st.ondisk_log_size
+          << pg_state_string(st.state)
+          << st.last_change
+          << st.version
+          << reported.str()
+          << pg_vector_string(st.up)
+          << st.up_primary
+          << pg_vector_string(st.acting)
+          << st.acting_primary
+          << st.last_scrub
+          << st.last_scrub_stamp
+          << st.last_deep_scrub
+          << st.last_deep_scrub_stamp
+          << st.snaptrimq_len
+          << st.last_scrub_duration
+          << st.dump_scrub_schedule()
+          << st.objects_scrubbed
+          << st.objects_trimmed
+          << TextTable::endrow;
+      continue;
+    }
+
+    tab << pg
+        << pg_state_string(st.state)
+        << st.up
+        << st.up_primary
+        << st.acting
+        << st.acting_primary
+        << TextTable::endrow;
+  }
+
+  ss << tab;
+}
+
+// Plain-text dump of the whole map: basic fields, the full per-PG table,
+// then the per-pool rows and the PG sum row (both without headers), and
+// finally the per-OSD table.
+void PGMap::dump(ostream& ss) const
+{
+  constexpr bool brief = false;
+  constexpr bool with_header = false;
+
+  dump_basic(ss);
+  dump_pg_stats(ss, brief);
+  dump_pool_stats(ss, with_header);
+  dump_pg_sum_stats(ss, with_header);
+  dump_osd_stats(ss);
+}
+
+// Write the map's scalar header fields to ss, one "name value" pair per
+// line.
+void PGMap::dump_basic(ostream& ss) const
+{
+  ss << "version " << version << std::endl
+     << "stamp " << stamp << std::endl
+     << "last_osdmap_epoch " << last_osdmap_epoch << std::endl
+     << "last_pg_scan " << last_pg_scan << std::endl;
+}
+
+// Plain-text table of this map's own per-PG stats; see
+// dump_pg_stats_plain() for what the brief flag selects.
+void PGMap::dump_pg_stats(ostream& ss, bool brief) const
+{
+  const auto& all_pgs = pg_stat;
+  dump_pg_stats_plain(ss, all_pgs, brief);
+}
+
+// Plain-text table with one row of summed stats per pool.
+//
+// @param ss      output stream
+// @param header  print the column headings; when false the same columns
+//                are defined with empty headings
+void PGMap::dump_pool_stats(ostream& ss, bool header) const
+{
+  static const char *const headings[] = {
+    "POOLID",
+    "OBJECTS",
+    "MISSING_ON_PRIMARY",
+    "DEGRADED",
+    "MISPLACED",
+    "UNFOUND",
+    "BYTES",
+    "OMAP_BYTES*",
+    "OMAP_KEYS*",
+    "LOG",
+    "DISK_LOG",
+  };
+
+  TextTable tab;
+
+  // only the first column is LEFT aligned, all others are RIGHT aligned
+  bool first = true;
+  for (const char *heading : headings) {
+    tab.define_column(header ? heading : "",
+                      TextTable::LEFT,
+                      first ? TextTable::LEFT : TextTable::RIGHT);
+    first = false;
+  }
+
+  for (const auto& [poolid, pool_sum] : pg_pool_sum) {
+    const auto& sum = pool_sum.stats.sum;
+    tab << poolid
+        << sum.num_objects
+        << sum.num_objects_missing_on_primary
+        << sum.num_objects_degraded
+        << sum.num_objects_misplaced
+        << sum.num_objects_unfound
+        << sum.num_bytes
+        << sum.num_omap_bytes
+        << sum.num_omap_keys
+        << pool_sum.log_size
+        << pool_sum.ondisk_log_size
+        << TextTable::endrow;
+  }
+
+  ss << tab;
+}
+
+// Plain-text table with a single "sum" row holding the stats summed over
+// all PGs.
+//
+// @param ss      output stream
+// @param header  print the column headings; when false the same columns
+//                are defined with empty headings
+void PGMap::dump_pg_sum_stats(ostream& ss, bool header) const
+{
+  static const char *const headings[] = {
+    "PG_STAT",
+    "OBJECTS",
+    "MISSING_ON_PRIMARY",
+    "DEGRADED",
+    "MISPLACED",
+    "UNFOUND",
+    "BYTES",
+    "OMAP_BYTES*",
+    "OMAP_KEYS*",
+    "LOG",
+    "DISK_LOG",
+  };
+
+  TextTable tab;
+
+  // only the first column is LEFT aligned, all others are RIGHT aligned
+  bool first = true;
+  for (const char *heading : headings) {
+    tab.define_column(header ? heading : "",
+                      TextTable::LEFT,
+                      first ? TextTable::LEFT : TextTable::RIGHT);
+    first = false;
+  }
+
+  const auto& sum = pg_sum.stats.sum;
+  tab << "sum"
+      << sum.num_objects
+      << sum.num_objects_missing_on_primary
+      << sum.num_objects_degraded
+      << sum.num_objects_misplaced
+      << sum.num_objects_unfound
+      << sum.num_bytes
+      << sum.num_omap_bytes
+      << sum.num_omap_keys
+      << pg_sum.log_size
+      << pg_sum.ondisk_log_size
+      << TextTable::endrow;
+
+  ss << tab;
+}
+
+// Plain-text per-OSD table (space usage, heartbeat peers, PG counts)
+// followed by a "sum" row that fills only the space-usage columns.
+void PGMap::dump_osd_stats(ostream& ss) const
+{
+  static const char *const headings[] = {
+    "OSD_STAT",
+    "USED",
+    "AVAIL",
+    "USED_RAW",
+    "TOTAL",
+    "HB_PEERS",
+    "PG_SUM",
+    "PRIMARY_PG_SUM",
+  };
+
+  TextTable tab;
+
+  // only the first column is LEFT aligned, all others are RIGHT aligned
+  bool first = true;
+  for (const char *heading : headings) {
+    tab.define_column(heading, TextTable::LEFT,
+                      first ? TextTable::LEFT : TextTable::RIGHT);
+    first = false;
+  }
+
+  for (const auto& [osd, st] : osd_stat) {
+    tab << osd
+        << byte_u_t(st.statfs.get_used())
+        << byte_u_t(st.statfs.available)
+        << byte_u_t(st.statfs.get_used_raw())
+        << byte_u_t(st.statfs.total)
+        << st.hb_peers
+        << get_num_pg_by_osd(osd)
+        << get_num_primary_pg_by_osd(osd)
+        << TextTable::endrow;
+  }
+
+  const auto& total = osd_sum.statfs;
+  tab << "sum"
+      << byte_u_t(total.get_used())
+      << byte_u_t(total.available)
+      << byte_u_t(total.get_used_raw())
+      << byte_u_t(total.total)
+      << TextTable::endrow;
+
+  ss << tab;
+}
+
+// Plain-text table holding just the "sum" row of space usage over all
+// OSDs.
+void PGMap::dump_osd_sum_stats(ostream& ss) const
+{
+  static const char *const headings[] = {
+    "OSD_STAT",
+    "USED",
+    "AVAIL",
+    "USED_RAW",
+    "TOTAL",
+  };
+
+  TextTable tab;
+
+  // only the first column is LEFT aligned, all others are RIGHT aligned
+  bool first = true;
+  for (const char *heading : headings) {
+    tab.define_column(heading, TextTable::LEFT,
+                      first ? TextTable::LEFT : TextTable::RIGHT);
+    first = false;
+  }
+
+  const auto& total = osd_sum.statfs;
+  tab << "sum"
+      << byte_u_t(total.get_used())
+      << byte_u_t(total.available)
+      << byte_u_t(total.get_used_raw())
+      << byte_u_t(total.total)
+      << TextTable::endrow;
+
+  ss << tab;
+}
+
+// Collect the PGs that have been in any of the requested stuck conditions
+// since before `cutoff`.
+//
+// @param types      bitmask of STUCK_* flags; must not be 0 (asserted)
+// @param cutoff     a condition counts as stuck only if its "last good"
+//                   timestamp is earlier than this
+// @param stuck_pgs  matching PGs and their stats are stored here (existing
+//                   entries are kept; same-key entries are overwritten)
+void PGMap::get_stuck_stats(
+  int types, const utime_t cutoff,
+  mempool::pgmap::unordered_map<pg_t, pg_stat_t>& stuck_pgs) const
+{
+  ceph_assert(types != 0);
+  for (const auto& [pgid, st] : pg_stat) {
+    // Earliest "last good" time among the requested conditions the PG is
+    // currently in. Starting at cutoff means anything >= cutoff is ignored.
+    utime_t earliest = cutoff;
+    auto note = [&earliest](const utime_t& since) {
+      if (since < earliest)
+        earliest = since;
+    };
+    const auto state = st.state;
+
+    // conditions defined by the absence of a state bit
+    if ((types & STUCK_INACTIVE) && !(state & PG_STATE_ACTIVE))
+      note(st.last_active);
+    if ((types & STUCK_UNCLEAN) && !(state & PG_STATE_CLEAN))
+      note(st.last_clean);
+
+    // conditions defined by the presence of a state bit
+    if ((types & STUCK_DEGRADED) && (state & PG_STATE_DEGRADED))
+      note(st.last_undegraded);
+    if ((types & STUCK_UNDERSIZED) && (state & PG_STATE_UNDERSIZED))
+      note(st.last_fullsized);
+    if ((types & STUCK_STALE) && (state & PG_STATE_STALE))
+      note(st.last_unstale);
+    if ((types & STUCK_PEERING) && (state & PG_STATE_PEERING))
+      note(st.last_peered);
+
+    if (earliest < cutoff) {
+      stuck_pgs[pgid] = st;
+    }
+  }
+}
+
+// Structured dump of the PGs selected by get_stuck_stats(types, cutoff):
+// one full "pg_stat" object per PG inside a "stuck_pg_stats" array.
+void PGMap::dump_stuck(ceph::Formatter *f, int types, utime_t cutoff) const
+{
+  mempool::pgmap::unordered_map<pg_t, pg_stat_t> stuck;
+  get_stuck_stats(types, cutoff, stuck);
+
+  f->open_array_section("stuck_pg_stats");
+  for (const auto& [pgid, st] : stuck) {
+    f->open_object_section("pg_stat");
+    f->dump_stream("pgid") << pgid;
+    st.dump(f);
+    f->close_section();
+  }
+  f->close_section();
+}
+
+// Plain-text (brief) table of the PGs selected by
+// get_stuck_stats(types, cutoff). Nothing at all is written when no PG
+// matches.
+void PGMap::dump_stuck_plain(ostream& ss, int types, utime_t cutoff) const
+{
+  mempool::pgmap::unordered_map<pg_t, pg_stat_t> stuck;
+  get_stuck_stats(types, cutoff, stuck);
+  if (stuck.empty()) {
+    return;
+  }
+  constexpr bool brief = true;
+  dump_pg_stats_plain(ss, stuck, brief);
+}
+
+// Dump the PGs that have been stuck in any of the named conditions for more
+// than `threshold` seconds.
+//
+// @param ds         receives the output (plain table or flushed formatter
+//                   content) and any error message
+// @param f          formatter to use; when null a plain-text table is
+//                   written instead
+// @param threshold  age in seconds, subtracted from the current time to
+//                   obtain the cutoff
+// @param args       condition names: "inactive", "unclean", "undersized",
+//                   "degraded", "stale", "peering"
+// @return 0 on success, -EINVAL if a name is unknown or none was given
+int PGMap::dump_stuck_pg_stats(
+  stringstream &ds,
+  ceph::Formatter *f,
+  int threshold,
+  vector<string>& args) const
+{
+  int stuck_types = 0;
+
+  for (const auto& type : args) {
+    if (type == "inactive")
+      stuck_types |= PGMap::STUCK_INACTIVE;
+    else if (type == "unclean")
+      stuck_types |= PGMap::STUCK_UNCLEAN;
+    else if (type == "undersized")
+      stuck_types |= PGMap::STUCK_UNDERSIZED;
+    else if (type == "degraded")
+      stuck_types |= PGMap::STUCK_DEGRADED;
+    else if (type == "stale")
+      stuck_types |= PGMap::STUCK_STALE;
+    else if (type == "peering")
+      stuck_types |= PGMap::STUCK_PEERING;
+    else {
+      ds << "Unknown type: " << type << std::endl;
+      return -EINVAL;
+    }
+  }
+
+  // get_stuck_stats() asserts on an empty mask, so an empty args list must
+  // be reported as a caller error here rather than aborting the process.
+  if (stuck_types == 0) {
+    ds << "No stuck type specified" << std::endl;
+    return -EINVAL;
+  }
+
+  utime_t now(ceph_clock_now());
+  utime_t cutoff = now - utime_t(threshold, 0);
+
+  if (!f) {
+    dump_stuck_plain(ds, stuck_types, cutoff);
+  } else {
+    dump_stuck(f, stuck_types, cutoff);
+    f->flush(ds);
+  }
+
+  return 0;
+}
+
+// Emit one "osd" object per OSD inside an "osd_perf_infos" array; each
+// carries the OSD id and a nested "perf_stats" object.
+void PGMap::dump_osd_perf_stats(ceph::Formatter *f) const
+{
+  f->open_array_section("osd_perf_infos");
+  for (const auto& [osd, st] : osd_stat) {
+    f->open_object_section("osd");
+    f->dump_int("id", osd);
+
+    f->open_object_section("perf_stats");
+    st.os_perf_stat.dump(f);
+    f->close_section();
+
+    f->close_section();
+  }
+  f->close_section();
+}
+// Write a text table of per-OSD commit/apply latency to *ss. The stored
+// values are in nanoseconds and are shown as whole milliseconds.
+void PGMap::print_osd_perf_stats(std::ostream *ss) const
+{
+  TextTable tab;
+  for (const char *heading : {"osd", "commit_latency(ms)", "apply_latency(ms)"}) {
+    tab.define_column(heading, TextTable::LEFT, TextTable::RIGHT);
+  }
+
+  for (const auto& [osd, st] : osd_stat) {
+    const auto& perf = st.os_perf_stat;
+    tab << osd
+        << perf.os_commit_latency_ns / 1000000ull
+        << perf.os_apply_latency_ns / 1000000ull
+        << TextTable::endrow;
+  }
+  (*ss) << tab;
+}
+
+// Emit one "osd" object per entry of blocked_by_sum (its id and its
+// "num_blocked" count) inside an "osd_blocked_by_infos" array.
+void PGMap::dump_osd_blocked_by_stats(ceph::Formatter *f) const
+{
+  f->open_array_section("osd_blocked_by_infos");
+  for (const auto& [osd, num_blocked] : blocked_by_sum) {
+    f->open_object_section("osd");
+    f->dump_int("id", osd);
+    f->dump_int("num_blocked", num_blocked);
+    f->close_section();
+  }
+  f->close_section();
+}
+// Write a two-column text table (osd, num_blocked) built from
+// blocked_by_sum to *ss.
+void PGMap::print_osd_blocked_by_stats(std::ostream *ss) const
+{
+  TextTable tab;
+  for (const char *heading : {"osd", "num_blocked"}) {
+    tab.define_column(heading, TextTable::LEFT, TextTable::RIGHT);
+  }
+
+  for (const auto& [osd, num_blocked] : blocked_by_sum) {
+    tab << osd << num_blocked << TextTable::endrow;
+  }
+  (*ss) << tab;
+}
+
+
+/**
+ * Fold a new stats sample into a running, windowed delta.
+ *
+ * @param cct ceph context; may be null, in which case fixed fallbacks are
+ *            used (reset interval 10, window of 1)
+ * @param ts Timestamp of the new sample
+ * @param old_pool_sum Stats sum of the previous sample
+ * @param last_ts [in,out] Timestamp of the previous sample; set to @p ts
+ * @param current_pool_sum Stats sum of the new sample
+ * @param result_pool_delta [in,out] Sum of the stats deltas currently held
+ *                          in @p delta_avg_list
+ * @param result_ts_delta [in,out] Sum of the time deltas currently held in
+ *                        @p delta_avg_list
+ * @param delta_avg_list [in,out] Window of the most recent
+ *                       (stats delta, time delta) pairs
+ */
+void PGMap::update_delta(
+  CephContext *cct,
+  const utime_t ts,
+  const pool_stat_t& old_pool_sum,
+  utime_t *last_ts,
+  const pool_stat_t& current_pool_sum,
+  pool_stat_t *result_pool_delta,
+  utime_t *result_ts_delta,
+  mempool::pgmap::list<pair<pool_stat_t,utime_t> > *delta_avg_list)
+{
+  // Time elapsed since the previous sample; @p ts then becomes the new
+  // reference point.
+  utime_t elapsed = ts;
+  elapsed -= *last_ts;
+  *last_ts = ts;
+
+  // Clamp the interval (quick start when there was no update for a long
+  // period).
+  const utime_t max_elapsed(
+    2 * (cct ? cct->_conf->mon_delta_reset_interval : 10), 0);
+  elapsed = std::min(elapsed, max_elapsed);
+
+  // Stats delta of this sample: the current sum minus the previous one.
+  pool_stat_t delta = current_pool_sum;
+  delta.stats.sub(old_pool_sum.stats);
+
+  // Only record the sample once the previous sum was populated; an all-zero
+  // previous sum means the sums were not synchronized yet.
+  if (!old_pool_sum.stats.sum.is_zero()) {
+    delta_avg_list->push_back(make_pair(delta, elapsed));
+    *result_ts_delta += elapsed;
+    result_pool_delta->stats.add(delta.stats);
+  }
+
+  // Drop the oldest samples beyond the smoothing window, taking their
+  // contribution back out of the running totals.
+  const size_t window =
+    cct ? cct->_conf.get_val<uint64_t>("mon_stat_smooth_intervals") : 1;
+  while (delta_avg_list->size() > window) {
+    const auto& oldest = delta_avg_list->front();
+    result_pool_delta->stats.sub(oldest.first.stats);
+    *result_ts_delta -= oldest.second;
+    delta_avg_list->pop_front();
+  }
+}
+
+/**
+ * Update the windowed delta of a single pool
+ *
+ * @param cct Ceph Context
+ * @param ts Timestamp for the stats being delta'ed
+ * @param pool Pool's id
+ * @param old_pool_sum Previous stats sum of that pool
+ */
+void PGMap::update_one_pool_delta(
+  CephContext *cct,
+  const utime_t ts,
+  const int64_t pool,
+  const pool_stat_t& old_pool_sum)
+{
+  // the three per-pool maps must gain their entry for a pool together
+  if (per_pool_sum_deltas.count(pool) == 0) {
+    ceph_assert(per_pool_sum_deltas_stamps.count(pool) == 0);
+    ceph_assert(per_pool_sum_delta.count(pool) == 0);
+  }
+
+  // operator[] creates any missing entry
+  auto& sum_delta = per_pool_sum_delta[pool];  // (stats delta, last timestamp)
+  auto& ts_delta = per_pool_sum_deltas_stamps[pool];
+  auto& window = per_pool_sum_deltas[pool];
+  const pool_stat_t& current_sum = pg_pool_sum[pool];
+
+  update_delta(cct, ts, old_pool_sum,
+               &sum_delta.second,  // last_ts
+               current_sum,
+               &sum_delta.first,   // result_pool_delta
+               &ts_delta,
+               &window);
+}
+
+/**
+ * Update the windowed delta of every pool present in the old sums
+ *
+ * @param cct CephContext
+ * @param ts Timestamp for the stats being delta'ed
+ * @param pg_pool_sum_old Previous per-pool stats sums, keyed by pool id
+ */
+void PGMap::update_pool_deltas(
+  CephContext *cct, const utime_t ts,
+  const mempool::pgmap::unordered_map<int32_t,pool_stat_t>& pg_pool_sum_old)
+{
+  for (const auto& [pool, old_sum] : pg_pool_sum_old) {
+    update_one_pool_delta(cct, ts, pool, old_sum);
+  }
+}
+
+// Reset the overall PG stats delta: the recorded samples, the aggregated
+// delta and its time span.
+void PGMap::clear_delta()
+{
+  pg_sum_deltas.clear();
+  pg_sum_delta = pool_stat_t();
+  stamp_delta = utime_t();
+}
+
+// Append test instances to o: a default-constructed PGMap, followed by one
+// map per generated Incremental (the first Incremental is discarded), each
+// being a copy of the previous map with that incremental applied.
+// Ownership of the appended maps passes to the caller.
+void PGMap::generate_test_instances(list<PGMap*>& o)
+{
+  o.push_back(new PGMap);
+
+  list<Incremental*> incs;
+  Incremental::generate_test_instances(incs);
+
+  // the first generated incremental is dropped without being applied
+  delete incs.front();
+  incs.pop_front();
+
+  for (; !incs.empty(); incs.pop_front()) {
+    Incremental *inc = incs.front();
+    PGMap *next = new PGMap();
+    *next = *o.back();
+    o.push_back(next);
+    next->apply_incremental(nullptr, *inc);
+    delete inc;
+  }
+}
+
+// Collect the ids of the PGs that pass all of the given filters.
+//
+// @param state    (uint64_t)-1 matches every PG, 0 matches only PGs whose
+//                 state is 0, anything else matches PGs sharing at least
+//                 one state bit
+// @param poolid   restrict to this pool; negative disables the filter
+// @param osdid    restrict to PGs for which is_acting_osd(osdid, primary)
+//                 holds; negative disables the filter
+// @param primary  forwarded to is_acting_osd()
+// @param pgs      matching pgids are inserted here
+void PGMap::get_filtered_pg_stats(uint64_t state, int64_t poolid, int64_t osdid,
+                                  bool primary, set<pg_t>& pgs) const
+{
+  const bool want_all = (state == (uint64_t)-1);
+  for (const auto& [pgid, st] : pg_stat) {
+    if (poolid >= 0 && poolid != pgid.pool())
+      continue;
+    if (osdid >= 0 && !st.is_acting_osd(osdid, primary))
+      continue;
+
+    const bool shares_bit = (st.state & state) != 0;
+    const bool both_unknown = (state == 0 && st.state == 0);
+    if (want_all || shares_bit || both_unknown) {
+      pgs.insert(pgid);
+    }
+  }
+}
+
+// Structured dump of the listed PGs: one full "pg_stat" object per pgid
+// inside a "pg_stats" array. Every pgid must be present in pg_stat, since
+// the lookup uses at().
+void PGMap::dump_filtered_pg_stats(ceph::Formatter *f, set<pg_t>& pgs) const
+{
+  f->open_array_section("pg_stats");
+  for (const auto& pgid : pgs) {
+    const pg_stat_t& st = pg_stat.at(pgid);
+    f->open_object_section("pg_stat");
+    f->dump_stream("pgid") << pgid;
+    st.dump(f);
+    f->close_section();
+  }
+  f->close_section();
+}
+
+// Plain-text table of the listed PGs. Every pgid must be present in
+// pg_stat, since the lookup uses at(). The SINCE column is the time elapsed
+// between last_change and now; UP and ACTING are printed as the OSD vector
+// followed by 'p' and the primary.
+void PGMap::dump_filtered_pg_stats(ostream& ss, set<pg_t>& pgs) const
+{
+  struct column_t {
+    const char *heading;
+    bool left;  // column body is LEFT aligned when set, RIGHT otherwise
+  };
+  static const column_t columns[] = {
+    {"PG", true},
+    {"OBJECTS", false},
+    {"DEGRADED", false},
+    {"MISPLACED", false},
+    {"UNFOUND", false},
+    {"BYTES", false},
+    {"OMAP_BYTES*", false},
+    {"OMAP_KEYS*", false},
+    {"LOG", false},
+    {"LOG_DUPS", false},
+    {"STATE", false},
+    {"SINCE", false},
+    {"VERSION", false},
+    {"REPORTED", false},
+    {"UP", false},
+    {"ACTING", false},
+    {"SCRUB_STAMP", false},
+    {"DEEP_SCRUB_STAMP", false},
+    {"LAST_SCRUB_DURATION", false},
+    {"SCRUB_SCHEDULING", true},
+  };
+
+  TextTable tab;
+  utime_t now = ceph_clock_now();
+
+  for (const auto& c : columns) {
+    tab.define_column(c.heading, TextTable::LEFT,
+                      c.left ? TextTable::LEFT : TextTable::RIGHT);
+  }
+
+  for (const auto& pgid : pgs) {
+    const pg_stat_t& st = pg_stat.at(pgid);
+    const auto& sum = st.stats.sum;
+
+    ostringstream reported;
+    reported << st.reported_epoch << ":" << st.reported_seq;
+
+    ostringstream up_col;
+    up_col << pg_vector_string(st.up) << 'p' << st.up_primary;
+
+    ostringstream acting_col;
+    acting_col << pg_vector_string(st.acting) << 'p' << st.acting_primary;
+
+    tab << pgid
+        << sum.num_objects
+        << sum.num_objects_degraded
+        << sum.num_objects_misplaced
+        << sum.num_objects_unfound
+        << sum.num_bytes
+        << sum.num_omap_bytes
+        << sum.num_omap_keys
+        << st.log_size
+        << st.log_dups_size
+        << pg_state_string(st.state)
+        << utimespan_str(now - st.last_change)
+        << st.version
+        << reported.str()
+        << up_col.str()
+        << acting_col.str()
+        << st.last_scrub_stamp
+        << st.last_deep_scrub_stamp
+        << st.last_scrub_duration
+        << st.dump_scrub_schedule()
+        << TextTable::endrow;
+  }
+
+  ss << tab;
+}
+
+// Report one pool's recovery summary, recovery rate, client IO rate and,
+// for tier pools, cache tier IO rate.
+//
+// With a formatter, everything goes into a "pool" object with one nested
+// object per topic. Without one, a plain-text paragraph is written to *rs.
+//
+// @param poolid   pool to report on
+// @param osd_map  used for the pool's name and its tier status
+// @param f        formatter, or null for plain text
+// @param rs       plain-text destination; only written when f is null
+void PGMap::dump_pool_stats_and_io_rate(int64_t poolid, const OSDMap &osd_map,
+                                        ceph::Formatter *f,
+                                        stringstream *rs) const {
+  const string& pool_name = osd_map.get_pool_name(poolid);
+  if (f) {
+    f->open_object_section("pool");
+    f->dump_string("pool_name", pool_name.c_str());
+    f->dump_int("pool_id", poolid);
+    f->open_object_section("recovery");
+  }
+
+  // plain-text body, assembled topic by topic (stays empty when f is set)
+  stringstream body;
+
+  list<string> recovery_lines;
+  pool_recovery_summary(f, &recovery_lines, poolid);
+  if (!f) {
+    for (auto &line : recovery_lines)
+      body << " " << line << "\n";
+  }
+  if (f) {
+    f->close_section(); // object section recovery
+    f->open_object_section("recovery_rate");
+  }
+
+  // scratch stream reused for each of the rate summaries below
+  ostringstream rate;
+  pool_recovery_rate_summary(f, &rate, poolid);
+  if (!f && !rate.str().empty())
+    body << " recovery io " << rate.str() << "\n";
+  if (f) {
+    f->close_section(); // object section recovery_rate
+    f->open_object_section("client_io_rate");
+  }
+
+  rate.clear();
+  rate.str("");
+  pool_client_io_rate_summary(f, &rate, poolid);
+  if (!f && !rate.str().empty())
+    body << " client io " << rate.str() << "\n";
+
+  // dump cache tier IO rate for cache pool
+  const pg_pool_t *pool = osd_map.get_pg_pool(poolid);
+  if (pool->is_tier()) {
+    if (f) {
+      f->close_section(); // object section client_io_rate
+      f->open_object_section("cache_io_rate");
+    }
+    rate.clear();
+    rate.str("");
+    pool_cache_io_rate_summary(f, &rate, poolid);
+    if (!f && !rate.str().empty())
+      body << " cache tier io " << rate.str() << "\n";
+  }
+
+  if (!f) {
+    *rs << "pool " << pool_name << " id " << poolid << "\n";
+    if (body.str().empty())
+      *rs << " nothing is going on\n\n";
+    else
+      *rs << body.str() << "\n";
+    return;
+  }
+
+  // closes whichever rate section is still open (client_io_rate, or
+  // cache_io_rate for tier pools), then the enclosing pool object
+  f->close_section();
+  f->close_section(); // object section pool
+}
+
+// Get crush parentage for an osd (skip root)
+//
+// Returns the names of the OSD's crush ancestors, leaving out the bucket of
+// type "root" and the bucket whose type equals the configured
+// mon_osd_reporter_subtree_level.
+set<std::string> PGMap::osd_parentage(const OSDMap& osdmap, int id) const
+{
+  auto skip_level = g_conf().get_val<string>("mon_osd_reporter_subtree_level");
+  auto loc = osdmap.crush->get_full_location(id);
+
+  set<std::string> parents;
+  for (auto& [bucket_type, bucket_name] : loc) {
+    // Should we show the root? Might not be too informative like "default"
+    if (bucket_type == "root" || bucket_type == skip_level) {
+      continue;
+    }
+    parents.insert(bucket_name);
+  }
+  return parents;
+}
+
+void PGMap::get_health_checks(
+ CephContext *cct,
+ const OSDMap& osdmap,
+ health_check_map_t *checks) const
+{
+ utime_t now = ceph_clock_now();
+ const auto max = cct->_conf.get_val<uint64_t>("mon_health_max_detail");
+ const auto& pools = osdmap.get_pools();
+
+ typedef enum pg_consequence_t {
+ UNAVAILABLE = 1, // Client IO to the pool may block
+ DEGRADED = 2, // Fewer than the requested number of replicas are present
+ BACKFILL_FULL = 3, // Backfill is blocked for space considerations
+ // This may or may not be a deadlock condition.
+ DAMAGED = 4, // The data may be missing or inconsistent on disk and
+ // requires repair
+ RECOVERY_FULL = 5 // Recovery is blocked because OSDs are full
+ } pg_consequence_t;
+
+ // For a given PG state, how should it be reported at the pool level?
+ class PgStateResponse {
+ public:
+ pg_consequence_t consequence;
+ typedef std::function< utime_t(const pg_stat_t&) > stuck_cb;
+ stuck_cb stuck_since;
+ bool invert;
+
+ PgStateResponse(const pg_consequence_t& c, stuck_cb&& s)
+ : consequence(c), stuck_since(std::move(s)), invert(false)
+ {
+ }
+
+ PgStateResponse(const pg_consequence_t& c, stuck_cb&& s, bool i)
+ : consequence(c), stuck_since(std::move(s)), invert(i)
+ {
+ }
+ };
+
+ // Record the PG state counts that contributed to a reported pool state
+ class PgCauses {
+ public:
+ // Map of PG_STATE_* to number of pgs in that state.
+ std::map<unsigned, unsigned> states;
+
+ // List of all PG IDs that had a state contributing
+ // to this health condition.
+ std::set<pg_t> pgs;
+
+ std::map<pg_t, std::string> pg_messages;
+ };
+
+ // Map of PG state to how to respond to it
+ std::map<unsigned, PgStateResponse> state_to_response = {
+ // Immediate reports
+ { PG_STATE_INCONSISTENT, {DAMAGED, {}} },
+ { PG_STATE_INCOMPLETE, {UNAVAILABLE, {}} },
+ { PG_STATE_SNAPTRIM_ERROR, {DAMAGED, {}} },
+ { PG_STATE_RECOVERY_UNFOUND, {DAMAGED, {}} },
+ { PG_STATE_BACKFILL_UNFOUND, {DAMAGED, {}} },
+ { PG_STATE_BACKFILL_TOOFULL, {BACKFILL_FULL, {}} },
+ { PG_STATE_RECOVERY_TOOFULL, {RECOVERY_FULL, {}} },
+ { PG_STATE_DEGRADED, {DEGRADED, {}} },
+ { PG_STATE_DOWN, {UNAVAILABLE, {}} },
+ // Delayed (wait until stuck) reports
+ { PG_STATE_PEERING, {UNAVAILABLE, [](const pg_stat_t &p){return p.last_peered;} } },
+ { PG_STATE_UNDERSIZED, {DEGRADED, [](const pg_stat_t &p){return p.last_fullsized;} } },
+ { PG_STATE_STALE, {UNAVAILABLE, [](const pg_stat_t &p){return p.last_unstale;} } },
+ // Delayed and inverted reports
+ { PG_STATE_ACTIVE, {UNAVAILABLE, [](const pg_stat_t &p){return p.last_active;}, true} }
+ };
+
+ // Specialized state printer that takes account of inversion of
+ // ACTIVE, CLEAN checks.
+ auto state_name = [](const uint64_t &state) {
+ // Special cases for the states that are inverted checks
+ if (state == PG_STATE_CLEAN) {
+ return std::string("unclean");
+ } else if (state == PG_STATE_ACTIVE) {
+ return std::string("inactive");
+ } else {
+ return pg_state_string(state);
+ }
+ };
+
+ // Map of what is wrong to information about why, implicitly also stores
+ // the list of what is wrong.
+ std::map<pg_consequence_t, PgCauses> detected;
+
+ // Optimisation: trim down the number of checks to apply based on
+ // the summary counters
+ std::map<unsigned, PgStateResponse> possible_responses;
+ for (const auto &i : num_pg_by_state) {
+ for (const auto &j : state_to_response) {
+ if (!j.second.invert) {
+ // Check for normal tests by seeing if any pgs have the flag
+ if (i.first & j.first) {
+ possible_responses.insert(j);
+ }
+ }
+ }
+ }
+
+ for (const auto &j : state_to_response) {
+ if (j.second.invert) {
+ // Check for inverted tests by seeing if not-all pgs have the flag
+ const auto &found = num_pg_by_state.find(j.first);
+ if (found == num_pg_by_state.end() || found->second != num_pg) {
+ possible_responses.insert(j);
+ }
+ }
+ }
+
+ utime_t cutoff = now - utime_t(cct->_conf.get_val<int64_t>("mon_pg_stuck_threshold"), 0);
+ // Loop over all PGs, if there are any possibly-unhealthy states in there
+ if (!possible_responses.empty()) {
+ for (const auto& i : pg_stat) {
+ const auto &pg_id = i.first;
+ const auto &pg_info = i.second;
+
+ for (const auto &j : state_to_response) {
+ const auto &pg_response_state = j.first;
+ const auto &pg_response = j.second;
+
+ // Apply the state test
+ if (!(bool(pg_info.state & pg_response_state) != pg_response.invert)) {
+ continue;
+ }
+
+ // Apply stuckness test if needed
+ if (pg_response.stuck_since) {
+ // Delayed response, check for stuckness
+ utime_t last_whatever = pg_response.stuck_since(pg_info);
+ if (last_whatever.is_zero() &&
+ pg_info.last_change >= cutoff) {
+ // still moving, ignore
+ continue;
+ } else if (last_whatever >= cutoff) {
+ // Not stuck enough, ignore.
+ continue;
+ } else {
+
+ }
+ }
+
+ auto &causes = detected[pg_response.consequence];
+ causes.states[pg_response_state]++;
+ causes.pgs.insert(pg_id);
+
+ // Don't bother composing detail string if we have already recorded
+ // too many
+ if (causes.pg_messages.size() > max) {
+ continue;
+ }
+
+ std::ostringstream ss;
+ if (pg_response.stuck_since) {
+ utime_t since = pg_response.stuck_since(pg_info);
+ ss << "pg " << pg_id << " is stuck " << state_name(pg_response_state);
+ if (since == utime_t()) {
+ ss << " since forever";
+ } else {
+ utime_t dur = now - since;
+ ss << " for " << utimespan_str(dur);
+ }
+ ss << ", current state " << pg_state_string(pg_info.state)
+ << ", last acting " << pg_vector_string(pg_info.acting);
+ } else {
+ ss << "pg " << pg_id << " is "
+ << pg_state_string(pg_info.state);
+ ss << ", acting " << pg_vector_string(pg_info.acting);
+ if (pg_info.stats.sum.num_objects_unfound) {
+ ss << ", " << pg_info.stats.sum.num_objects_unfound
+ << " unfound";
+ }
+ }
+
+ if (pg_info.state & PG_STATE_INCOMPLETE) {
+ const pg_pool_t *pi = osdmap.get_pg_pool(pg_id.pool());
+ if (pi && pi->min_size > 1) {
+ ss << " (reducing pool "
+ << osdmap.get_pool_name(pg_id.pool())
+ << " min_size from " << (int)pi->min_size
+ << " may help; search ceph.com/docs for 'incomplete')";
+ }
+ }
+
+ causes.pg_messages[pg_id] = ss.str();
+ }
+ }
+ } else {
+ dout(10) << __func__ << " skipping loop over PGs: counters look OK" << dendl;
+ }
+
+ for (const auto &i : detected) {
+ std::string health_code;
+ health_status_t sev;
+ std::string summary;
+ switch(i.first) {
+ case UNAVAILABLE:
+ health_code = "PG_AVAILABILITY";
+ sev = HEALTH_WARN;
+ summary = "Reduced data availability: ";
+ break;
+ case DEGRADED:
+ health_code = "PG_DEGRADED";
+ summary = "Degraded data redundancy: ";
+ sev = HEALTH_WARN;
+ break;
+ case BACKFILL_FULL:
+ health_code = "PG_BACKFILL_FULL";
+ summary = "Low space hindering backfill (add storage if this doesn't resolve itself): ";
+ sev = HEALTH_WARN;
+ break;
+ case DAMAGED:
+ health_code = "PG_DAMAGED";
+ summary = "Possible data damage: ";
+ sev = HEALTH_ERR;
+ break;
+ case RECOVERY_FULL:
+ health_code = "PG_RECOVERY_FULL";
+ summary = "Full OSDs blocking recovery: ";
+ sev = HEALTH_ERR;
+ break;
+ default:
+ ceph_abort();
+ }
+
+ if (i.first == DEGRADED) {
+ if (pg_sum.stats.sum.num_objects_degraded &&
+ pg_sum.stats.sum.num_object_copies > 0) {
+ double pc = (double)pg_sum.stats.sum.num_objects_degraded /
+ (double)pg_sum.stats.sum.num_object_copies * (double)100.0;
+ char b[20];
+ snprintf(b, sizeof(b), "%.3lf", pc);
+ ostringstream ss;
+ ss << pg_sum.stats.sum.num_objects_degraded
+ << "/" << pg_sum.stats.sum.num_object_copies << " objects degraded ("
+ << b << "%)";
+
+ // Throw in a comma for the benefit of the following PG counts
+ summary += ss.str() + ", ";
+ }
+ }
+
+ // Compose summary message saying how many PGs in what states led
+ // to this health check failing
+ std::vector<std::string> pg_msgs;
+ int64_t count = 0;
+ for (const auto &j : i.second.states) {
+ std::ostringstream msg;
+ msg << j.second << (j.second > 1 ? " pgs " : " pg ") << state_name(j.first);
+ pg_msgs.push_back(msg.str());
+ count += j.second;
+ }
+ summary += joinify(pg_msgs.begin(), pg_msgs.end(), std::string(", "));
+
+ health_check_t *check = &checks->add(
+ health_code,
+ sev,
+ summary,
+ count);
+
+ // Compose list of PGs contributing to this health check failing
+ for (const auto &j : i.second.pg_messages) {
+ check->detail.push_back(j.second);
+ }
+ }
+
+ // OSD_SCRUB_ERRORS
+ if (pg_sum.stats.sum.num_scrub_errors) {
+ ostringstream ss;
+ ss << pg_sum.stats.sum.num_scrub_errors << " scrub errors";
+ checks->add("OSD_SCRUB_ERRORS", HEALTH_ERR, ss.str(),
+ pg_sum.stats.sum.num_scrub_errors);
+ }
+
+ // LARGE_OMAP_OBJECTS
+ if (pg_sum.stats.sum.num_large_omap_objects) {
+ list<string> detail;
+ for (auto &pool : pools) {
+ const string& pool_name = osdmap.get_pool_name(pool.first);
+ auto it2 = pg_pool_sum.find(pool.first);
+ if (it2 == pg_pool_sum.end()) {
+ continue;
+ }
+ const pool_stat_t *pstat = &it2->second;
+ if (pstat == nullptr) {
+ continue;
+ }
+ const object_stat_sum_t& sum = pstat->stats.sum;
+ if (sum.num_large_omap_objects) {
+ stringstream ss;
+ ss << sum.num_large_omap_objects << " large objects found in pool "
+ << "'" << pool_name << "'";
+ detail.push_back(ss.str());
+ }
+ }
+ if (!detail.empty()) {
+ ostringstream ss;
+ ss << pg_sum.stats.sum.num_large_omap_objects << " large omap objects";
+ auto& d = checks->add("LARGE_OMAP_OBJECTS", HEALTH_WARN, ss.str(),
+ pg_sum.stats.sum.num_large_omap_objects);
+ stringstream tip;
+ tip << "Search the cluster log for 'Large omap object found' for more "
+ << "details.";
+ detail.push_back(tip.str());
+ d.detail.swap(detail);
+ }
+ }
+
+ // CACHE_POOL_NEAR_FULL
+ {
+ list<string> detail;
+ unsigned num_pools = 0;
+ for (auto& p : pools) {
+ if ((!p.second.target_max_objects && !p.second.target_max_bytes) ||
+ !pg_pool_sum.count(p.first)) {
+ continue;
+ }
+ bool nearfull = false;
+ const string& name = osdmap.get_pool_name(p.first);
+ const pool_stat_t& st = get_pg_pool_sum_stat(p.first);
+ uint64_t ratio = p.second.cache_target_full_ratio_micro +
+ ((1000000 - p.second.cache_target_full_ratio_micro) *
+ cct->_conf->mon_cache_target_full_warn_ratio);
+ if (p.second.target_max_objects &&
+ (uint64_t)(st.stats.sum.num_objects -
+ st.stats.sum.num_objects_hit_set_archive) >
+ p.second.target_max_objects * (ratio / 1000000.0)) {
+ ostringstream ss;
+ ss << "cache pool '" << name << "' with "
+ << si_u_t(st.stats.sum.num_objects)
+ << " objects at/near target max "
+ << si_u_t(p.second.target_max_objects) << " objects";
+ detail.push_back(ss.str());
+ nearfull = true;
+ }
+ if (p.second.target_max_bytes &&
+ (uint64_t)(st.stats.sum.num_bytes -
+ st.stats.sum.num_bytes_hit_set_archive) >
+ p.second.target_max_bytes * (ratio / 1000000.0)) {
+ ostringstream ss;
+ ss << "cache pool '" << name
+ << "' with " << byte_u_t(st.stats.sum.num_bytes)
+ << " at/near target max "
+ << byte_u_t(p.second.target_max_bytes);
+ detail.push_back(ss.str());
+ nearfull = true;
+ }
+ if (nearfull) {
+ ++num_pools;
+ }
+ }
+ if (!detail.empty()) {
+ ostringstream ss;
+ ss << num_pools << " cache pools at or near target size";
+ auto& d = checks->add("CACHE_POOL_NEAR_FULL", HEALTH_WARN, ss.str(),
+ num_pools);
+ d.detail.swap(detail);
+ }
+ }
+
+ // TOO_FEW_PGS
+ unsigned num_in = osdmap.get_num_in_osds();
+ auto sum_pg_up = std::max(static_cast<size_t>(pg_sum.up), pg_stat.size());
+ const auto min_pg_per_osd =
+ cct->_conf.get_val<uint64_t>("mon_pg_warn_min_per_osd");
+ if (num_in && min_pg_per_osd > 0 && osdmap.get_pools().size() > 0) {
+ auto per = sum_pg_up / num_in;
+ if (per < min_pg_per_osd && per) {
+ ostringstream ss;
+ ss << "too few PGs per OSD (" << per
+ << " < min " << min_pg_per_osd << ")";
+ checks->add("TOO_FEW_PGS", HEALTH_WARN, ss.str(),
+ min_pg_per_osd - per);
+ }
+ }
+
+ // TOO_MANY_PGS
+ auto max_pg_per_osd = cct->_conf.get_val<uint64_t>("mon_max_pg_per_osd");
+ if (num_in && max_pg_per_osd > 0) {
+ auto per = sum_pg_up / num_in;
+ if (per > max_pg_per_osd) {
+ ostringstream ss;
+ ss << "too many PGs per OSD (" << per
+ << " > max " << max_pg_per_osd << ")";
+ checks->add("TOO_MANY_PGS", HEALTH_WARN, ss.str(),
+ per - max_pg_per_osd);
+ }
+ }
+
+ // TOO_FEW_OSDS
+ auto warn_too_few_osds = cct->_conf.get_val<bool>("mon_warn_on_too_few_osds");
+ auto osd_pool_default_size = cct->_conf.get_val<uint64_t>("osd_pool_default_size");
+ if (warn_too_few_osds && osdmap.get_num_osds() < osd_pool_default_size) {
+ ostringstream ss;
+ ss << "OSD count " << osdmap.get_num_osds()
+ << " < osd_pool_default_size " << osd_pool_default_size;
+ checks->add("TOO_FEW_OSDS", HEALTH_WARN, ss.str(),
+ osd_pool_default_size - osdmap.get_num_osds());
+ }
+
+ // SLOW_PING_TIME
+ // Convert milliseconds to microseconds
+ auto warn_slow_ping_time = cct->_conf.get_val<double>("mon_warn_on_slow_ping_time") * 1000;
+ auto grace = cct->_conf.get_val<int64_t>("osd_heartbeat_grace");
+ if (warn_slow_ping_time == 0) {
+ double ratio = cct->_conf.get_val<double>("mon_warn_on_slow_ping_ratio");
+ warn_slow_ping_time = grace;
+ warn_slow_ping_time *= 1000000 * ratio; // Seconds of grace to microseconds at ratio
+ }
+ if (warn_slow_ping_time > 0) {
+
+ // One slow heartbeat observation for an ordered pair of OSDs (from -> to).
+ // Instances are stored in std::set, so operator< defines both the sort
+ // order and uniqueness: items are ordered by pingtime, then from, then to.
+ // `improving` is payload only and never takes part in the comparison.
+ struct mon_ping_item_t {
+   uint32_t pingtime;
+   int from;
+   int to;
+   bool improving;
+
+   // Lexicographic comparison on (pingtime, from, to).
+   bool operator<(const mon_ping_item_t& rhs) const {
+     if (pingtime != rhs.pingtime) {
+       return pingtime < rhs.pingtime;
+     }
+     if (from != rhs.from) {
+       return from < rhs.from;
+     }
+     return to < rhs.to;
+   }
+ };
+
+ list<string> detail_back;
+ list<string> detail_front;
+ list<string> detail;
+ set<mon_ping_item_t> back_sorted, front_sorted;
+ // Walk every OSD's heartbeat peers and collect the pairs whose worst
+ // recorded ping time exceeds warn_slow_ping_time, separately for the back
+ // and front interfaces. Also collect OSDs whose num_shards_repaired is
+ // above mon_osd_warn_num_repaired.
+ // Entries are bound by const reference: nothing here mutates them, and
+ // iterating by value would copy each map entry on every pass.
+ for (const auto& i : osd_stat) {
+   for (const auto& j : i.second.hb_pingtime) {
+
+     // Maybe source info is old
+     if (now.sec() - j.second.last_update > grace * 60)
+       continue;
+
+     // Back interface: rank by the largest of the three recorded values.
+     mon_ping_item_t back;
+     back.pingtime = std::max(j.second.back_pingtime[0], j.second.back_pingtime[1]);
+     back.pingtime = std::max(back.pingtime, j.second.back_pingtime[2]);
+     back.from = i.first;
+     back.to = j.first;
+     if (back.pingtime > warn_slow_ping_time) {
+       // "improving" means the three slots are strictly increasing from
+       // slot 0 to slot 2 (presumably slot 0 is the most recent window --
+       // confirm against the osd_stat_t definition).
+       back.improving = (j.second.back_pingtime[0] < j.second.back_pingtime[1]
+                         && j.second.back_pingtime[1] < j.second.back_pingtime[2]);
+       back_sorted.emplace(back);
+     }
+
+     // Front interface: same logic, using only front_pingtime values.
+     mon_ping_item_t front;
+     front.pingtime = std::max(j.second.front_pingtime[0], j.second.front_pingtime[1]);
+     front.pingtime = std::max(front.pingtime, j.second.front_pingtime[2]);
+     front.from = i.first;
+     front.to = j.first;
+     if (front.pingtime > warn_slow_ping_time) {
+       // Compare against front_pingtime[2]; the previous code used
+       // back_pingtime[2] here, mixing the two interfaces.
+       front.improving = (j.second.front_pingtime[0] < j.second.front_pingtime[1]
+                          && j.second.front_pingtime[1] < j.second.front_pingtime[2]);
+       front_sorted.emplace(front);
+     }
+   }
+   if (i.second.num_shards_repaired >
+       cct->_conf.get_val<uint64_t>("mon_osd_warn_num_repaired")) {
+     ostringstream ss;
+     ss << "osd." << i.first << " had " << i.second.num_shards_repaired << " reads repaired";
+     detail.push_back(ss.str());
+   }
+ }
+ if (!detail.empty()) {
+ ostringstream ss;
+ ss << "Too many repaired reads on " << detail.size() << " OSDs";
+ auto& d = checks->add("OSD_TOO_MANY_REPAIRS", HEALTH_WARN, ss.str(),
+ detail.size());
+ d.detail.swap(detail);
+ }
+ int max_detail = 10;
+ for (auto &sback : boost::adaptors::reverse(back_sorted)) {
+ ostringstream ss;
+ if (max_detail == 0) {
+ ss << "Truncated long network list. Use ceph daemon mgr.# dump_osd_network for more information";
+ detail_back.push_back(ss.str());
+ break;
+ }
+ max_detail--;
+ ss << "Slow OSD heartbeats on back from osd." << sback.from
+ << " [" << osd_parentage(osdmap, sback.from) << "]"
+ << (osdmap.is_down(sback.from) ? " (down)" : "")
+ << " to osd." << sback.to
+ << " [" << osd_parentage(osdmap, sback.to) << "]"
+ << (osdmap.is_down(sback.to) ? " (down)" : "")
+ << " " << fixed_u_to_string(sback.pingtime, 3) << " msec"
+ << (sback.improving ? " possibly improving" : "");
+ detail_back.push_back(ss.str());
+ }
+ max_detail = 10;
+ for (auto &sfront : boost::adaptors::reverse(front_sorted)) {
+ ostringstream ss;
+ if (max_detail == 0) {
+ ss << "Truncated long network list. Use ceph daemon mgr.# dump_osd_network for more information";
+ detail_front.push_back(ss.str());
+ break;
+ }
+ max_detail--;
+ // Get crush parentage for each osd
+ ss << "Slow OSD heartbeats on front from osd." << sfront.from
+ << " [" << osd_parentage(osdmap, sfront.from) << "]"
+ << (osdmap.is_down(sfront.from) ? " (down)" : "")
+ << " to osd." << sfront.to
+ << " [" << osd_parentage(osdmap, sfront.to) << "]"
+ << (osdmap.is_down(sfront.to) ? " (down)" : "")
+ << " " << fixed_u_to_string(sfront.pingtime, 3) << " msec"
+ << (sfront.improving ? " possibly improving" : "");
+ detail_front.push_back(ss.str());
+ }
+ if (detail_back.size() != 0) {
+ ostringstream ss;
+ ss << "Slow OSD heartbeats on back (longest "
+ << fixed_u_to_string(back_sorted.rbegin()->pingtime, 3) << "ms)";
+ auto& d = checks->add("OSD_SLOW_PING_TIME_BACK", HEALTH_WARN, ss.str(),
+ back_sorted.size());
+ d.detail.swap(detail_back);
+ }
+ if (detail_front.size() != 0) {
+ ostringstream ss;
+ ss << "Slow OSD heartbeats on front (longest "
+ << fixed_u_to_string(front_sorted.rbegin()->pingtime, 3) << "ms)";
+ auto& d = checks->add("OSD_SLOW_PING_TIME_FRONT", HEALTH_WARN, ss.str(),
+ front_sorted.size());
+ d.detail.swap(detail_front);
+ }
+ }
+
+ // SMALLER_PGP_NUM
+ // MANY_OBJECTS_PER_PG
+ if (!pg_stat.empty()) {
+ list<string> pgp_detail, many_detail;
+ const auto mon_pg_warn_min_objects =
+ cct->_conf.get_val<int64_t>("mon_pg_warn_min_objects");
+ const auto mon_pg_warn_min_pool_objects =
+ cct->_conf.get_val<int64_t>("mon_pg_warn_min_pool_objects");
+ const auto mon_pg_warn_max_object_skew =
+ cct->_conf.get_val<double>("mon_pg_warn_max_object_skew");
+ for (auto p = pg_pool_sum.begin();
+ p != pg_pool_sum.end();
+ ++p) {
+ const pg_pool_t *pi = osdmap.get_pg_pool(p->first);
+ if (!pi)
+ continue; // in case osdmap changes haven't propagated to PGMap yet
+ const string& name = osdmap.get_pool_name(p->first);
+ // NOTE: we use pg_num_target and pgp_num_target for the purposes of
+ // the warnings. If the cluster is failing to converge on the target
+ // values that is a separate issue!
+ if (pi->get_pg_num_target() > pi->get_pgp_num_target() &&
+ !(name.find(".DELETED") != string::npos &&
+ cct->_conf->mon_fake_pool_delete)) {
+ ostringstream ss;
+ ss << "pool " << name << " pg_num "
+ << pi->get_pg_num_target()
+ << " > pgp_num " << pi->get_pgp_num_target();
+ pgp_detail.push_back(ss.str());
+ }
+ int average_objects_per_pg = pg_sum.stats.sum.num_objects / pg_stat.size();
+ if (average_objects_per_pg > 0 &&
+ pg_sum.stats.sum.num_objects >= mon_pg_warn_min_objects &&
+ p->second.stats.sum.num_objects >= mon_pg_warn_min_pool_objects) {
+ int objects_per_pg = p->second.stats.sum.num_objects /
+ pi->get_pg_num_target();
+ float ratio = (float)objects_per_pg / (float)average_objects_per_pg;
+ if (mon_pg_warn_max_object_skew > 0 &&
+ ratio > mon_pg_warn_max_object_skew) {
+ ostringstream ss;
+ if (pi->pg_autoscale_mode != pg_pool_t::pg_autoscale_mode_t::ON) {
+ ss << "pool " << name << " objects per pg ("
+ << objects_per_pg << ") is more than " << ratio
+ << " times cluster average ("
+ << average_objects_per_pg << ")";
+ many_detail.push_back(ss.str());
+ }
+ }
+ }
+ }
+ if (!pgp_detail.empty()) {
+ ostringstream ss;
+ ss << pgp_detail.size() << " pools have pg_num > pgp_num";
+ auto& d = checks->add("SMALLER_PGP_NUM", HEALTH_WARN, ss.str(),
+ pgp_detail.size());
+ d.detail.swap(pgp_detail);
+ }
+ if (!many_detail.empty()) {
+ ostringstream ss;
+ ss << many_detail.size() << " pools have many more objects per pg than"
+ << " average";
+ auto& d = checks->add("MANY_OBJECTS_PER_PG", HEALTH_WARN, ss.str(),
+ many_detail.size());
+ d.detail.swap(many_detail);
+ }
+ }
+
+ // POOL_FULL
+ // POOL_NEAR_FULL
+ {
+ float warn_threshold = (float)g_conf().get_val<int64_t>("mon_pool_quota_warn_threshold")/100;
+ float crit_threshold = (float)g_conf().get_val<int64_t>("mon_pool_quota_crit_threshold")/100;
+ list<string> full_detail, nearfull_detail;
+ unsigned full_pools = 0, nearfull_pools = 0;
+ for (auto it : pools) {
+ auto it2 = pg_pool_sum.find(it.first);
+ if (it2 == pg_pool_sum.end()) {
+ continue;
+ }
+ const pool_stat_t *pstat = &it2->second;
+ const object_stat_sum_t& sum = pstat->stats.sum;
+ const string& pool_name = osdmap.get_pool_name(it.first);
+ const pg_pool_t &pool = it.second;
+ bool full = false, nearfull = false;
+ if (pool.quota_max_objects > 0) {
+ stringstream ss;
+ if ((uint64_t)sum.num_objects >= pool.quota_max_objects) {
+ } else if (crit_threshold > 0 &&
+ sum.num_objects >= pool.quota_max_objects*crit_threshold) {
+ ss << "pool '" << pool_name
+ << "' has " << sum.num_objects << " objects"
+ << " (max " << pool.quota_max_objects << ")";
+ full_detail.push_back(ss.str());
+ full = true;
+ } else if (warn_threshold > 0 &&
+ sum.num_objects >= pool.quota_max_objects*warn_threshold) {
+ ss << "pool '" << pool_name
+ << "' has " << sum.num_objects << " objects"
+ << " (max " << pool.quota_max_objects << ")";
+ nearfull_detail.push_back(ss.str());
+ nearfull = true;
+ }
+ }
+ if (pool.quota_max_bytes > 0) {
+ stringstream ss;
+ if ((uint64_t)sum.num_bytes >= pool.quota_max_bytes) {
+ } else if (crit_threshold > 0 &&
+ sum.num_bytes >= pool.quota_max_bytes*crit_threshold) {
+ ss << "pool '" << pool_name
+ << "' has " << byte_u_t(sum.num_bytes)
+ << " (max " << byte_u_t(pool.quota_max_bytes) << ")";
+ full_detail.push_back(ss.str());
+ full = true;
+ } else if (warn_threshold > 0 &&
+ sum.num_bytes >= pool.quota_max_bytes*warn_threshold) {
+ ss << "pool '" << pool_name
+ << "' has " << byte_u_t(sum.num_bytes)
+ << " (max " << byte_u_t(pool.quota_max_bytes) << ")";
+ nearfull_detail.push_back(ss.str());
+ nearfull = true;
+ }
+ }
+ if (full) {
+ ++full_pools;
+ }
+ if (nearfull) {
+ ++nearfull_pools;
+ }
+ }
+ if (full_pools) {
+ ostringstream ss;
+ ss << full_pools << " pools full";
+ auto& d = checks->add("POOL_FULL", HEALTH_ERR, ss.str(), full_pools);
+ d.detail.swap(full_detail);
+ }
+ if (nearfull_pools) {
+ ostringstream ss;
+ ss << nearfull_pools << " pools nearfull";
+ auto& d = checks->add("POOL_NEAR_FULL", HEALTH_WARN, ss.str(), nearfull_pools);
+ d.detail.swap(nearfull_detail);
+ }
+ }
+
+ // OBJECT_MISPLACED
+ if (pg_sum.stats.sum.num_objects_misplaced &&
+ pg_sum.stats.sum.num_object_copies > 0 &&
+ cct->_conf->mon_warn_on_misplaced) {
+ double pc = (double)pg_sum.stats.sum.num_objects_misplaced /
+ (double)pg_sum.stats.sum.num_object_copies * (double)100.0;
+ char b[20];
+ snprintf(b, sizeof(b), "%.3lf", pc);
+ ostringstream ss;
+ ss << pg_sum.stats.sum.num_objects_misplaced
+ << "/" << pg_sum.stats.sum.num_object_copies << " objects misplaced ("
+ << b << "%)";
+ checks->add("OBJECT_MISPLACED", HEALTH_WARN, ss.str(),
+ pg_sum.stats.sum.num_objects_misplaced);
+ }
+
+ // OBJECT_UNFOUND
+ if (pg_sum.stats.sum.num_objects_unfound &&
+ pg_sum.stats.sum.num_objects) {
+ double pc = (double)pg_sum.stats.sum.num_objects_unfound /
+ (double)pg_sum.stats.sum.num_objects * (double)100.0;
+ char b[20];
+ snprintf(b, sizeof(b), "%.3lf", pc);
+ ostringstream ss;
+ ss << pg_sum.stats.sum.num_objects_unfound
+ << "/" << pg_sum.stats.sum.num_objects << " objects unfound (" << b << "%)";
+ auto& d = checks->add("OBJECT_UNFOUND", HEALTH_WARN, ss.str(),
+ pg_sum.stats.sum.num_objects_unfound);
+
+ for (auto& p : pg_stat) {
+ if (p.second.stats.sum.num_objects_unfound) {
+ ostringstream ss;
+ ss << "pg " << p.first
+ << " has " << p.second.stats.sum.num_objects_unfound
+ << " unfound objects";
+ d.detail.push_back(ss.str());
+ if (d.detail.size() > max) {
+ d.detail.push_back("(additional pgs left out for brevity)");
+ break;
+ }
+ }
+ }
+ }
+
+ // REQUEST_SLOW
+ // REQUEST_STUCK
+ // SLOW_OPS unifies them in mimic.
+ if (osdmap.require_osd_release < ceph_release_t::mimic &&
+ cct->_conf->mon_osd_warn_op_age > 0 &&
+ !osd_sum.op_queue_age_hist.h.empty() &&
+ osd_sum.op_queue_age_hist.upper_bound() / 1000.0 >
+ cct->_conf->mon_osd_warn_op_age) {
+ list<string> warn_detail, error_detail;
+ unsigned warn = 0, error = 0;
+ float err_age =
+ cct->_conf->mon_osd_warn_op_age * cct->_conf->mon_osd_err_op_age_ratio;
+ const pow2_hist_t& h = osd_sum.op_queue_age_hist;
+ for (unsigned i = h.h.size() - 1; i > 0; --i) {
+ float ub = (float)(1 << i) / 1000.0;
+ if (ub < cct->_conf->mon_osd_warn_op_age)
+ break;
+ if (h.h[i]) {
+ ostringstream ss;
+ ss << h.h[i] << " ops are blocked > " << ub << " sec";
+ if (ub > err_age) {
+ error += h.h[i];
+ error_detail.push_back(ss.str());
+ } else {
+ warn += h.h[i];
+ warn_detail.push_back(ss.str());
+ }
+ }
+ }
+
+ map<float,set<int>> warn_osd_by_max; // max -> osds
+ map<float,set<int>> error_osd_by_max; // max -> osds
+ if (!warn_detail.empty() || !error_detail.empty()) {
+ for (auto& p : osd_stat) {
+ const pow2_hist_t& h = p.second.op_queue_age_hist;
+ for (unsigned i = h.h.size() - 1; i > 0; --i) {
+ float ub = (float)(1 << i) / 1000.0;
+ if (ub < cct->_conf->mon_osd_warn_op_age)
+ break;
+ if (h.h[i]) {
+ if (ub > err_age) {
+ error_osd_by_max[ub].insert(p.first);
+ } else {
+ warn_osd_by_max[ub].insert(p.first);
+ }
+ break;
+ }
+ }
+ }
+ }
+
+ if (!warn_detail.empty()) {
+ ostringstream ss;
+ ss << warn << " slow requests are blocked > "
+ << cct->_conf->mon_osd_warn_op_age << " sec";
+ auto& d = checks->add("REQUEST_SLOW", HEALTH_WARN, ss.str(), warn);
+ d.detail.swap(warn_detail);
+ int left = max;
+ for (auto& p : warn_osd_by_max) {
+ ostringstream ss;
+ if (p.second.size() > 1) {
+ ss << "osds " << p.second
+ << " have blocked requests > " << p.first << " sec";
+ } else {
+ ss << "osd." << *p.second.begin()
+ << " has blocked requests > " << p.first << " sec";
+ }
+ d.detail.push_back(ss.str());
+ if (--left == 0) {
+ break;
+ }
+ }
+ }
+ if (!error_detail.empty()) {
+ ostringstream ss;
+ ss << error << " stuck requests are blocked > "
+ << err_age << " sec";
+ auto& d = checks->add("REQUEST_STUCK", HEALTH_ERR, ss.str(), error);
+ d.detail.swap(error_detail);
+ int left = max;
+ for (auto& p : error_osd_by_max) {
+ ostringstream ss;
+ if (p.second.size() > 1) {
+ ss << "osds " << p.second
+ << " have stuck requests > " << p.first << " sec";
+ } else {
+ ss << "osd." << *p.second.begin()
+ << " has stuck requests > " << p.first << " sec";
+ }
+ d.detail.push_back(ss.str());
+ if (--left == 0) {
+ break;
+ }
+ }
+ }
+ }
+
+ // OBJECT_STORE_WARN
+ if (osd_sum.os_alerts.size()) {
+ map<string, pair<size_t, list<string>>> os_alerts_sum;
+
+ for (auto& a : osd_sum.os_alerts) {
+ int left = max;
+ string s0 = " osd.";
+ s0 += stringify(a.first);
+ for (auto& aa : a.second) {
+ string s(s0);
+ s += " ";
+ s += aa.second;
+ auto it = os_alerts_sum.find(aa.first);
+ if (it == os_alerts_sum.end()) {
+ list<string> d;
+ d.emplace_back(s);
+ os_alerts_sum.emplace(aa.first, std::make_pair(1, d));
+ } else {
+ auto& p = it->second;
+ ++p.first;
+ p.second.emplace_back(s);
+ }
+ if (--left == 0) {
+ break;
+ }
+ }
+ }
+
+ for (auto& asum : os_alerts_sum) {
+ string summary = stringify(asum.second.first) + " OSD(s)";
+ if (asum.first == "BLUEFS_SPILLOVER") {
+ summary += " experiencing BlueFS spillover";
+ } else if (asum.first == "BLUESTORE_NO_COMPRESSION") {
+ summary += " have broken BlueStore compression";
+ } else if (asum.first == "BLUESTORE_LEGACY_STATFS") {
+ summary += " reporting legacy (not per-pool) BlueStore stats";
+ } else if (asum.first == "BLUESTORE_DISK_SIZE_MISMATCH") {
+ summary += " have dangerous mismatch between BlueStore block device and free list sizes";
+ } else if (asum.first == "BLUESTORE_NO_PER_PG_OMAP") {
+ summary += " reporting legacy (not per-pg) BlueStore omap";
+ } else if (asum.first == "BLUESTORE_NO_PER_POOL_OMAP") {
+ summary += " reporting legacy (not per-pool) BlueStore omap usage stats";
+ } else if (asum.first == "BLUESTORE_SPURIOUS_READ_ERRORS") {
+ summary += " have spurious read errors";
+ }
+
+ auto& d = checks->add(asum.first, HEALTH_WARN, summary, asum.second.first);
+ for (auto& s : asum.second.second) {
+ d.detail.push_back(s);
+ }
+ }
+ }
+ // PG_NOT_SCRUBBED
+ // PG_NOT_DEEP_SCRUBBED
+ if (cct->_conf->mon_warn_pg_not_scrubbed_ratio ||
+ cct->_conf->mon_warn_pg_not_deep_scrubbed_ratio) {
+ list<string> detail, deep_detail;
+ int detail_max = max, deep_detail_max = max;
+ int detail_more = 0, deep_detail_more = 0;
+ int detail_total = 0, deep_detail_total = 0;
+ for (auto& p : pg_stat) {
+ int64_t pnum = p.first.pool();
+ auto pool = osdmap.get_pg_pool(pnum);
+ if (!pool)
+ continue;
+ if (cct->_conf->mon_warn_pg_not_scrubbed_ratio) {
+ double scrub_max_interval = 0;
+ pool->opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &scrub_max_interval);
+ if (scrub_max_interval <= 0) {
+ scrub_max_interval = cct->_conf->osd_scrub_max_interval;
+ }
+ const double age = (cct->_conf->mon_warn_pg_not_scrubbed_ratio * scrub_max_interval) +
+ scrub_max_interval;
+ utime_t cutoff = now;
+ cutoff -= age;
+ if (p.second.last_scrub_stamp < cutoff) {
+ if (detail_max > 0) {
+ ostringstream ss;
+ ss << "pg " << p.first << " not scrubbed since "
+ << p.second.last_scrub_stamp;
+ detail.push_back(ss.str());
+ --detail_max;
+ } else {
+ ++detail_more;
+ }
+ ++detail_total;
+ }
+ }
+ if (cct->_conf->mon_warn_pg_not_deep_scrubbed_ratio) {
+ double deep_scrub_interval = 0;
+ pool->opts.get(pool_opts_t::DEEP_SCRUB_INTERVAL, &deep_scrub_interval);
+ if (deep_scrub_interval <= 0) {
+ deep_scrub_interval = cct->_conf->osd_deep_scrub_interval;
+ }
+ double deep_age = (cct->_conf->mon_warn_pg_not_deep_scrubbed_ratio * deep_scrub_interval) +
+ deep_scrub_interval;
+ utime_t deep_cutoff = now;
+ deep_cutoff -= deep_age;
+ if (p.second.last_deep_scrub_stamp < deep_cutoff) {
+ if (deep_detail_max > 0) {
+ ostringstream ss;
+ ss << "pg " << p.first << " not deep-scrubbed since "
+ << p.second.last_deep_scrub_stamp;
+ deep_detail.push_back(ss.str());
+ --deep_detail_max;
+ } else {
+ ++deep_detail_more;
+ }
+ ++deep_detail_total;
+ }
+ }
+ }
+ if (detail_total) {
+ ostringstream ss;
+ ss << detail_total << " pgs not scrubbed in time";
+ auto& d = checks->add("PG_NOT_SCRUBBED", HEALTH_WARN, ss.str(), detail_total);
+
+ if (!detail.empty()) {
+ d.detail.swap(detail);
+
+ if (detail_more) {
+ ostringstream ss;
+ ss << detail_more << " more pgs... ";
+ d.detail.push_back(ss.str());
+ }
+ }
+ }
+ if (deep_detail_total) {
+ ostringstream ss;
+ ss << deep_detail_total << " pgs not deep-scrubbed in time";
+ auto& d = checks->add("PG_NOT_DEEP_SCRUBBED", HEALTH_WARN, ss.str(),
+ deep_detail_total);
+
+ if (!deep_detail.empty()) {
+ d.detail.swap(deep_detail);
+
+ if (deep_detail_more) {
+ ostringstream ss;
+ ss << deep_detail_more << " more pgs... ";
+ d.detail.push_back(ss.str());
+ }
+ }
+ }
+ }
+
+ // POOL_APP
+ if (g_conf().get_val<bool>("mon_warn_on_pool_no_app")) {
+ list<string> detail;
+ for (auto &it : pools) {
+ const pg_pool_t &pool = it.second;
+ const string& pool_name = osdmap.get_pool_name(it.first);
+ // application metadata is not encoded until luminous is minimum
+ // required release
+ if (pool.application_metadata.empty() && !pool.is_tier()) {
+ stringstream ss;
+ ss << "application not enabled on pool '" << pool_name << "'";
+ detail.push_back(ss.str());
+ }
+ }
+ if (!detail.empty()) {
+ ostringstream ss;
+ ss << detail.size() << " pool(s) do not have an application enabled";
+ auto& d = checks->add("POOL_APP_NOT_ENABLED", HEALTH_WARN, ss.str(),
+ detail.size());
+ stringstream tip;
+ tip << "use 'ceph osd pool application enable <pool-name> "
+ << "<app-name>', where <app-name> is 'cephfs', 'rbd', 'rgw', "
+ << "or freeform for custom applications.";
+ detail.push_back(tip.str());
+ d.detail.swap(detail);
+ }
+ }
+
+ // PG_SLOW_SNAP_TRIMMING
+ if (!pg_stat.empty() && cct->_conf->mon_osd_snap_trim_queue_warn_on > 0) {
+ uint32_t snapthreshold = cct->_conf->mon_osd_snap_trim_queue_warn_on;
+ uint64_t snaptrimq_exceeded = 0;
+ uint32_t longest_queue = 0;
+ const pg_t* longest_q_pg = nullptr;
+ list<string> detail;
+
+ for (auto& i: pg_stat) {
+ uint32_t current_len = i.second.snaptrimq_len;
+ if (current_len >= snapthreshold) {
+ snaptrimq_exceeded++;
+ if (longest_queue <= current_len) {
+ longest_q_pg = &i.first;
+ longest_queue = current_len;
+ }
+ if (detail.size() < max - 1) {
+ stringstream ss;
+ ss << "snap trim queue for pg " << i.first << " at " << current_len;
+ detail.push_back(ss.str());
+ continue;
+ }
+ if (detail.size() < max) {
+ detail.push_back("...more pgs affected");
+ continue;
+ }
+ }
+ }
+
+ if (snaptrimq_exceeded) {
+ {
+ ostringstream ss;
+ ss << "longest queue on pg " << *longest_q_pg << " at " << longest_queue;
+ detail.push_back(ss.str());
+ }
+
+ stringstream ss;
+ ss << "snap trim queue for " << snaptrimq_exceeded << " pg(s) >= " << snapthreshold << " (mon_osd_snap_trim_queue_warn_on)";
+ auto& d = checks->add("PG_SLOW_SNAP_TRIMMING", HEALTH_WARN, ss.str(),
+ snaptrimq_exceeded);
+ detail.push_back("try decreasing \"osd snap trim sleep\" and/or increasing \"osd pg max concurrent snap trims\".");
+ d.detail.swap(detail);
+ }
+ }
+}
+
+// Print the PG map summary.
+//
+// When a formatter is supplied, first emit the "pgs_by_pool_state" array:
+// one object per entry of num_pg_by_pool_state, carrying the pool id and a
+// nested array of (state_name, count) objects. Then, with or without a
+// formatter, hand over to PGMapDigest::print_summary with the same
+// arguments.
+void PGMap::print_summary(ceph::Formatter *f, ostream *out) const
+{
+  if (f != nullptr) {
+    f->open_array_section("pgs_by_pool_state");
+    for (const auto& pool_entry : num_pg_by_pool_state) {
+      const auto& state_counts = pool_entry.second;
+
+      f->open_object_section("per_pool_pgs_by_state");
+      f->dump_int("pool_id", pool_entry.first);
+
+      f->open_array_section("pg_state_counts");
+      for (const auto& state_entry : state_counts) {
+        f->open_object_section("pg_state_count");
+        f->dump_string("state_name", pg_state_string(state_entry.first));
+        f->dump_int("count", state_entry.second);
+        f->close_section();  // pg_state_count
+      }
+      f->close_section();  // pg_state_counts
+
+      f->close_section();  // per_pool_pgs_by_state
+    }
+    f->close_section();  // pgs_by_pool_state
+  }
+
+  PGMapDigest::print_summary(f, out);
+}
+
+/**
+ * Execute one read-only pg/osd statistics command against a PGMap.
+ *
+ * Recognized prefixes: "pg stat", "pg getmap", "pg dump" (plus the aliases
+ * "pg dump_json" and "pg dump_pools_json"), "pg ls" (plus "pg ls-by-primary",
+ * "pg ls-by-osd", "pg ls-by-pool"), "pg dump_stuck", "pg debug", "osd perf"
+ * and "osd blocked-by".
+ *
+ * @param orig_prefix command name as received
+ * @param orig_cmdmap parsed arguments; copied locally so aliases can inject values
+ * @param f           optional formatter; when null plain text is produced
+ * @param ss          receives the short status / error message
+ * @param odata       receives the command payload
+ * @return 0 on success, -ENOENT for an unknown pool, -EAGAIN when the
+ *         requested osd is not up, -EINVAL for an invalid pg state name,
+ *         -EOPNOTSUPP when the command (or "pg debug" op) is not handled here
+ */
+int process_pg_map_command(
+ const string& orig_prefix,
+ const cmdmap_t& orig_cmdmap,
+ const PGMap& pg_map,
+ const OSDMap& osdmap,
+ ceph::Formatter *f,
+ stringstream *ss,
+ bufferlist *odata)
+{
+ string prefix = orig_prefix;
+ auto cmdmap = orig_cmdmap;
+
+ string omap_stats_note =
+ "\n* NOTE: Omap statistics are gathered during deep scrub and "
+ "may be inaccurate soon afterwards depending on utilization. See "
+ "http://docs.ceph.com/en/latest/dev/placement-group/#omap-statistics "
+ "for further details.\n";
+ bool omap_stats_note_required = false;
+
+ // perhaps these would be better in the parsing, but it's weird
+ // Rewrite alias commands into their canonical form ("pg dump" / "pg ls"),
+ // injecting the argument the alias implies into the local cmdmap copy.
+ bool primary = false;
+ if (prefix == "pg dump_json") {
+ vector<string> v;
+ v.push_back(string("all"));
+ cmd_putval(g_ceph_context, cmdmap, "dumpcontents", v);
+ prefix = "pg dump";
+ } else if (prefix == "pg dump_pools_json") {
+ vector<string> v;
+ v.push_back(string("pools"));
+ cmd_putval(g_ceph_context, cmdmap, "dumpcontents", v);
+ prefix = "pg dump";
+ } else if (prefix == "pg ls-by-primary") {
+ primary = true;
+ prefix = "pg ls";
+ } else if (prefix == "pg ls-by-osd") {
+ prefix = "pg ls";
+ } else if (prefix == "pg ls-by-pool") {
+ prefix = "pg ls";
+ // translate the pool name into the numeric "pool" argument "pg ls" reads
+ string poolstr;
+ cmd_getval(cmdmap, "poolstr", poolstr);
+ int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
+ if (pool < 0) {
+ *ss << "pool " << poolstr << " does not exist";
+ return -ENOENT;
+ }
+ cmd_putval(g_ceph_context, cmdmap, "pool", pool);
+ }
+
+ // scratch stream for plain-text output; appended to odata by each handler
+ stringstream ds;
+ if (prefix == "pg stat") {
+ if (f) {
+ f->open_object_section("pg_summary");
+ pg_map.print_oneline_summary(f, NULL);
+ f->close_section();
+ f->flush(ds);
+ } else {
+ ds << pg_map;
+ }
+ odata->append(ds);
+ return 0;
+ }
+
+ if (prefix == "pg getmap") {
+ // payload is the encoded PGMap itself
+ pg_map.encode(*odata);
+ *ss << "got pgmap version " << pg_map.version;
+ return 0;
+ }
+
+ if (prefix == "pg dump") {
+ string val;
+ vector<string> dumpcontents;
+ set<string> what;
+ if (cmd_getval(cmdmap, "dumpcontents", dumpcontents)) {
+ copy(dumpcontents.begin(), dumpcontents.end(),
+ inserter(what, what.end()));
+ }
+ // no explicit selection means dump everything
+ if (what.empty())
+ what.insert("all");
+ if (f) {
+ // "all" and "summary"/"sum" take precedence over the individual selectors
+ if (what.count("all")) {
+ f->open_object_section("pg_map");
+ pg_map.dump(f);
+ f->close_section();
+ } else if (what.count("summary") || what.count("sum")) {
+ f->open_object_section("pg_map");
+ pg_map.dump_basic(f);
+ f->close_section();
+ } else {
+ if (what.count("pools")) {
+ pg_map.dump_pool_stats(f);
+ }
+ if (what.count("osds")) {
+ pg_map.dump_osd_stats(f);
+ }
+ if (what.count("pgs")) {
+ pg_map.dump_pg_stats(f, false);
+ }
+ if (what.count("pgs_brief")) {
+ pg_map.dump_pg_stats(f, true);
+ }
+ if (what.count("delta")) {
+ f->open_object_section("delta");
+ pg_map.dump_delta(f);
+ f->close_section();
+ }
+ }
+ f->flush(*odata);
+ } else {
+ // plain text: the omap note is appended only for the outputs flagged below
+ if (what.count("all")) {
+ pg_map.dump(ds);
+ omap_stats_note_required = true;
+ } else if (what.count("summary") || what.count("sum")) {
+ pg_map.dump_basic(ds);
+ pg_map.dump_pg_sum_stats(ds, true);
+ pg_map.dump_osd_sum_stats(ds);
+ omap_stats_note_required = true;
+ } else {
+ if (what.count("pgs_brief")) {
+ pg_map.dump_pg_stats(ds, true);
+ }
+ // `header` is passed to dump_pool_stats; it is cleared once the full
+ // pgs table has been printed (presumably to avoid repeating the header)
+ bool header = true;
+ if (what.count("pgs")) {
+ pg_map.dump_pg_stats(ds, false);
+ header = false;
+ omap_stats_note_required = true;
+ }
+ if (what.count("pools")) {
+ pg_map.dump_pool_stats(ds, header);
+ omap_stats_note_required = true;
+ }
+ if (what.count("osds")) {
+ pg_map.dump_osd_stats(ds);
+ }
+ }
+ odata->append(ds);
+ if (omap_stats_note_required) {
+ odata->append(omap_stats_note);
+ }
+ }
+ *ss << "dumped " << what;
+ return 0;
+ }
+
+ if (prefix == "pg ls") {
+ // -1 means "no filter" for both osd and pool
+ int64_t osd = -1;
+ int64_t pool = -1;
+ vector<string>states;
+ set<pg_t> pgs;
+ cmd_getval(cmdmap, "pool", pool);
+ cmd_getval(cmdmap, "osd", osd);
+ cmd_getval(cmdmap, "states", states);
+ if (pool >= 0 && !osdmap.have_pg_pool(pool)) {
+ *ss << "pool " << pool << " does not exist";
+ return -ENOENT;
+ }
+ if (osd >= 0 && !osdmap.is_up(osd)) {
+ *ss << "osd " << osd << " is not up";
+ return -EAGAIN;
+ }
+ if (states.empty())
+ states.push_back("all");
+
+ // OR together the bit of every requested state name
+ uint64_t state = 0;
+
+ while (!states.empty()) {
+ string state_str = states.back();
+
+ if (state_str == "all") {
+ // all bits set; no need to look at the remaining names
+ state = -1;
+ break;
+ } else {
+ auto filter = pg_string_state(state_str);
+ if (!filter) {
+ *ss << "'" << state_str << "' is not a valid pg state,"
+ << " available choices: " << pg_state_string(0xFFFFFFFF);
+ return -EINVAL;
+ }
+ state |= *filter;
+ }
+
+ states.pop_back();
+ }
+
+ pg_map.get_filtered_pg_stats(state, pool, osd, primary, pgs);
+
+ // nothing is written to odata when no pg matched
+ if (f && !pgs.empty()) {
+ pg_map.dump_filtered_pg_stats(f, pgs);
+ f->flush(*odata);
+ } else if (!pgs.empty()) {
+ pg_map.dump_filtered_pg_stats(ds, pgs);
+ odata->append(ds);
+ odata->append(omap_stats_note);
+ }
+ return 0;
+ }
+
+ if (prefix == "pg dump_stuck") {
+ vector<string> stuckop_vec;
+ cmd_getval(cmdmap, "stuckops", stuckop_vec);
+ if (stuckop_vec.empty())
+ stuckop_vec.push_back("unclean");
+ // fall back to the configured mon_pg_stuck_threshold when none is given
+ const int64_t threshold = cmd_getval_or<int64_t>(
+ cmdmap, "threshold",
+ g_conf().get_val<int64_t>("mon_pg_stuck_threshold"));
+
+ // a dump failure is reported through ss only; the command still returns 0
+ if (pg_map.dump_stuck_pg_stats(ds, f, (int)threshold, stuckop_vec) < 0) {
+ *ss << "failed";
+ } else {
+ *ss << "ok";
+ }
+ odata->append(ds);
+ return 0;
+ }
+
+ if (prefix == "pg debug") {
+ const string debugop = cmd_getval_or<string>(
+ cmdmap, "debugop",
+ "unfound_objects_exist");
+ if (debugop == "unfound_objects_exist") {
+ // TRUE as soon as any pg reports an unfound object
+ bool unfound_objects_exist = false;
+ for (const auto& p : pg_map.pg_stat) {
+ if (p.second.stats.sum.num_objects_unfound > 0) {
+ unfound_objects_exist = true;
+ break;
+ }
+ }
+ if (unfound_objects_exist)
+ ds << "TRUE";
+ else
+ ds << "FALSE";
+ odata->append(ds);
+ return 0;
+ }
+ if (debugop == "degraded_pgs_exist") {
+ // TRUE as soon as any pg reports a degraded object
+ bool degraded_pgs_exist = false;
+ for (const auto& p : pg_map.pg_stat) {
+ if (p.second.stats.sum.num_objects_degraded > 0) {
+ degraded_pgs_exist = true;
+ break;
+ }
+ }
+ if (degraded_pgs_exist)
+ ds << "TRUE";
+ else
+ ds << "FALSE";
+ odata->append(ds);
+ return 0;
+ }
+ // any other debugop falls through and ends at -EOPNOTSUPP below
+ }
+
+ if (prefix == "osd perf") {
+ if (f) {
+ f->open_object_section("osdstats");
+ pg_map.dump_osd_perf_stats(f);
+ f->close_section();
+ f->flush(ds);
+ } else {
+ pg_map.print_osd_perf_stats(&ds);
+ }
+ odata->append(ds);
+ return 0;
+ }
+
+ if (prefix == "osd blocked-by") {
+ if (f) {
+ f->open_object_section("osd_blocked_by");
+ pg_map.dump_osd_blocked_by_stats(f);
+ f->close_section();
+ f->flush(ds);
+ } else {
+ pg_map.print_osd_blocked_by_stats(&ds);
+ }
+ odata->append(ds);
+ return 0;
+ }
+
+ return -EOPNOTSUPP;
+}
+
+/**
+ * Bring a pending PGMap incremental in line with the given OSDMap.
+ *
+ * Three passes: reconcile per-OSD stats with each OSD's existence / in / up
+ * state, queue removal of PGs whose pool is gone, and add or remove PGs so
+ * that every pool matches its current pg_num.  Pending PG updates that no
+ * longer correspond to an existing pool/PG are dropped at the end.
+ */
+void PGMapUpdater::check_osd_map(
+  CephContext *cct,
+  const OSDMap& osdmap,
+  const PGMap& pgmap,
+  PGMap::Incremental *pending_inc)
+{
+  for (auto& osd_entry : pgmap.osd_stat) {
+    const auto osd = osd_entry.first;
+    const auto& st = osd_entry.second;
+    if (!osdmap.exists(osd)) {
+      // remove osd_stat
+      pending_inc->rm_stat(osd);
+      continue;
+    }
+    if (osdmap.is_out(osd)) {
+      // zero osd_stat
+      if (st.statfs.total != 0) {
+        pending_inc->stat_osd_out(osd);
+      }
+      continue;
+    }
+    // zero the op_queue_age_hist
+    if (!osdmap.is_up(osd) && !st.op_queue_age_hist.empty()) {
+      pending_inc->stat_osd_down_up(osd, pgmap);
+    }
+  }
+
+  // deleted pgs (pools)?
+  for (auto& pool_entry : pgmap.pg_pool_sum) {
+    const auto gone_pool = pool_entry.first;
+    if (osdmap.have_pg_pool(gone_pool)) {
+      continue;
+    }
+    ldout(cct, 10) << __func__ << " pool " << gone_pool << " gone, removing pgs"
+                   << dendl;
+    for (auto& pg_entry : pgmap.pg_stat) {
+      if (pg_entry.first.pool() == gone_pool) {
+        pending_inc->pg_remove.insert(pg_entry.first);
+      }
+    }
+    for (auto upd = pending_inc->pg_stat_updates.begin();
+         upd != pending_inc->pg_stat_updates.end(); ) {
+      if (upd->first.pool() == gone_pool) {
+        upd = pending_inc->pg_stat_updates.erase(upd);
+      } else {
+        ++upd;
+      }
+    }
+  }
+
+  // new (split or new pool) or merged pgs?
+  map<int64_t,unsigned> new_pg_num;
+  for (auto& pool_entry : osdmap.get_pools()) {
+    const int64_t poolid = pool_entry.first;
+    const pg_pool_t& pi = pool_entry.second;
+    unsigned my_pg_num = 0;
+    auto known = pgmap.num_pg_by_pool.find(poolid);
+    if (known != pgmap.num_pg_by_pool.end()) {
+      my_pg_num = known->second;
+    }
+    const unsigned pg_num = pi.get_pg_num();
+    new_pg_num[poolid] = pg_num;
+
+    if (my_pg_num < pg_num) {
+      ldout(cct,10) << __func__ << " pool " << poolid << " pg_num " << pg_num
+                    << " > my pg_num " << my_pg_num << dendl;
+      // every timestamp of a freshly created pg starts at the osdmap's
+      // modification time
+      const auto& created = osdmap.get_modified();
+      for (unsigned ps = my_pg_num; ps < pg_num; ++ps) {
+        pg_t pgid(ps, poolid);
+        if (pending_inc->pg_stat_updates.count(pgid) != 0) {
+          continue;
+        }
+        ldout(cct,20) << __func__ << " adding " << pgid << dendl;
+        pg_stat_t &stats = pending_inc->pg_stat_updates[pgid];
+        stats.last_fresh = created;
+        stats.last_active = created;
+        stats.last_change = created;
+        stats.last_peered = created;
+        stats.last_clean = created;
+        stats.last_unstale = created;
+        stats.last_undegraded = created;
+        stats.last_fullsized = created;
+        stats.last_scrub_stamp = created;
+        stats.last_deep_scrub_stamp = created;
+        stats.last_clean_scrub_stamp = created;
+      }
+    } else if (my_pg_num > pg_num) {
+      ldout(cct,10) << __func__ << " pool " << poolid << " pg_num " << pg_num
+                    << " < my pg_num " << my_pg_num << dendl;
+      for (unsigned ps = pg_num; ps < my_pg_num; ++ps) {
+        pg_t pgid(ps, poolid);
+        ldout(cct,20) << __func__ << " removing merged " << pgid << dendl;
+        if (pgmap.pg_stat.count(pgid)) {
+          pending_inc->pg_remove.insert(pgid);
+        }
+        pending_inc->pg_stat_updates.erase(pgid);
+      }
+    }
+  }
+
+  // drop pending updates for pools that vanished or pgs beyond pg_num
+  for (auto upd = pending_inc->pg_stat_updates.begin();
+       upd != pending_inc->pg_stat_updates.end(); ) {
+    auto limit = new_pg_num.find(upd->first.pool());
+    const bool obsolete =
+      limit == new_pg_num.end() || upd->first.ps() >= limit->second;
+    if (obsolete) {
+      ldout(cct,20) << __func__ << " removing pending update to old "
+                    << upd->first << dendl;
+      upd = pending_inc->pg_stat_updates.erase(upd);
+    } else {
+      ++upd;
+    }
+  }
+}
+
+// Queue a "stale" state change for pgid in pending_inc when its current
+// stats are not stale yet and its acting primary is down in osdmap.  An
+// already pending update is reused only if it names the same primary, or is
+// itself not stale with a primary that is down.
+static void _try_mark_pg_stale(
+  const OSDMap& osdmap,
+  pg_t pgid,
+  const pg_stat_t& cur,
+  PGMap::Incremental *pending_inc)
+{
+  if ((cur.state & PG_STATE_STALE) != 0 ||
+      cur.acting_primary == -1 ||
+      !osdmap.is_down(cur.acting_primary)) {
+    return;
+  }
+
+  pg_stat_t *target = nullptr;
+  auto pending = pending_inc->pg_stat_updates.find(pgid);
+  if (pending == pending_inc->pg_stat_updates.end()) {
+    // no pending update yet: start one from the current stats
+    target = &pending_inc->pg_stat_updates[pgid];
+    *target = cur;
+  } else {
+    const pg_stat_t& upd = pending->second;
+    if (upd.acting_primary != cur.acting_primary &&
+        ((upd.state & PG_STATE_STALE) != 0 ||
+         upd.acting_primary == -1 ||
+         !osdmap.is_down(upd.acting_primary))) {
+      // pending update is no longer down or already stale
+      return;
+    }
+    target = &pending->second;
+  }
+
+  dout(10) << __func__ << " marking pg " << pgid
+           << " stale (acting_primary " << target->acting_primary
+           << ")" << dendl;
+  target->state |= PG_STATE_STALE;
+  target->last_unstale = ceph_clock_now();
+}
+
+/**
+ * Mark PGs stale when their acting primary is down.
+ *
+ * Either every PG in pg_map is examined (check_all, or when the number of
+ * OSDs to check exceeds num_osds * mon_pg_check_down_all_threshold), or only
+ * the PGs listed in pg_by_osd for the down OSDs in need_check_down_pg_osds.
+ */
+void PGMapUpdater::check_down_pgs(
+  const OSDMap &osdmap,
+  const PGMap &pg_map,
+  bool check_all,
+  const set<int>& need_check_down_pg_osds,
+  PGMap::Incremental *pending_inc)
+{
+  // if a large number of osds changed state, just iterate over the whole
+  // pg map.
+  const double full_scan_cutoff = (unsigned)osdmap.get_num_osds() *
+    g_conf().get_val<double>("mon_pg_check_down_all_threshold");
+  if (need_check_down_pg_osds.size() > full_scan_cutoff) {
+    check_all = true;
+  }
+
+  if (check_all) {
+    for (const auto& entry : pg_map.pg_stat) {
+      _try_mark_pg_stale(osdmap, entry.first, entry.second, pending_inc);
+    }
+    return;
+  }
+
+  for (auto osd : need_check_down_pg_osds) {
+    if (!osdmap.is_down(osd)) {
+      continue;
+    }
+    auto owned = pg_map.pg_by_osd.find(osd);
+    if (owned == pg_map.pg_by_osd.end()) {
+      continue;
+    }
+    for (auto pgid : owned->second) {
+      const pg_stat_t &stat = pg_map.pg_stat.at(pgid);
+      ceph_assert(stat.acting_primary == osd);
+      _try_mark_pg_stale(osdmap, pgid, stat, pending_inc);
+    }
+  }
+}
+
+/**
+ * Propose new OSD reweight values that move data away from over-utilized
+ * OSDs and, unless @p no_increasing is set, back toward under-utilized ones.
+ *
+ * Utilization is "PG copies per unit of CRUSH weight" when @p by_pg is true,
+ * otherwise raw used bytes over total bytes.  OSDs are visited in order of
+ * decreasing distance from the average utilization; the scan stops once
+ * @p max_osds changes have been recorded.  A single change is limited to
+ * @p max_changef (as a fraction of CEPH_OSD_IN).
+ *
+ * @param oload        overload threshold in percent of the average; must be > 100
+ * @param pools        by_pg only: restrict PG counting to these pools (may be null)
+ * @param new_weights  out: osd id -> proposed weight
+ * @param ss           receives the explanation when the request is refused
+ * @param out_str      handed to summarize_mapping_stats(); when @p f is null the
+ *                     parameter summary and change list are appended as text
+ * @param f            optional formatter; receives a "reweight_by_utilization"
+ *                     section including every proposed change
+ * @return number of proposed changes, -EINVAL if oload <= 100, -EDOM if there
+ *         is too little data to reweight on
+ */
+int reweight::by_utilization(
+  const OSDMap &osdmap,
+  const PGMap &pgm,
+  int oload,
+  double max_changef,
+  int max_osds,
+  bool by_pg, const set<int64_t> *pools,
+  bool no_increasing,
+  mempool::osdmap::map<int32_t, uint32_t>* new_weights,
+  std::stringstream *ss,
+  std::string *out_str,
+  ceph::Formatter *f)
+{
+  if (oload <= 100) {
+    *ss << "You must give a percentage higher than 100. "
+      "The reweighting threshold will be calculated as <average-utilization> "
+      "times <input-percentage>. For example, an argument of 200 would "
+      "reweight OSDs which are twice as utilized as the average OSD.\n";
+    return -EINVAL;
+  }
+
+  vector<int> pgs_by_osd(osdmap.get_max_osd());
+
+  // Avoid putting a small number (or 0) in the denominator when calculating
+  // average_util
+  double average_util;
+  if (by_pg) {
+    // by pg mapping
+    double weight_sum = 0.0;      // sum up the crush weights
+    unsigned num_pg_copies = 0;
+    int num_osds = 0;
+    for (const auto& pg : pgm.pg_stat) {
+      if (pools && pools->count(pg.first.pool()) == 0)
+        continue;
+      for (const auto acting : pg.second.acting) {
+        if (!osdmap.exists(acting)) {
+          continue;
+        }
+        // index `acting` must be valid below, so grow to acting + 1 slots
+        if (acting >= (int)pgs_by_osd.size())
+          pgs_by_osd.resize(acting + 1);
+        if (pgs_by_osd[acting] == 0) {
+          if (osdmap.crush->get_item_weightf(acting) <= 0) {
+            //skip if we currently can not identify item
+            continue;
+          }
+          weight_sum += osdmap.crush->get_item_weightf(acting);
+          ++num_osds;
+        }
+        ++pgs_by_osd[acting];
+        ++num_pg_copies;
+      }
+    }
+
+    if (!num_osds || (num_pg_copies / num_osds < g_conf()->mon_reweight_min_pgs_per_osd)) {
+      *ss << "Refusing to reweight: we only have " << num_pg_copies
+          << " PGs across " << num_osds << " osds!\n";
+      return -EDOM;
+    }
+
+    average_util = (double)num_pg_copies / weight_sum;
+  } else {
+    // by osd utilization
+    int num_osd = std::max<size_t>(1, pgm.osd_stat.size());
+    if ((uint64_t)pgm.osd_sum.statfs.total / num_osd
+        < g_conf()->mon_reweight_min_bytes_per_osd) {
+      *ss << "Refusing to reweight: we only have " << pgm.osd_sum.statfs.kb()
+          << " kb across all osds!\n";
+      return -EDOM;
+    }
+    if ((uint64_t)pgm.osd_sum.statfs.get_used_raw() / num_osd
+        < g_conf()->mon_reweight_min_bytes_per_osd) {
+      *ss << "Refusing to reweight: we only have "
+          << pgm.osd_sum.statfs.kb_used_raw()
+          << " kb used across all osds!\n";
+      return -EDOM;
+    }
+
+    average_util = (double)pgm.osd_sum.statfs.get_used_raw() /
+      (double)pgm.osd_sum.statfs.total;
+  }
+
+  // adjust down only if we are above the threshold
+  const double overload_util = average_util * (double)oload / 100.0;
+
+  // but aggressively adjust weights up whenever possible.
+  const double underload_util = average_util;
+
+  const unsigned max_change = (unsigned)(max_changef * (double)CEPH_OSD_IN);
+
+  ostringstream oss;
+  if (f) {
+    f->open_object_section("reweight_by_utilization");
+    f->dump_int("overload_min", oload);
+    f->dump_float("max_change", max_changef);
+    f->dump_int("max_change_osds", max_osds);
+    f->dump_float("average_utilization", average_util);
+    f->dump_float("overload_utilization", overload_util);
+  } else {
+    oss << "oload " << oload << "\n";
+    oss << "max_change " << max_changef << "\n";
+    oss << "max_change_osds " << max_osds << "\n";
+    oss.precision(4);
+    oss << "average_utilization " << std::fixed << average_util << "\n";
+    oss << "overload_utilization " << overload_util << "\n";
+  }
+  int num_changed = 0;
+
+  // precompute util for each OSD
+  std::vector<std::pair<int, float> > util_by_osd;
+  for (const auto& p : pgm.osd_stat) {
+    std::pair<int, float> osd_util;
+    osd_util.first = p.first;
+    if (by_pg) {
+      if (p.first >= (int)pgs_by_osd.size() ||
+          pgs_by_osd[p.first] == 0) {
+        // skip if this OSD does not contain any pg
+        // belonging to the specified pool(s).
+        continue;
+      }
+
+      if (osdmap.crush->get_item_weightf(p.first) <= 0) {
+        // skip if we are unable to locate item.
+        continue;
+      }
+
+      osd_util.second =
+        pgs_by_osd[p.first] / osdmap.crush->get_item_weightf(p.first);
+    } else {
+      if (p.second.statfs.total == 0) {
+        // used/total would be NaN.  Such an OSD can never be reweighted (NaN
+        // fails both threshold tests) and NaN keys would break the strict
+        // weak ordering std::sort requires, so leave it out.
+        continue;
+      }
+      osd_util.second =
+        (double)p.second.statfs.get_used_raw() / (double)p.second.statfs.total;
+    }
+    util_by_osd.push_back(osd_util);
+  }
+
+  // sort by absolute deviation from the mean utilization,
+  // in descending order.
+  // (computed by hand so the result never depends on which abs() overload
+  // an unqualified call would pick)
+  const auto deviation = [average_util](float util) {
+    const double d = util - average_util;
+    return d < 0 ? -d : d;
+  };
+  std::sort(util_by_osd.begin(), util_by_osd.end(),
+            [&deviation](const std::pair<int, float>& l,
+                         const std::pair<int, float>& r) {
+              return deviation(l.second) > deviation(r.second);
+            });
+
+  // Scale `weight` by average_util / util.  The ratio is infinite for
+  // util == 0 and NaN for 0/0; converting such a value to unsigned is
+  // undefined, so clamp to the largest valid weight (CEPH_OSD_IN) first and
+  // keep the weight unchanged for NaN.
+  const auto scaled_weight = [average_util](unsigned weight, float util) -> unsigned {
+    const double scaled = (average_util / util) * (double)weight;
+    if (!(scaled >= 0.0))
+      return weight;
+    return (unsigned)std::min<double>(scaled, CEPH_OSD_IN);
+  };
+
+  // Report one proposed change, to the formatter if there is one and to the
+  // text report otherwise.
+  const auto report_change = [&](int osd, unsigned weight, unsigned new_weight) {
+    if (f) {
+      f->open_object_section("osd");
+      f->dump_int("osd", osd);
+      f->dump_float("weight", (float)weight / (float)CEPH_OSD_IN);
+      f->dump_float("new_weight", (float)new_weight / (float)CEPH_OSD_IN);
+      f->close_section();
+    } else {
+      oss << "osd." << osd << " weight "
+          << (float)weight / (float)CEPH_OSD_IN << " -> "
+          << (float)new_weight / (float)CEPH_OSD_IN << "\n";
+    }
+  };
+
+  if (f)
+    f->open_array_section("reweights");
+
+  for (const auto& p : util_by_osd) {
+    unsigned weight = osdmap.get_weight(p.first);
+    if (weight == 0) {
+      // skip if OSD is currently out
+      continue;
+    }
+    float util = p.second;
+
+    if (util >= overload_util) {
+      // Assign a lower weight to overloaded OSDs. The current weight
+      // is a factor to take into account the original weights,
+      // to represent e.g. differing storage capacities
+      unsigned new_weight = scaled_weight(weight, util);
+      if (weight > max_change)
+        new_weight = std::max(new_weight, weight - max_change);
+      new_weights->insert({p.first, new_weight});
+      report_change(p.first, weight, new_weight);
+      if (++num_changed >= max_osds)
+        break;
+    }
+    if (!no_increasing && util <= underload_util) {
+      // assign a higher weight.. if we can.
+      unsigned new_weight = scaled_weight(weight, util);
+      new_weight = std::min(new_weight, weight + max_change);
+      if (new_weight > CEPH_OSD_IN)
+        new_weight = CEPH_OSD_IN;
+      if (new_weight > weight) {
+        new_weights->insert({p.first, new_weight});
+        // previously only written to the text report, so formatted output
+        // silently omitted every increased weight
+        report_change(p.first, weight, new_weight);
+        if (++num_changed >= max_osds)
+          break;
+      }
+    }
+  }
+  if (f) {
+    f->close_section();
+  }
+
+  // apply the proposal to a scratch copy of the map to summarize its effect
+  OSDMap newmap;
+  newmap.deepish_copy_from(osdmap);
+  OSDMap::Incremental newinc;
+  newinc.fsid = newmap.get_fsid();
+  newinc.epoch = newmap.get_epoch() + 1;
+  newinc.new_weight = *new_weights;
+  newmap.apply_incremental(newinc);
+
+  osdmap.summarize_mapping_stats(&newmap, pools, out_str, f);
+
+  if (f) {
+    f->close_section();
+  } else {
+    *out_str += "\n";
+    *out_str += oss.str();
+  }
+  return num_changed;
+}
diff --git a/src/mon/PGMap.h b/src/mon/PGMap.h
new file mode 100644
index 000000000..d1135f2a1
--- /dev/null
+++ b/src/mon/PGMap.h
@@ -0,0 +1,570 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+/*
+ * Placement Group Map. Placement Groups are logical sets of objects
+ * that are replicated by the same set of devices. pgid=(r,hash(o)&m)
+ * where & is a bit-wise AND and m=2^k-1
+ */
+
+#ifndef CEPH_PGMAP_H
+#define CEPH_PGMAP_H
+
+#include "include/health.h"
+#include "common/debug.h"
+#include "common/TextTable.h"
+#include "osd/osd_types.h"
+#include "include/mempool.h"
+#include "mon/health_check.h"
+#include <sstream>
+
+namespace ceph { class Formatter; }
+
+class PGMapDigest {
+public:
+ MEMPOOL_CLASS_HELPERS();
+ virtual ~PGMapDigest() {}
+
+ mempool::pgmap::vector<uint64_t> osd_last_seq;
+
+ mutable std::map<int, int64_t> avail_space_by_rule;
+
+ // aggregate state, populated by PGMap child
+ int64_t num_pg = 0, num_osd = 0;
+ int64_t num_pg_active = 0;
+ int64_t num_pg_unknown = 0;
+ mempool::pgmap::unordered_map<int32_t,pool_stat_t> pg_pool_sum;
+ mempool::pgmap::map<int64_t,int64_t> num_pg_by_pool;
+ pool_stat_t pg_sum;
+ osd_stat_t osd_sum;
+ mempool::pgmap::map<std::string,osd_stat_t> osd_sum_by_class;
+ mempool::pgmap::unordered_map<uint64_t,int32_t> num_pg_by_state;
+ // Per-OSD pg counters (the value type of num_pg_by_osd).
+ struct pg_count {
+ int32_t acting = 0;
+ int32_t up_not_acting = 0;
+ int32_t primary = 0;
+ // encode/decode must keep the same field order: acting, up_not_acting, primary
+ void encode(ceph::buffer::list& bl) const {
+ using ceph::encode;
+ encode(acting, bl);
+ encode(up_not_acting, bl);
+ encode(primary, bl);
+ }
+ void decode(ceph::buffer::list::const_iterator& p) {
+ using ceph::decode;
+ decode(acting, p);
+ decode(up_not_acting, p);
+ decode(primary, p);
+ }
+ void dump(ceph::Formatter *f) const {
+ f->dump_int("acting", acting);
+ f->dump_int("up_not_acting", up_not_acting);
+ f->dump_int("primary", primary);
+ }
+ // appends two heap-allocated instances to o: a default one and one with
+ // acting=1, up_not_acting=2, primary=3
+ static void generate_test_instances(std::list<pg_count*>& o) {
+ o.push_back(new pg_count);
+ o.push_back(new pg_count);
+ o.back()->acting = 1;
+ o.back()->up_not_acting = 2;
+ o.back()->primary = 3;
+ }
+ };
+ mempool::pgmap::unordered_map<int32_t,pg_count> num_pg_by_osd;
+
+ mempool::pgmap::map<int64_t,interval_set<snapid_t>> purged_snaps;
+
+ // true when osd_sum.num_per_pool_osds equals osd_sum.num_osds
+ // (presumably: every summed OSD reports per-pool stats)
+ bool use_per_pool_stats() const {
+ return osd_sum.num_osds == osd_sum.num_per_pool_osds;
+ }
+ // same comparison against osd_sum.num_per_pool_omap_osds
+ bool use_per_pool_omap_stats() const {
+ return osd_sum.num_osds == osd_sum.num_per_pool_omap_osds;
+ }
+
+ // recent deltas, and summation
+ /**
+ * keep track of last deltas for each pool, calculated using
+ * @p pg_pool_sum as baseline.
+ */
+ mempool::pgmap::unordered_map<int64_t, mempool::pgmap::list<std::pair<pool_stat_t, utime_t> > > per_pool_sum_deltas;
+ /**
+ * keep track of per-pool timestamp deltas, according to last update on
+ * each pool.
+ */
+ mempool::pgmap::unordered_map<int64_t, utime_t> per_pool_sum_deltas_stamps;
+ /**
+ * keep track of sum deltas, per-pool, taking into account any previous
+ * deltas existing in @p per_pool_sum_deltas. The utime_t as second member
+ * of the pair is the timestamp referring to the last update (i.e., the first
+ * member of the pair) for a given pool.
+ */
+ mempool::pgmap::unordered_map<int64_t, std::pair<pool_stat_t,utime_t> > per_pool_sum_delta;
+
+ pool_stat_t pg_sum_delta;
+ utime_t stamp_delta;
+
+ void get_recovery_stats(
+ double *misplaced_ratio,
+ double *degraded_ratio,
+ double *inactive_ratio,
+ double *unknown_pgs_ratio) const;
+
+ void print_summary(ceph::Formatter *f, std::ostream *out) const;
+ void print_oneline_summary(ceph::Formatter *f, std::ostream *out) const;
+
+ void recovery_summary(ceph::Formatter *f, std::list<std::string> *psl,
+ const pool_stat_t& pool_sum) const;
+ void overall_recovery_summary(ceph::Formatter *f, std::list<std::string> *psl) const;
+ void pool_recovery_summary(ceph::Formatter *f, std::list<std::string> *psl,
+ uint64_t poolid) const;
+ void recovery_rate_summary(ceph::Formatter *f, std::ostream *out,
+ const pool_stat_t& delta_sum,
+ utime_t delta_stamp) const;
+ void overall_recovery_rate_summary(ceph::Formatter *f, std::ostream *out) const;
+ void pool_recovery_rate_summary(ceph::Formatter *f, std::ostream *out,
+ uint64_t poolid) const;
+ /**
+ * Obtain a formatted/plain output for client I/O, source from stats for a
+ * given @p delta_sum pool over a given @p delta_stamp period of time.
+ */
+ void client_io_rate_summary(ceph::Formatter *f, std::ostream *out,
+ const pool_stat_t& delta_sum,
+ utime_t delta_stamp) const;
+ /**
+ * Obtain a formatted/plain output for the overall client I/O, which is
+ * calculated resorting to @p pg_sum_delta and @p stamp_delta.
+ */
+ void overall_client_io_rate_summary(ceph::Formatter *f, std::ostream *out) const;
+ /**
+ * Obtain a formatted/plain output for client I/O over a given pool
+ * with id @p pool_id. We will then obtain pool-specific data
+ * from @p per_pool_sum_delta.
+ */
+ void pool_client_io_rate_summary(ceph::Formatter *f, std::ostream *out,
+ uint64_t poolid) const;
+ /**
+ * Obtain a formatted/plain output for cache tier IO, source from stats for a
+ * given @p delta_sum pool over a given @p delta_stamp period of time.
+ */
+ void cache_io_rate_summary(ceph::Formatter *f, std::ostream *out,
+ const pool_stat_t& delta_sum,
+ utime_t delta_stamp) const;
+ /**
+ * Obtain a formatted/plain output for the overall cache tier IO, which is
+ * calculated resorting to @p pg_sum_delta and @p stamp_delta.
+ */
+ void overall_cache_io_rate_summary(ceph::Formatter *f, std::ostream *out) const;
+ /**
+ * Obtain a formatted/plain output for cache tier IO over a given pool
+ * with id @p pool_id. We will then obtain pool-specific data
+ * from @p per_pool_sum_delta.
+ */
+ void pool_cache_io_rate_summary(ceph::Formatter *f, std::ostream *out,
+ uint64_t poolid) const;
+
+ /**
+ * Return the number of additional bytes that can be stored in this
+ * pool before the first OSD fills up, accounting for PG overhead.
+ */
+ int64_t get_pool_free_space(const OSDMap &osd_map, int64_t poolid) const;
+
+
+ /**
+ * Dump pool usage and io ops/bytes, used by "ceph df" command
+ */
+ virtual void dump_pool_stats_full(const OSDMap &osd_map, std::stringstream *ss,
+ ceph::Formatter *f, bool verbose) const;
+ void dump_cluster_stats(std::stringstream *ss, ceph::Formatter *f, bool verbose) const;
+ static void dump_object_stat_sum(TextTable &tbl, ceph::Formatter *f,
+ const pool_stat_t &pool_stat,
+ uint64_t avail,
+ float raw_used_rate,
+ bool verbose,
+ bool per_pool,
+ bool per_pool_omap,
+ const pg_pool_t *pool);
+
+ /// Acting pg count recorded for @p osd in num_pg_by_osd, 0 if it has no entry.
+ size_t get_num_pg_by_osd(int osd) const {
+   auto found = num_pg_by_osd.find(osd);
+   if (found != num_pg_by_osd.end())
+     return found->second.acting;
+   return 0;
+ }
+ /// Primary pg count recorded for @p osd in num_pg_by_osd, 0 if it has no entry.
+ int get_num_primary_pg_by_osd(int osd) const {
+   auto found = num_pg_by_osd.find(osd);
+   if (found != num_pg_by_osd.end())
+     return found->second.primary;
+   return 0;
+ }
+
+ ceph_statfs get_statfs(OSDMap &osdmap,
+ std::optional<int64_t> data_pool) const;
+
+ /// Available space recorded in avail_space_by_rule for CRUSH rule
+ /// @p ruleno, or 0 when the rule has no entry.
+ int64_t get_rule_avail(int ruleno) const {
+   // Read through the iterator: operator[] would repeat the lookup and is
+   // an inserting accessor, which a const getter has no business calling.
+   auto i = avail_space_by_rule.find(ruleno);
+   return i != avail_space_by_rule.end() ? i->second : 0;
+ }
+
+ // kill me post-mimic or -nautilus
+ bool definitely_converted_snapsets() const {
+   // false negative is okay; false positive is not!
+   // so answer "no" unless there are pgs and none of them is unknown
+   if (!num_pg || num_pg_unknown != 0)
+     return false;
+   return pg_sum.stats.sum.num_legacy_snapsets == 0;
+ }
+
+ /// Entry of osd_last_seq for @p osd, or 0 when @p osd is outside the
+ /// vector (including negative ids).
+ uint64_t get_last_osd_stat_seq(int osd) {
+   // a negative id passes a bare "< size()" test and would index out of bounds
+   if (osd >= 0 && osd < (int)osd_last_seq.size())
+     return osd_last_seq[osd];
+   return 0;
+ }
+
+ void encode(ceph::buffer::list& bl, uint64_t features) const;
+ void decode(ceph::buffer::list::const_iterator& p);
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<PGMapDigest*>& ls);
+};
+WRITE_CLASS_ENCODER(PGMapDigest::pg_count);
+WRITE_CLASS_ENCODER_FEATURES(PGMapDigest);
+
+class PGMap : public PGMapDigest {
+public:
+ MEMPOOL_CLASS_HELPERS();
+
+ // the map
+ version_t version;
+ epoch_t last_osdmap_epoch; // last osdmap epoch i applied to the pgmap
+ epoch_t last_pg_scan; // osdmap epoch
+ mempool::pgmap::unordered_map<int32_t,osd_stat_t> osd_stat;
+ mempool::pgmap::unordered_map<pg_t,pg_stat_t> pg_stat;
+
+ typedef mempool::pgmap::map<
+ std::pair<int64_t, int>, // <pool, osd>
+ store_statfs_t>
+ per_osd_pool_statfs_t;
+
+ per_osd_pool_statfs_t pool_statfs;
+
+ // A set of pending changes to a PGMap: pg stat updates/removals, per-pool
+ // statfs updates and (through the accessors below) osd stat updates/removals.
+ class Incremental {
+ public:
+ MEMPOOL_CLASS_HELPERS();
+ version_t version;
+ mempool::pgmap::map<pg_t,pg_stat_t> pg_stat_updates;
+ epoch_t osdmap_epoch;
+ epoch_t pg_scan; // osdmap epoch
+ mempool::pgmap::set<pg_t> pg_remove;
+ utime_t stamp;
+ per_osd_pool_statfs_t pool_statfs_updates;
+
+ private:
+ // only modified through update_stat/stat_osd_out/stat_osd_down_up/rm_stat
+ mempool::pgmap::map<int32_t,osd_stat_t> osd_stat_updates;
+ mempool::pgmap::set<int32_t> osd_stat_rm;
+ public:
+
+ const mempool::pgmap::map<int32_t, osd_stat_t> &get_osd_stat_updates() const {
+ return osd_stat_updates;
+ }
+ const mempool::pgmap::set<int32_t> &get_osd_stat_rm() const {
+ return osd_stat_rm;
+ }
+ // record (or overwrite) the pending stats for osd
+ template<typename OsdStat>
+ void update_stat(int32_t osd, OsdStat&& stat) {
+ osd_stat_updates[osd] = std::forward<OsdStat>(stat);
+ }
+ // replace the pending stats for osd with a default-constructed osd_stat_t
+ void stat_osd_out(int32_t osd) {
+ osd_stat_updates[osd] = osd_stat_t();
+ }
+ void stat_osd_down_up(int32_t osd, const PGMap& pg_map) {
+ // 0 the op_queue_age_hist for this osd
+ auto p = osd_stat_updates.find(osd);
+ if (p != osd_stat_updates.end()) {
+ p->second.op_queue_age_hist.clear();
+ return;
+ }
+ // no pending update yet: copy the osd's current stats from pg_map (if
+ // any) and clear the histogram in that copy
+ auto q = pg_map.osd_stat.find(osd);
+ if (q != pg_map.osd_stat.end()) {
+ osd_stat_t& t = osd_stat_updates[osd] = q->second;
+ t.op_queue_age_hist.clear();
+ }
+ }
+ // schedule removal of osd's stats and discard any pending update for it
+ void rm_stat(int32_t osd) {
+ osd_stat_rm.insert(osd);
+ osd_stat_updates.erase(osd);
+ }
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<Incremental*>& o);
+
+ Incremental() : version(0), osdmap_epoch(0), pg_scan(0) {}
+ };
+
+
+ // aggregate stats (soft state), generated by calc_stats()
+ mempool::pgmap::unordered_map<int,std::set<pg_t> > pg_by_osd;
+ mempool::pgmap::unordered_map<int,int> blocked_by_sum;
+ mempool::pgmap::list<std::pair<pool_stat_t, utime_t> > pg_sum_deltas;
+ mempool::pgmap::unordered_map<int64_t,mempool::pgmap::unordered_map<uint64_t,int32_t>> num_pg_by_pool_state;
+
+ utime_t stamp;
+
+ void update_pool_deltas(
+ CephContext *cct,
+ const utime_t ts,
+ const mempool::pgmap::unordered_map<int32_t, pool_stat_t>& pg_pool_sum_old);
+ void clear_delta();
+
+ /// Forget everything this map tracks for @p pool.
+ void deleted_pool(int64_t pool) {
+   // pool_statfs is keyed by <pool, osd>: erase each entry of this pool
+   auto entry = pool_statfs.begin();
+   while (entry != pool_statfs.end()) {
+     if (entry->first.first != pool) {
+       ++entry;
+       continue;
+     }
+     entry = pool_statfs.erase(entry);
+   }
+
+   // the remaining containers are keyed by pool id directly
+   pg_pool_sum.erase(pool);
+   num_pg_by_pool_state.erase(pool);
+   num_pg_by_pool.erase(pool);
+   per_pool_sum_deltas.erase(pool);
+   per_pool_sum_deltas_stamps.erase(pool);
+   per_pool_sum_delta.erase(pool);
+ }
+
+ private:
+ void update_delta(
+ CephContext *cct,
+ const utime_t ts,
+ const pool_stat_t& old_pool_sum,
+ utime_t *last_ts,
+ const pool_stat_t& current_pool_sum,
+ pool_stat_t *result_pool_delta,
+ utime_t *result_ts_delta,
+ mempool::pgmap::list<std::pair<pool_stat_t,utime_t> > *delta_avg_list);
+
+ void update_one_pool_delta(CephContext *cct,
+ const utime_t ts,
+ const int64_t pool,
+ const pool_stat_t& old_pool_sum);
+
+ public:
+
+ mempool::pgmap::set<pg_t> creating_pgs;
+ mempool::pgmap::map<int,std::map<epoch_t,std::set<pg_t> > > creating_pgs_by_osd_epoch;
+
+ // Bits that used to be enum StuckPG
+ static const int STUCK_INACTIVE = (1<<0);
+ static const int STUCK_UNCLEAN = (1<<1);
+ static const int STUCK_UNDERSIZED = (1<<2);
+ static const int STUCK_DEGRADED = (1<<3);
+ static const int STUCK_STALE = (1<<4);
+ static const int STUCK_PEERING = (1<<5);
+
+ PGMap()
+ : version(0),
+ last_osdmap_epoch(0), last_pg_scan(0)
+ {}
+
+ // Plain accessors for version, last_osdmap_epoch, last_pg_scan and stamp.
+ version_t get_version() const {
+ return version;
+ }
+ void set_version(version_t v) {
+ version = v;
+ }
+ // last osdmap epoch applied to the pgmap
+ epoch_t get_last_osdmap_epoch() const {
+ return last_osdmap_epoch;
+ }
+ void set_last_osdmap_epoch(epoch_t e) {
+ last_osdmap_epoch = e;
+ }
+ // last_pg_scan holds an osdmap epoch
+ epoch_t get_last_pg_scan() const {
+ return last_pg_scan;
+ }
+ void set_last_pg_scan(epoch_t e) {
+ last_pg_scan = e;
+ }
+ utime_t get_stamp() const {
+ return stamp;
+ }
+ void set_stamp(utime_t s) {
+ stamp = s;
+ }
+
+ /// Copy of the summed stats for @p pool; a default pool_stat_t when the
+ /// pool has no entry in pg_pool_sum.
+ pool_stat_t get_pg_pool_sum_stat(int64_t pool) const {
+   auto found = pg_pool_sum.find(pool);
+   if (found == pg_pool_sum.end())
+     return pool_stat_t();
+   return found->second;
+ }
+
+ /// Sum of the stats of the given OSDs; an empty set selects the
+ /// precomputed total over all OSDs.  Ids without stats are ignored.
+ osd_stat_t get_osd_sum(const std::set<int>& osds) const {
+   if (osds.empty()) // all
+     return osd_sum;
+   osd_stat_t total;
+   for (auto id : osds) {
+     if (const osd_stat_t *one = get_osd_stat(id))
+       total.add(*one);
+   }
+   return total;
+ }
+
+ /// Pointer to the stats stored for @p osd, or nullptr when there are none.
+ const osd_stat_t *get_osd_stat(int osd) const {
+   auto found = osd_stat.find(osd);
+   return found != osd_stat.end() ? &found->second : nullptr;
+ }
+
+
+ void apply_incremental(CephContext *cct, const Incremental& inc);
+ void calc_stats();
+ void stat_pg_add(const pg_t &pgid, const pg_stat_t &s,
+ bool sameosds=false);
+ bool stat_pg_sub(const pg_t &pgid, const pg_stat_t &s,
+ bool sameosds=false);
+ void calc_purged_snaps();
+ void calc_osd_sum_by_class(const OSDMap& osdmap);
+ void stat_osd_add(int osd, const osd_stat_t &s);
+ void stat_osd_sub(int osd, const osd_stat_t &s);
+
+ void encode(ceph::buffer::list &bl, uint64_t features=-1) const;
+ void decode(ceph::buffer::list::const_iterator &bl);
+
+ /// encode subset of our data to a PGMapDigest
+ void encode_digest(const OSDMap& osdmap,
+ ceph::buffer::list& bl, uint64_t features);
+
+ int64_t get_rule_avail(const OSDMap& osdmap, int ruleno) const;
+ void get_rules_avail(const OSDMap& osdmap,
+ std::map<int,int64_t> *avail_map) const;
+ void dump(ceph::Formatter *f, bool with_net = false) const;
+ void dump_basic(ceph::Formatter *f) const;
+ void dump_pg_stats(ceph::Formatter *f, bool brief) const;
+ void dump_pg_progress(ceph::Formatter *f) const;
+ void dump_pool_stats(ceph::Formatter *f) const;
+ void dump_osd_stats(ceph::Formatter *f, bool with_net = false) const;
+ void dump_osd_ping_times(ceph::Formatter *f) const;
+ void dump_delta(ceph::Formatter *f) const;
+ void dump_filtered_pg_stats(ceph::Formatter *f, std::set<pg_t>& pgs) const;
+ // Refresh avail_space_by_rule (a mutable member, hence usable from this
+ // const method) via get_rules_avail() before delegating to the base dump.
+ void dump_pool_stats_full(const OSDMap &osd_map, std::stringstream *ss,
+ ceph::Formatter *f, bool verbose) const override {
+ get_rules_avail(osd_map, &avail_space_by_rule);
+ PGMapDigest::dump_pool_stats_full(osd_map, ss, f, verbose);
+ }
+
+ /*
+ * Dump client io rate, recovery io rate, cache io rate and recovery information.
+ * this function is used by "ceph osd pool stats" command
+ */
+ void dump_pool_stats_and_io_rate(int64_t poolid, const OSDMap &osd_map, ceph::Formatter *f,
+ std::stringstream *ss) const;
+
+ static void dump_pg_stats_plain(
+ std::ostream& ss,
+ const mempool::pgmap::unordered_map<pg_t, pg_stat_t>& pg_stats,
+ bool brief);
+ void get_stuck_stats(
+ int types, const utime_t cutoff,
+ mempool::pgmap::unordered_map<pg_t, pg_stat_t>& stuck_pgs) const;
+ void dump_stuck(ceph::Formatter *f, int types, utime_t cutoff) const;
+ void dump_stuck_plain(std::ostream& ss, int types, utime_t cutoff) const;
+ int dump_stuck_pg_stats(std::stringstream &ds,
+ ceph::Formatter *f,
+ int threshold,
+ std::vector<std::string>& args) const;
+ void dump(std::ostream& ss) const;
+ void dump_basic(std::ostream& ss) const;
+ void dump_pg_stats(std::ostream& ss, bool brief) const;
+ void dump_pg_sum_stats(std::ostream& ss, bool header) const;
+ void dump_pool_stats(std::ostream& ss, bool header) const;
+ void dump_osd_stats(std::ostream& ss) const;
+ void dump_osd_sum_stats(std::ostream& ss) const;
+ void dump_filtered_pg_stats(std::ostream& ss, std::set<pg_t>& pgs) const;
+
+ void dump_osd_perf_stats(ceph::Formatter *f) const;
+ void print_osd_perf_stats(std::ostream *ss) const;
+
+ void dump_osd_blocked_by_stats(ceph::Formatter *f) const;
+ void print_osd_blocked_by_stats(std::ostream *ss) const;
+
+ void get_filtered_pg_stats(uint64_t state, int64_t poolid, int64_t osdid,
+ bool primary, std::set<pg_t>& pgs) const;
+
+ std::set<std::string> osd_parentage(const OSDMap& osdmap, int id) const;
+ void get_health_checks(
+ CephContext *cct,
+ const OSDMap& osdmap,
+ health_check_map_t *checks) const;
+ void print_summary(ceph::Formatter *f, std::ostream *out) const;
+
+ static void generate_test_instances(std::list<PGMap*>& o);
+};
+WRITE_CLASS_ENCODER_FEATURES(PGMap)
+
+// Stream a PGMapDigest as its one-line summary (no formatter involved).
+inline std::ostream& operator<<(std::ostream& out, const PGMapDigest& m) {
+  ceph::Formatter *no_formatter = nullptr;
+  m.print_oneline_summary(no_formatter, &out);
+  return out;
+}
+
+// Handle a pg/osd statistics command (e.g. "pg stat", "pg dump", "pg ls",
+// "osd perf") against pg_map.  The payload goes to odata, a short status
+// message to ss; returns 0 or a negative errno (-EOPNOTSUPP when the prefix
+// is not handled).
+int process_pg_map_command(
+ const std::string& prefix,
+ const cmdmap_t& cmdmap,
+ const PGMap& pg_map,
+ const OSDMap& osdmap,
+ ceph::Formatter *f,
+ std::stringstream *ss,
+ ceph::buffer::list *odata);
+
+// Static helpers that fill a PGMap::Incremental from OSDMap changes.
+class PGMapUpdater
+{
+public:
+ // reconcile pending_inc with osdmap: remove/zero stats of OSDs that no
+ // longer exist, are out or are not up, remove pgs of deleted pools, and
+ // add or remove pgs so each pool matches its current pg_num
+ static void check_osd_map(
+ CephContext *cct,
+ const OSDMap &osdmap,
+ const PGMap& pg_map,
+ PGMap::Incremental *pending_inc);
+
+ // mark pg's state stale if its acting primary osd is down
+ // (all pgs are scanned when check_all is set; otherwise only those of the
+ // osds listed in need_check_down_pg_osds)
+ static void check_down_pgs(
+ const OSDMap &osd_map,
+ const PGMap &pg_map,
+ bool check_all,
+ const std::set<int>& need_check_down_pg_osds,
+ PGMap::Incremental *pending_inc);
+};
+
+namespace reweight {
+/* Assign a lower weight to overloaded OSDs.
+ *
+ * The osds that will get a lower weight are those with a utilization
+ * percentage 'oload' percent greater than the average utilization.
+ *
+ * Unless no_increasing is set, under-utilized OSDs may get a higher weight.
+ * Proposed weights are stored in new_weights; the return value is the
+ * number of proposed changes or a negative errno.
+ */
+ int by_utilization(const OSDMap &osd_map,
+ const PGMap &pg_map,
+ int oload,
+ double max_changef,
+ int max_osds,
+ bool by_pg, const std::set<int64_t> *pools,
+ bool no_increasing,
+ mempool::osdmap::map<int32_t, uint32_t>* new_weights,
+ std::stringstream *ss,
+ std::string *out_str,
+ ceph::Formatter *f);
+}
+
+#endif
diff --git a/src/mon/Paxos.cc b/src/mon/Paxos.cc
new file mode 100644
index 000000000..21f244239
--- /dev/null
+++ b/src/mon/Paxos.cc
@@ -0,0 +1,1591 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <sstream>
+#include "Paxos.h"
+#include "Monitor.h"
+#include "messages/MMonPaxos.h"
+
+#include "mon/mon_types.h"
+#include "common/config.h"
+#include "include/ceph_assert.h"
+#include "include/stringify.h"
+#include "common/Timer.h"
+#include "messages/PaxosServiceMessage.h"
+
+using std::string;
+using std::unique_lock;
+
+using ceph::bufferlist;
+using ceph::Formatter;
+using ceph::JSONFormatter;
+using ceph::to_timespan;
+
+#define dout_subsys ceph_subsys_paxos
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, mon, mon.name, mon.rank, paxos_name, state, first_committed, last_committed)
+// Writes the prefix used by every dout() line in this file (see dout_prefix):
+//   mon.<name>@<rank>(<mon state>).paxos(<paxos name> <paxos state> c <first>..<last>)
+// followed by a single space, and returns the stream for further chaining.
+static std::ostream& _prefix(std::ostream *_dout, Monitor &mon, const string& name,
+                             int rank, const string& paxos_name, int state,
+                             version_t first_committed, version_t last_committed)
+{
+  std::ostream& os = *_dout;
+  os << "mon." << name << "@" << rank;
+  os << "(" << mon.get_state_name() << ")";
+  os << ".paxos(" << paxos_name << " " << Paxos::get_statename(state);
+  os << " c " << first_committed << ".." << last_committed;
+  os << ") ";
+  return os;
+}
+
+// Completion context that clears the owning Paxos' `trimming` flag when it
+// is finished. The completion code is not inspected.
+class Paxos::C_Trimmed : public Context {
+  Paxos *paxos;
+
+public:
+  explicit C_Trimmed(Paxos *p)
+    : paxos(p)
+  {}
+
+  void finish(int /* r */) override
+  {
+    paxos->trimming = false;
+  }
+};
+
+// Returns the store held by the owning monitor (mon.store); all persistent
+// paxos reads and writes in this file go through it.
+MonitorDBStore *Paxos::get_store()
+{
+ return mon.store;
+}
+
+/**
+ * Read the stored values of versions [first, last] and append the
+ * transaction decoded from each of them to @p tx.
+ *
+ * Every version in the range is asserted to be readable from the store and
+ * to have a non-empty value.
+ *
+ * @param tx    transaction that receives the decoded contents
+ * @param first first version to read (inclusive)
+ * @param last  last version to read (inclusive)
+ */
+void Paxos::read_and_prepare_transactions(MonitorDBStore::TransactionRef tx,
+                                          version_t first, version_t last)
+{
+  dout(10) << __func__ << " first " << first << " last " << last << dendl;
+  uint64_t total = 0;
+  for (version_t v = first; v <= last; ++v) {
+    dout(30) << __func__ << " apply version " << v << dendl;
+    bufferlist bl;
+    int err = get_store()->get(get_name(), v, bl);
+    ceph_assert(err == 0);
+    ceph_assert(bl.length());
+    decode_append_transaction(tx, bl);
+    ++total;
+  }
+  // Report the number of versions actually processed: the range is
+  // inclusive, so `last - first` would be one short, and it would wrap
+  // around for an empty range (last < first).
+  dout(15) << __func__ << " total versions " << total << dendl;
+}
+
+// Restore the persisted paxos counters (last_pn, accepted_pn, last_committed,
+// first_committed) from the store and assert that the result is consistent.
+void Paxos::init()
+{
+  MonitorDBStore *store = get_store();
+
+  // load paxos variables from stable storage
+  last_pn = store->get(get_name(), "last_pn");
+  accepted_pn = store->get(get_name(), "accepted_pn");
+  last_committed = store->get(get_name(), "last_committed");
+  first_committed = store->get(get_name(), "first_committed");
+
+  dout(10) << __func__
+           << " last_pn: " << last_pn
+           << " accepted_pn: " << accepted_pn
+           << " last_committed: " << last_committed
+           << " first_committed: " << first_committed
+           << dendl;
+
+  dout(10) << "init" << dendl;
+  ceph_assert(is_consistent());
+}
+
+// Build the "paxos" perf counter set, store it in `logger` and register it
+// with the global context's perf counter collection.
+void Paxos::init_logger()
+{
+  PerfCountersBuilder pcb(g_ceph_context, "paxos", l_paxos_first, l_paxos_last);
+
+  // Because monitors are so few in number, the resource cost of capturing
+  // almost all their perf counters at USEFUL is trivial.
+  pcb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
+
+  // role changes, restarts and refreshes
+  pcb.add_u64_counter(l_paxos_start_leader, "start_leader",
+                      "Starts in leader role");
+  pcb.add_u64_counter(l_paxos_start_peon, "start_peon",
+                      "Starts in peon role");
+  pcb.add_u64_counter(l_paxos_restart, "restart", "Restarts");
+  pcb.add_u64_counter(l_paxos_refresh, "refresh", "Refreshes");
+  pcb.add_time_avg(l_paxos_refresh_latency, "refresh_latency",
+                   "Refresh latency");
+
+  // begin
+  pcb.add_u64_counter(l_paxos_begin, "begin",
+                      "Started and handled begins");
+  pcb.add_u64_avg(l_paxos_begin_keys, "begin_keys",
+                  "Keys in transaction on begin");
+  pcb.add_u64_avg(l_paxos_begin_bytes, "begin_bytes",
+                  "Data in transaction on begin",
+                  nullptr, 0, unit_t(UNIT_BYTES));
+  pcb.add_time_avg(l_paxos_begin_latency, "begin_latency",
+                   "Latency of begin operation");
+
+  // commit
+  pcb.add_u64_counter(l_paxos_commit, "commit", "Commits", "cmt");
+  pcb.add_u64_avg(l_paxos_commit_keys, "commit_keys",
+                  "Keys in transaction on commit");
+  pcb.add_u64_avg(l_paxos_commit_bytes, "commit_bytes",
+                  "Data in transaction on commit",
+                  nullptr, 0, unit_t(UNIT_BYTES));
+  pcb.add_time_avg(l_paxos_commit_latency, "commit_latency",
+                   "Commit latency", "clat");
+
+  // collect
+  pcb.add_u64_counter(l_paxos_collect, "collect", "Peon collects");
+  pcb.add_u64_avg(l_paxos_collect_keys, "collect_keys",
+                  "Keys in transaction on peon collect");
+  pcb.add_u64_avg(l_paxos_collect_bytes, "collect_bytes",
+                  "Data in transaction on peon collect",
+                  nullptr, 0, unit_t(UNIT_BYTES));
+  pcb.add_time_avg(l_paxos_collect_latency, "collect_latency",
+                   "Peon collect latency");
+  pcb.add_u64_counter(l_paxos_collect_uncommitted, "collect_uncommitted",
+                      "Uncommitted values in started and handled collects");
+
+  // timeouts
+  pcb.add_u64_counter(l_paxos_collect_timeout, "collect_timeout",
+                      "Collect timeouts");
+  pcb.add_u64_counter(l_paxos_accept_timeout, "accept_timeout",
+                      "Accept timeouts");
+  pcb.add_u64_counter(l_paxos_lease_ack_timeout, "lease_ack_timeout",
+                      "Lease acknowledgement timeouts");
+  pcb.add_u64_counter(l_paxos_lease_timeout, "lease_timeout",
+                      "Lease timeouts");
+
+  // store_state
+  pcb.add_u64_counter(l_paxos_store_state, "store_state",
+                      "Store a shared state on disk");
+  pcb.add_u64_avg(l_paxos_store_state_keys, "store_state_keys",
+                  "Keys in transaction in stored state");
+  pcb.add_u64_avg(l_paxos_store_state_bytes, "store_state_bytes",
+                  "Data in transaction in stored state",
+                  nullptr, 0, unit_t(UNIT_BYTES));
+  pcb.add_time_avg(l_paxos_store_state_latency, "store_state_latency",
+                   "Storing state latency");
+
+  // share_state
+  pcb.add_u64_counter(l_paxos_share_state, "share_state",
+                      "Sharings of state");
+  pcb.add_u64_avg(l_paxos_share_state_keys, "share_state_keys",
+                  "Keys in shared state");
+  pcb.add_u64_avg(l_paxos_share_state_bytes, "share_state_bytes",
+                  "Data in shared state",
+                  nullptr, 0, unit_t(UNIT_BYTES));
+
+  // proposal numbers
+  pcb.add_u64_counter(l_paxos_new_pn, "new_pn",
+                      "New proposal number queries");
+  pcb.add_time_avg(l_paxos_new_pn_latency, "new_pn_latency",
+                   "New proposal number getting latency");
+
+  logger = pcb.create_perf_counters();
+  g_ceph_context->get_perfcounters_collection()->add(logger);
+}
+
+// Dump the four paxos counters as unsigned fields of a "paxos" object
+// section on the given formatter.
+void Paxos::dump_info(Formatter *f)
+{
+ f->open_object_section("paxos");
+ f->dump_unsigned("first_committed", first_committed);
+ f->dump_unsigned("last_committed", last_committed);
+ f->dump_unsigned("last_pn", last_pn);
+ f->dump_unsigned("accepted_pn", accepted_pn);
+ f->close_section();
+}
+
+// ---------------------------------
+
+// PHASE 1
+
+// leader
+/**
+ * Leader: start a recovery (collect) round.
+ *
+ * Switches to STATE_RECOVERING, forgets the uncommitted value and peer
+ * commit ranges of any previous round, picks up a value stored at
+ * last_committed+1 as our own uncommitted value, obtains a new proposal
+ * number from get_new_proposal_number() based on the larger of accepted_pn
+ * and @p oldpn, sends OP_COLLECT to every other quorum member and arms the
+ * collect timeout.
+ *
+ * @param oldpn proposal number to take into account when picking the new one
+ */
+void Paxos::collect(version_t oldpn)
+{
+ // we're recovering, it seems!
+ state = STATE_RECOVERING;
+ ceph_assert(mon.is_leader());
+
+ // forget the uncommitted value and the peers' commit ranges learned in a
+ // previous round
+ uncommitted_v = 0;
+ uncommitted_pn = 0;
+ uncommitted_value.clear();
+ peer_first_committed.clear();
+ peer_last_committed.clear();
+
+ // look for uncommitted value
+ if (get_store()->exists(get_name(), last_committed+1)) {
+ version_t v = get_store()->get(get_name(), "pending_v");
+ version_t pn = get_store()->get(get_name(), "pending_pn");
+ // only trust pending_pn if it was recorded for exactly this version
+ if (v && pn && v == last_committed + 1) {
+ uncommitted_pn = pn;
+ } else {
+ dout(10) << "WARNING: no pending_pn on disk, using previous accepted_pn " << accepted_pn
+ << " and crossing our fingers" << dendl;
+ uncommitted_pn = accepted_pn;
+ }
+ uncommitted_v = last_committed+1;
+
+ get_store()->get(get_name(), last_committed+1, uncommitted_value);
+ ceph_assert(uncommitted_value.length());
+ dout(10) << "learned uncommitted " << (last_committed+1)
+ << " pn " << uncommitted_pn
+ << " (" << uncommitted_value.length() << " bytes) from myself"
+ << dendl;
+
+ logger->inc(l_paxos_collect_uncommitted);
+ }
+
+ // pick new pn
+ accepted_pn = get_new_proposal_number(std::max(accepted_pn, oldpn));
+ accepted_pn_from = last_committed;
+ // we count ourselves as the first OP_LAST
+ num_last = 1;
+ dout(10) << "collect with pn " << accepted_pn << dendl;
+
+ // send collect
+ for (auto p = mon.get_quorum().begin();
+ p != mon.get_quorum().end();
+ ++p) {
+ if (*p == mon.rank) continue;
+
+ MMonPaxos *collect = new MMonPaxos(mon.get_epoch(), MMonPaxos::OP_COLLECT,
+ ceph_clock_now());
+ collect->last_committed = last_committed;
+ collect->first_committed = first_committed;
+ collect->pn = accepted_pn;
+ mon.send_mon_message(collect, *p);
+ }
+
+ // set timeout event
+ // (the callback does nothing when it is invoked with -ECANCELED)
+ collect_timeout_event = mon.timer.add_event_after(
+ g_conf()->mon_accept_timeout_factor *
+ g_conf()->mon_lease,
+ new C_MonContext{&mon, [this](int r) {
+ if (r == -ECANCELED)
+ return;
+ collect_timeout();
+ }});
+}
+
+
+// peon
+/**
+ * Peon: handle an OP_COLLECT from the leader and answer with OP_LAST.
+ *
+ * Bootstraps instead of replying when the leader's first_committed is beyond
+ * last_committed+1. Otherwise accepts (and persists) the leader's pn if it is
+ * higher than our accepted_pn, shares committed values the leader is missing,
+ * and includes any value stored at last_committed+1 together with the pn it
+ * is believed to have been accepted under.
+ *
+ * @param op request wrapping the MMonPaxos collect message
+ */
+void Paxos::handle_collect(MonOpRequestRef op)
+{
+
+ op->mark_paxos_event("handle_collect");
+
+ auto collect = op->get_req<MMonPaxos>();
+ dout(10) << "handle_collect " << *collect << dendl;
+
+ ceph_assert(mon.is_peon()); // mon epoch filter should catch strays
+
+ // we're recovering, it seems!
+ state = STATE_RECOVERING;
+
+ //update the peon recovery timeout
+ reset_lease_timeout();
+
+ // there would be a gap between our last version and the leader's first
+ if (collect->first_committed > last_committed+1) {
+ dout(2) << __func__
+ << " leader's lowest version is too high for our last committed"
+ << " (theirs: " << collect->first_committed
+ << "; ours: " << last_committed << ") -- bootstrap!" << dendl;
+ op->mark_paxos_event("need to bootstrap");
+ mon.bootstrap();
+ return;
+ }
+
+ // reply
+ MMonPaxos *last = new MMonPaxos(mon.get_epoch(), MMonPaxos::OP_LAST,
+ ceph_clock_now());
+ last->last_committed = last_committed;
+ last->first_committed = first_committed;
+
+ // remember the pn we had before (possibly) accepting the new one; it is
+ // the fallback for uncommitted_pn below
+ version_t previous_pn = accepted_pn;
+
+ // can we accept this pn?
+ if (collect->pn > accepted_pn) {
+ // ok, accept it
+ accepted_pn = collect->pn;
+ accepted_pn_from = collect->pn_from;
+ dout(10) << "accepting pn " << accepted_pn << " from "
+ << accepted_pn_from << dendl;
+
+ auto t(std::make_shared<MonitorDBStore::Transaction>());
+ t->put(get_name(), "accepted_pn", accepted_pn);
+
+ // the JSON dump below is flushed into the stream opened by dout(30) and
+ // the entry is closed by the dendl that follows
+ dout(30) << __func__ << " transaction dump:\n";
+ JSONFormatter f(true);
+ t->dump(&f);
+ f.flush(*_dout);
+ *_dout << dendl;
+
+ logger->inc(l_paxos_collect);
+ logger->inc(l_paxos_collect_keys, t->get_keys());
+ logger->inc(l_paxos_collect_bytes, t->get_bytes());
+
+ auto start = ceph::coarse_mono_clock::now();
+ get_store()->apply_transaction(t);
+ auto end = ceph::coarse_mono_clock::now();
+
+ logger->tinc(l_paxos_collect_latency, to_timespan(end - start));
+ } else {
+ // don't accept!
+ dout(10) << "NOT accepting pn " << collect->pn << " from " << collect->pn_from
+ << ", we already accepted " << accepted_pn
+ << " from " << accepted_pn_from << dendl;
+ }
+ // either way, report the pn we currently hold
+ last->pn = accepted_pn;
+ last->pn_from = accepted_pn_from;
+
+ // share whatever committed values we have
+ if (collect->last_committed < last_committed)
+ share_state(last, collect->first_committed, collect->last_committed);
+
+ // do we have an accepted but uncommitted value?
+ // (it'll be at last_committed+1)
+ bufferlist bl;
+ if (collect->last_committed <= last_committed &&
+ get_store()->exists(get_name(), last_committed+1)) {
+ get_store()->get(get_name(), last_committed+1, bl);
+ ceph_assert(bl.length() > 0);
+ dout(10) << " sharing our accepted but uncommitted value for "
+ << last_committed+1 << " (" << bl.length() << " bytes)" << dendl;
+ last->values[last_committed+1] = bl;
+
+ version_t v = get_store()->get(get_name(), "pending_v");
+ version_t pn = get_store()->get(get_name(), "pending_pn");
+ if (v && pn && v == last_committed + 1) {
+ last->uncommitted_pn = pn;
+ } else {
+ // previously we didn't record which pn a value was accepted
+ // under! use the pn value we just had... :(
+ dout(10) << "WARNING: no pending_pn on disk, using previous accepted_pn " << previous_pn
+ << " and crossing our fingers" << dendl;
+ last->uncommitted_pn = previous_pn;
+ }
+
+ logger->inc(l_paxos_collect_uncommitted);
+ }
+
+ // send reply
+ collect->get_connection()->send_message(last);
+}
+
+/**
+ * Attach to @p m every committed version the peer is missing.
+ *
+ * Versions in (peer_last_committed, last_committed] that exist in our store
+ * are read and placed, as raw bufferlists, into the message's value map; it
+ * is the receiver's job to apply the transactions they contain. The
+ * message's last_committed is set to ours.
+ *
+ * Both roles use this: the Peon from handle_collect(), when the leader's
+ * last_committed is lower than the peon's, and the Leader from
+ * handle_last(), when a peon's last_committed is lower than the leader's.
+ *
+ * @param m                    message to fill in
+ * @param peer_first_committed peer's first committed version (only logged)
+ * @param peer_last_committed  peer's last committed version; must be lower
+ *                             than our last_committed
+ */
+void Paxos::share_state(MMonPaxos *m, version_t peer_first_committed,
+                        version_t peer_last_committed)
+{
+  ceph_assert(peer_last_committed < last_committed);
+
+  dout(10) << "share_state peer has fc " << peer_first_committed
+           << " lc " << peer_last_committed << dendl;
+
+  // include incrementals
+  uint64_t bytes = 0;
+  for (version_t v = peer_last_committed + 1; v <= last_committed; ++v) {
+    if (!get_store()->exists(get_name(), v))
+      continue;
+
+    auto& value = m->values[v];
+    get_store()->get(get_name(), v, value);
+    ceph_assert(m->values[v].length());
+
+    const auto len = value.length();
+    dout(10) << " sharing " << v << " (" << len << " bytes)" << dendl;
+    bytes += len + 16;  // paxos_ + 10 digits = 16
+  }
+
+  logger->inc(l_paxos_share_state);
+  logger->inc(l_paxos_share_state_keys, m->values.size());
+  logger->inc(l_paxos_share_state_bytes, bytes);
+
+  m->last_committed = last_committed;
+}
+
+/**
+ * Store on disk a state that was shared with us
+ *
+ * Basically, we received a set of versions. Or just one. It doesn't matter.
+ * What matters is that we have to stash it in the store. So, we will simply
+ * write every single bufferlist into their own versions on our side (i.e.,
+ * onto paxos-related keys), and then we will decode those same bufferlists
+ * we just wrote and apply the transactions they hold. We will also update
+ * our first and last committed values to point to the new values, if need
+ * be. And all this is done tightly wrapped in a transaction to ensure we
+ * enjoy the atomicity guarantees given by our awesome k/v store.
+ *
+ * @param m message whose `values` map carries the shared versions
+ * @return true if a transaction was applied to the store, false if there
+ *         was nothing to store or the values start beyond last_committed+1
+ */
+bool Paxos::store_state(MMonPaxos *m)
+{
+  auto t(std::make_shared<MonitorDBStore::Transaction>());
+  auto start = m->values.begin();
+  bool changed = false;
+
+  // build map of values to store
+  // we want to write the range (last_committed, m->last_committed] only.
+  if (start != m->values.end() &&
+      start->first > last_committed + 1) {
+    // ignore everything if values start in the future.
+    dout(10) << "store_state ignoring all values, they start at " << start->first
+             << " > last_committed+1" << dendl;
+    return false;
+  }
+
+  // push forward the start position on the message's values iterator, up until
+  // we run out of positions or we find the first position past
+  // 'last_committed'.
+  while (start != m->values.end() && start->first <= last_committed) {
+    ++start;
+  }
+
+  // make sure we get the right interval of values to apply by pushing forward
+  // the 'end' iterator until it passes the message's 'last_committed'.
+  // note that our in-memory last_committed advances as we go.
+  auto end = start;
+  while (end != m->values.end() && end->first <= m->last_committed) {
+    last_committed = end->first;
+    ++end;
+  }
+
+  if (start == end) {
+    dout(10) << "store_state nothing to commit" << dendl;
+  } else {
+    dout(10) << "store_state [" << start->first << ".."
+             << last_committed << "]" << dendl;
+    t->put(get_name(), "last_committed", last_committed);
+
+    // we should apply the state here -- decode every single bufferlist in the
+    // map and append the transactions to 't'.
+    for (auto it = start; it != end; ++it) {
+      // write the bufferlist as the version's value
+      t->put(get_name(), it->first, it->second);
+      // decode the bufferlist and append it to the transaction we will shortly
+      // apply.
+      decode_append_transaction(t, it->second);
+    }
+
+    // discard obsolete uncommitted value?
+    if (uncommitted_v && uncommitted_v <= last_committed) {
+      dout(10) << " forgetting obsolete uncommitted value " << uncommitted_v
+               << " pn " << uncommitted_pn << dendl;
+      uncommitted_v = 0;
+      uncommitted_pn = 0;
+      uncommitted_value.clear();
+    }
+  }
+  if (!t->empty()) {
+    // the JSON dump below is flushed into the stream opened by dout(30) and
+    // the entry is closed by the dendl that follows
+    dout(30) << __func__ << " transaction dump:\n";
+    JSONFormatter f(true);
+    t->dump(&f);
+    f.flush(*_dout);
+    *_dout << dendl;
+
+    logger->inc(l_paxos_store_state);
+    logger->inc(l_paxos_store_state_bytes, t->get_bytes());
+    logger->inc(l_paxos_store_state_keys, t->get_keys());
+
+    // timestamps get their own names so they do not shadow the 'start' and
+    // 'end' iterators above
+    auto apply_start = ceph::coarse_mono_clock::now();
+    get_store()->apply_transaction(t);
+    auto apply_end = ceph::coarse_mono_clock::now();
+
+    logger->tinc(l_paxos_store_state_latency,
+                 to_timespan(apply_end - apply_start));
+
+    // refresh first_committed; this txn may have trimmed.
+    first_committed = get_store()->get(get_name(), "first_committed");
+
+    _sanity_check_store();
+    changed = true;
+  }
+
+  return changed;
+}
+
+// Assert that the last_committed value persisted in the store matches the
+// in-memory last_committed.
+void Paxos::_sanity_check_store()
+{
+ version_t lc = get_store()->get(get_name(), "last_committed");
+ ceph_assert(lc == last_committed);
+}
+
+
+// leader
+/**
+ * Leader: handle an OP_LAST reply to our collect.
+ *
+ * Records the peer's commit range, stores any committed values it shared,
+ * pushes commits to peers that are behind, and bootstraps when a peer's
+ * range is not contiguous with ours. Then, depending on the pn in the reply:
+ * a higher pn restarts collect() with it; an equal pn counts as an accept
+ * and may teach us an uncommitted value; a lower pn is ignored. Once every
+ * quorum member has accepted, either re-proposes the learned uncommitted
+ * value or goes active.
+ *
+ * @param op request wrapping the MMonPaxos last message
+ */
+void Paxos::handle_last(MonOpRequestRef op)
+{
+ op->mark_paxos_event("handle_last");
+ auto last = op->get_req<MMonPaxos>();
+ bool need_refresh = false;
+ int from = last->get_source().num();
+
+ dout(10) << "handle_last " << *last << dendl;
+
+ if (!mon.is_leader()) {
+ dout(10) << "not leader, dropping" << dendl;
+ return;
+ }
+
+ // note peer's first_ and last_committed, in case we learn a new
+ // commit and need to push it to them.
+ peer_first_committed[from] = last->first_committed;
+ peer_last_committed[from] = last->last_committed;
+
+ // there would be a gap between our last version and the peer's first
+ if (last->first_committed > last_committed + 1) {
+ dout(5) << __func__
+ << " mon." << from
+ << " lowest version is too high for our last committed"
+ << " (theirs: " << last->first_committed
+ << "; ours: " << last_committed << ") -- bootstrap!" << dendl;
+ op->mark_paxos_event("need to bootstrap");
+ mon.bootstrap();
+ return;
+ }
+
+ // the paxos_kill_at assertions deliberately fail at a configured step
+ ceph_assert(g_conf()->paxos_kill_at != 1);
+
+ // store any committed values if any are specified in the message
+ need_refresh = store_state(last);
+
+ ceph_assert(g_conf()->paxos_kill_at != 2);
+
+ // is everyone contiguous and up to date?
+ for (auto p = peer_last_committed.begin();
+ p != peer_last_committed.end();
+ ++p) {
+ if (p->second + 1 < first_committed && first_committed > 1) {
+ dout(5) << __func__
+ << " peon " << p->first
+ << " last_committed (" << p->second
+ << ") is too low for our first_committed (" << first_committed
+ << ") -- bootstrap!" << dendl;
+ op->mark_paxos_event("need to bootstrap");
+ mon.bootstrap();
+ return;
+ }
+ if (p->second < last_committed) {
+ // share committed values
+ dout(10) << " sending commit to mon." << p->first << dendl;
+ MMonPaxos *commit = new MMonPaxos(mon.get_epoch(),
+ MMonPaxos::OP_COMMIT,
+ ceph_clock_now());
+ share_state(commit, peer_first_committed[p->first], p->second);
+ mon.send_mon_message(commit, p->first);
+ }
+ }
+
+ // do they accept your pn?
+ if (last->pn > accepted_pn) {
+ // no, try again.
+ dout(10) << " they had a higher pn than us, picking a new one." << dendl;
+
+ // cancel timeout event
+ mon.timer.cancel_event(collect_timeout_event);
+ collect_timeout_event = 0;
+
+ collect(last->pn);
+ } else if (last->pn == accepted_pn) {
+ // yes, they accepted our pn. great.
+ num_last++;
+ dout(10) << " they accepted our pn, we now have "
+ << num_last << " peons" << dendl;
+
+ // did this person send back an accepted but uncommitted value?
+ if (last->uncommitted_pn) {
+ // take it only if its pn is at least as high as what we already know,
+ // the peer is not behind us, and it is not for an older version
+ if (last->uncommitted_pn >= uncommitted_pn &&
+ last->last_committed >= last_committed &&
+ last->last_committed + 1 >= uncommitted_v) {
+ uncommitted_v = last->last_committed+1;
+ uncommitted_pn = last->uncommitted_pn;
+ uncommitted_value = last->values[uncommitted_v];
+ dout(10) << "we learned an uncommitted value for " << uncommitted_v
+ << " pn " << uncommitted_pn
+ << " " << uncommitted_value.length() << " bytes"
+ << dendl;
+ } else {
+ dout(10) << "ignoring uncommitted value for " << (last->last_committed+1)
+ << " pn " << last->uncommitted_pn
+ << " " << last->values[last->last_committed+1].length() << " bytes"
+ << dendl;
+ }
+ }
+
+ // is that everyone?
+ if (num_last == mon.get_quorum().size()) {
+ // cancel timeout event
+ mon.timer.cancel_event(collect_timeout_event);
+ collect_timeout_event = 0;
+ peer_first_committed.clear();
+ peer_last_committed.clear();
+
+ // almost...
+
+ // did we learn an old value?
+ if (uncommitted_v == last_committed+1 &&
+ uncommitted_value.length()) {
+ dout(10) << "that's everyone. begin on old learned value" << dendl;
+ state = STATE_UPDATING_PREVIOUS;
+ begin(uncommitted_value);
+ } else {
+ // active!
+ dout(10) << "that's everyone. active!" << dendl;
+ extend_lease();
+
+ // we refresh right here, so skip the refresh at the end
+ need_refresh = false;
+ if (do_refresh()) {
+ finish_round();
+ }
+ }
+ }
+ } else {
+ // no, this is an old message, discard
+ dout(10) << "old pn, ignoring" << dendl;
+ }
+
+ // store_state() applied something and nobody refreshed since
+ if (need_refresh)
+ (void)do_refresh();
+}
+
+// Invoked by the timer event armed in collect(): clears the event handle,
+// counts the timeout and bootstraps the monitor.
+void Paxos::collect_timeout()
+{
+ dout(1) << "collect timeout, calling fresh election" << dendl;
+ collect_timeout_event = 0;
+ logger->inc(l_paxos_collect_timeout);
+ ceph_assert(mon.is_leader());
+ mon.bootstrap();
+}
+
+
+// leader
+/**
+ * Leader: propose value @p v for version last_committed+1.
+ *
+ * Accepts the value ourselves, persists it together with pending_v and
+ * pending_pn, and then either commits right away (quorum of one) or sends
+ * OP_BEGIN to every other quorum member and arms the accept timeout.
+ *
+ * @param v encoded transaction to propose; copied into new_value
+ */
+void Paxos::begin(bufferlist& v)
+{
+ dout(10) << "begin for " << last_committed+1 << " "
+ << v.length() << " bytes"
+ << dendl;
+
+ ceph_assert(mon.is_leader());
+ ceph_assert(is_updating() || is_updating_previous());
+
+ // we must already have a majority for this to work.
+ ceph_assert(mon.get_quorum().size() == 1 ||
+ num_last > (unsigned)mon.monmap->size()/2);
+
+ // and no value, yet.
+ ceph_assert(new_value.length() == 0);
+
+ // accept it ourselves
+ accepted.clear();
+ accepted.insert(mon.rank);
+ new_value = v;
+
+ if (last_committed == 0) {
+ auto t(std::make_shared<MonitorDBStore::Transaction>());
+ // initial base case; set first_committed too
+ t->put(get_name(), "first_committed", 1);
+ decode_append_transaction(t, new_value);
+
+ // re-encode so that the proposed value itself carries first_committed
+ bufferlist tx_bl;
+ t->encode(tx_bl);
+
+ new_value = tx_bl;
+ }
+
+ // store the proposed value in the store. IF it is accepted, we will then
+ // have to decode it into a transaction and apply it.
+ auto t(std::make_shared<MonitorDBStore::Transaction>());
+ t->put(get_name(), last_committed+1, new_value);
+
+ // note which pn this pending value is for.
+ t->put(get_name(), "pending_v", last_committed + 1);
+ t->put(get_name(), "pending_pn", accepted_pn);
+
+ // everything up to dendl goes into the log entry opened by dout(30):
+ // first the dump of 't', then the dump of the transaction decoded from
+ // new_value after the "bl dump" header
+ dout(30) << __func__ << " transaction dump:\n";
+ JSONFormatter f(true);
+ t->dump(&f);
+ f.flush(*_dout);
+ auto debug_tx(std::make_shared<MonitorDBStore::Transaction>());
+ auto new_value_it = new_value.cbegin();
+ debug_tx->decode(new_value_it);
+ debug_tx->dump(&f);
+ *_dout << "\nbl dump:\n";
+ f.flush(*_dout);
+ *_dout << dendl;
+
+ logger->inc(l_paxos_begin);
+ logger->inc(l_paxos_begin_keys, t->get_keys());
+ logger->inc(l_paxos_begin_bytes, t->get_bytes());
+
+ auto start = ceph::coarse_mono_clock::now();
+ get_store()->apply_transaction(t);
+ auto end = ceph::coarse_mono_clock::now();
+
+ logger->tinc(l_paxos_begin_latency, to_timespan(end - start));
+
+ ceph_assert(g_conf()->paxos_kill_at != 3);
+
+ if (mon.get_quorum().size() == 1) {
+ // we're alone, take it easy
+ commit_start();
+ return;
+ }
+
+ // ask others to accept it too!
+ for (auto p = mon.get_quorum().begin();
+ p != mon.get_quorum().end();
+ ++p) {
+ if (*p == mon.rank) continue;
+
+ dout(10) << " sending begin to mon." << *p << dendl;
+ MMonPaxos *begin = new MMonPaxos(mon.get_epoch(), MMonPaxos::OP_BEGIN,
+ ceph_clock_now());
+ begin->values[last_committed+1] = new_value;
+ begin->last_committed = last_committed;
+ begin->pn = accepted_pn;
+
+ mon.send_mon_message(begin, *p);
+ }
+
+ // set timeout event
+ // (the callback does nothing when it is invoked with -ECANCELED)
+ accept_timeout_event = mon.timer.add_event_after(
+ g_conf()->mon_accept_timeout_factor * g_conf()->mon_lease,
+ new C_MonContext{&mon, [this](int r) {
+ if (r == -ECANCELED)
+ return;
+ accept_timeout();
+ }});
+}
+
+// peon
+/**
+ * Peon: handle an OP_BEGIN from the leader.
+ *
+ * Ignores the message if its pn is lower than the one we accepted.
+ * Otherwise (pn and last_committed are asserted to match ours) switches to
+ * STATE_UPDATING, cancels the lease, persists the proposed value for
+ * last_committed+1 together with pending_v/pending_pn, and replies with
+ * OP_ACCEPT on the connection the message arrived on.
+ *
+ * @param op request wrapping the MMonPaxos begin message
+ */
+void Paxos::handle_begin(MonOpRequestRef op)
+{
+  op->mark_paxos_event("handle_begin");
+  auto begin = op->get_req<MMonPaxos>();
+  dout(10) << "handle_begin " << *begin << dendl;
+
+  // can we accept this?
+  if (begin->pn < accepted_pn) {
+    dout(10) << " we accepted a higher pn " << accepted_pn << ", ignoring" << dendl;
+    op->mark_paxos_event("have higher pn, ignore");
+    return;
+  }
+  ceph_assert(begin->pn == accepted_pn);
+  ceph_assert(begin->last_committed == last_committed);
+
+  ceph_assert(g_conf()->paxos_kill_at != 4);
+
+  logger->inc(l_paxos_begin);
+
+  // set state.
+  state = STATE_UPDATING;
+  lease_expire = {};  // cancel lease
+
+  // yes.
+  version_t v = last_committed+1;
+  dout(10) << "accepting value for " << v << " pn " << accepted_pn << dendl;
+  // store the accepted value onto our store. We will have to decode it and
+  // apply its transaction once we receive permission to commit.
+  auto t(std::make_shared<MonitorDBStore::Transaction>());
+  t->put(get_name(), v, begin->values[v]);
+
+  // note which pn this pending value is for.
+  t->put(get_name(), "pending_v", v);
+  t->put(get_name(), "pending_pn", accepted_pn);
+
+  // the JSON dump below is flushed into the stream opened by dout(30) and
+  // the entry is closed by the dendl that follows
+  dout(30) << __func__ << " transaction dump:\n";
+  JSONFormatter f(true);
+  t->dump(&f);
+  f.flush(*_dout);
+  *_dout << dendl;
+
+  // account keys as well as bytes, as the leader's begin() does, so both
+  // "begin" averages are fed on peons too
+  logger->inc(l_paxos_begin_keys, t->get_keys());
+  logger->inc(l_paxos_begin_bytes, t->get_bytes());
+
+  auto start = ceph::coarse_mono_clock::now();
+  get_store()->apply_transaction(t);
+  auto end = ceph::coarse_mono_clock::now();
+
+  logger->tinc(l_paxos_begin_latency, to_timespan(end - start));
+
+  ceph_assert(g_conf()->paxos_kill_at != 5);
+
+  // reply
+  MMonPaxos *accept = new MMonPaxos(mon.get_epoch(), MMonPaxos::OP_ACCEPT,
+                                    ceph_clock_now());
+  accept->pn = accepted_pn;
+  accept->last_committed = last_committed;
+  begin->get_connection()->send_message(accept);
+}
+
+// leader
+/**
+ * Leader: handle an OP_ACCEPT reply to our begin.
+ *
+ * Ignores replies carrying a different pn or coming from an older round.
+ * Otherwise records the sender in `accepted` and starts the commit once the
+ * accepted set equals the quorum.
+ *
+ * @param op request wrapping the MMonPaxos accept message
+ */
+void Paxos::handle_accept(MonOpRequestRef op)
+{
+ op->mark_paxos_event("handle_accept");
+ auto accept = op->get_req<MMonPaxos>();
+ dout(10) << "handle_accept " << *accept << dendl;
+ int from = accept->get_source().num();
+
+ if (accept->pn != accepted_pn) {
+ // we accepted a higher pn, from some other leader
+ dout(10) << " we accepted a higher pn " << accepted_pn << ", ignoring" << dendl;
+ op->mark_paxos_event("have higher pn, ignore");
+ return;
+ }
+ // the last_committed > 0 guard keeps last_committed-1 from wrapping
+ if (last_committed > 0 &&
+ accept->last_committed < last_committed-1) {
+ dout(10) << " this is from an old round, ignoring" << dendl;
+ op->mark_paxos_event("old round, ignore");
+ return;
+ }
+ ceph_assert(accept->last_committed == last_committed || // not committed
+ accept->last_committed == last_committed-1); // committed
+
+ ceph_assert(is_updating() || is_updating_previous());
+ // each quorum member may accept only once per round
+ ceph_assert(accepted.count(from) == 0);
+ accepted.insert(from);
+ dout(10) << " now " << accepted << " have accepted" << dendl;
+
+ ceph_assert(g_conf()->paxos_kill_at != 6);
+
+ // only commit (and expose committed state) when we get *all* quorum
+ // members to accept. otherwise, they may still be sharing the now
+ // stale state.
+ // FIXME: we can improve this with an additional lease revocation message
+ // that doesn't block for the persist.
+ if (accepted == mon.get_quorum()) {
+ // yay, commit!
+ // (despite the "majority" wording in the log line, the condition above
+ // requires the whole quorum)
+ dout(10) << " got majority, committing, done with update" << dendl;
+ op->mark_paxos_event("commit_start");
+ commit_start();
+ }
+}
+
+// Invoked by the timer event armed in begin(): clears the event handle,
+// counts the timeout and bootstraps the monitor. Must be the leader and in
+// one of the updating/writing states.
+void Paxos::accept_timeout()
+{
+ dout(1) << "accept timeout, calling fresh election" << dendl;
+ accept_timeout_event = 0;
+ ceph_assert(mon.is_leader());
+ ceph_assert(is_updating() || is_updating_previous() || is_writing() ||
+ is_writing_previous());
+ logger->inc(l_paxos_accept_timeout);
+ mon.bootstrap();
+}
+
+// Completion context queued with the commit transaction in commit_start().
+// Under the monitor lock it finishes the commit, or merely drops the
+// in-flight commit count if paxos has been shut down in the meantime.
+struct C_Committed : public Context {
+  Paxos *paxos;
+
+  explicit C_Committed(Paxos *p)
+    : paxos(p)
+  {}
+
+  void finish(int r) override
+  {
+    ceph_assert(r >= 0);
+    std::lock_guard l(paxos->mon.lock);
+    if (!paxos->is_shutdown()) {
+      paxos->commit_finish();
+    } else {
+      paxos->abort_commit();
+    }
+  }
+};
+
+// Account for a commit whose completion will not be processed: decrement
+// the in-flight commit count and wake every waiter on shutdown_cond once it
+// reaches zero.
+void Paxos::abort_commit()
+{
+  ceph_assert(commits_started > 0);
+  if (--commits_started == 0) {
+    shutdown_cond.notify_all();
+  }
+}
+
+/**
+ * Leader: queue the commit of new_value as version last_committed+1.
+ *
+ * Builds a transaction that bumps the stored last_committed and contains
+ * the decoded contents of new_value, queues it with a C_Committed
+ * completion, moves to the matching WRITING state, and cancels the accept
+ * timeout when there are other quorum members.
+ */
+void Paxos::commit_start()
+{
+ dout(10) << __func__ << " " << (last_committed+1) << dendl;
+
+ ceph_assert(g_conf()->paxos_kill_at != 7);
+
+ auto t(std::make_shared<MonitorDBStore::Transaction>());
+
+ // commit locally
+ t->put(get_name(), "last_committed", last_committed + 1);
+
+ // decode the value and apply its transaction to the store.
+ // this value can now be read from last_committed.
+ decode_append_transaction(t, new_value);
+
+ // the JSON dump below is flushed into the stream opened by dout(30) and
+ // the entry is closed by the dendl that follows
+ dout(30) << __func__ << " transaction dump:\n";
+ JSONFormatter f(true);
+ t->dump(&f);
+ f.flush(*_dout);
+ *_dout << dendl;
+
+ logger->inc(l_paxos_commit);
+ logger->inc(l_paxos_commit_keys, t->get_keys());
+ logger->inc(l_paxos_commit_bytes, t->get_bytes());
+ // commit_finish() measures the commit latency from this stamp
+ commit_start_stamp = ceph_clock_now();
+
+ get_store()->queue_transaction(t, new C_Committed(this));
+
+ if (is_updating_previous())
+ state = STATE_WRITING_PREVIOUS;
+ else if (is_updating())
+ state = STATE_WRITING;
+ else
+ ceph_abort();
+ // balanced by commit_finish() or abort_commit()
+ ++commits_started;
+
+ if (mon.get_quorum().size() > 1) {
+ // cancel timeout event
+ mon.timer.cancel_event(accept_timeout_event);
+ accept_timeout_event = 0;
+ }
+}
+
+/**
+ * Leader: complete a commit started by commit_start().
+ *
+ * Called from C_Committed::finish() once the queued transaction is done.
+ * Advances last_committed, re-reads first_committed, sends OP_COMMIT with
+ * the new value to every other quorum member, clears new_value, moves to
+ * STATE_REFRESH and, if the refresh does not ask for a bootstrap, finishes
+ * the proposal, extends the lease (multi-member quorum only) and finishes
+ * the round.
+ */
+void Paxos::commit_finish()
+{
+ dout(20) << __func__ << " " << (last_committed+1) << dendl;
+ utime_t end = ceph_clock_now();
+ logger->tinc(l_paxos_commit_latency, end - commit_start_stamp);
+
+ ceph_assert(g_conf()->paxos_kill_at != 8);
+
+ // cancel lease - it was for the old value.
+ // (this would only happen if message layer lost the 'begin', but
+ // leader still got a majority and committed without us.)
+ lease_expire = {}; // cancel lease
+
+ last_committed++;
+ last_commit_time = ceph_clock_now();
+
+ // refresh first_committed; this txn may have trimmed.
+ first_committed = get_store()->get(get_name(), "first_committed");
+
+ _sanity_check_store();
+
+ // tell everyone
+ for (auto p = mon.get_quorum().begin();
+ p != mon.get_quorum().end();
+ ++p) {
+ if (*p == mon.rank) continue;
+
+ dout(10) << " sending commit to mon." << *p << dendl;
+ MMonPaxos *commit = new MMonPaxos(mon.get_epoch(), MMonPaxos::OP_COMMIT,
+ ceph_clock_now());
+ commit->values[last_committed] = new_value;
+ commit->pn = accepted_pn;
+ commit->last_committed = last_committed;
+
+ mon.send_mon_message(commit, *p);
+ }
+
+ ceph_assert(g_conf()->paxos_kill_at != 9);
+
+ // get ready for a new round.
+ new_value.clear();
+
+ // WRITING -> REFRESH
+ // among other things, this lets do_refresh() -> mon.bootstrap() ->
+ // wait_for_paxos_write() know that it doesn't need to flush the store
+ // queue. and it should not, as we are in the async completion thread now!
+ ceph_assert(is_writing() || is_writing_previous());
+ state = STATE_REFRESH;
+ ceph_assert(commits_started > 0);
+ --commits_started;
+
+ // do_refresh() returns false when it had to bootstrap instead
+ if (do_refresh()) {
+ commit_proposal();
+ if (mon.get_quorum().size() > 1) {
+ extend_lease();
+ }
+
+ ceph_assert(g_conf()->paxos_kill_at != 10);
+
+ finish_round();
+ }
+}
+
+
+/**
+ * Peon: handle an OP_COMMIT from the leader by storing the shared values
+ * and refreshing from paxos.
+ *
+ * @param op request wrapping the MMonPaxos commit message
+ */
+void Paxos::handle_commit(MonOpRequestRef op)
+{
+ op->mark_paxos_event("handle_commit");
+ auto commit = op->get_req<MMonPaxos>();
+ dout(10) << "handle_commit on " << commit->last_committed << dendl;
+
+ logger->inc(l_paxos_commit);
+
+ if (!mon.is_peon()) {
+ // NOTE(review): the log line says "dropping", but ceph_abort() is called;
+ // the return after it is presumably never reached -- confirm.
+ dout(10) << "not a peon, dropping" << dendl;
+ ceph_abort();
+ return;
+ }
+
+ op->mark_paxos_event("store_state");
+ store_state(commit);
+
+ // the result (whether a bootstrap was needed) is deliberately ignored
+ (void)do_refresh();
+}
+
+/**
+ * Leader: extend the lease by mon_lease from now.
+ *
+ * Resets the set of lease acks to just ourselves, sends OP_LEASE to every
+ * other quorum member, arms the lease-ack timeout unless one is already
+ * pending, and schedules the next lease renewal.
+ */
+void Paxos::extend_lease()
+{
+ ceph_assert(mon.is_leader());
+ //assert(is_active());
+
+ lease_expire = ceph::real_clock::now();
+ lease_expire += ceph::make_timespan(g_conf()->mon_lease);
+ acked_lease.clear();
+ acked_lease.insert(mon.rank);
+
+ dout(7) << "extend_lease now+" << g_conf()->mon_lease
+ << " (" << lease_expire << ")" << dendl;
+
+ // bcast
+ for (auto p = mon.get_quorum().begin();
+ p != mon.get_quorum().end(); ++p) {
+
+ if (*p == mon.rank) continue;
+ MMonPaxos *lease = new MMonPaxos(mon.get_epoch(), MMonPaxos::OP_LEASE,
+ ceph_clock_now());
+ lease->last_committed = last_committed;
+ lease->lease_timestamp = utime_t{lease_expire};
+ lease->first_committed = first_committed;
+ mon.send_mon_message(lease, *p);
+ }
+
+ // set timeout event.
+ // if old timeout is still in place, leave it.
+ if (!lease_ack_timeout_event) {
+ lease_ack_timeout_event = mon.timer.add_event_after(
+ g_conf()->mon_lease_ack_timeout_factor * g_conf()->mon_lease,
+ new C_MonContext{&mon, [this](int r) {
+ if (r == -ECANCELED)
+ return;
+ lease_ack_timeout();
+ }});
+ }
+
+ // set renew event
+ // i.e. at (lease start) + mon_lease_renew_interval_factor * mon_lease
+ auto at = lease_expire;
+ at -= ceph::make_timespan(g_conf()->mon_lease);
+ at += ceph::make_timespan(g_conf()->mon_lease_renew_interval_factor *
+ g_conf()->mon_lease);
+ lease_renew_event = mon.timer.add_event_at(
+ at, new C_MonContext{&mon, [this](int r) {
+ if (r == -ECANCELED)
+ return;
+ lease_renew_timeout();
+ }});
+}
+
+/**
+ * Emit a cluster-log warning when a message timestamp lies in our future.
+ *
+ * Nothing happens unless @p t is ahead of the local clock by more than
+ * mon_clock_drift_allowed. Warnings are rate limited: a new one is issued
+ * only when the time since the previous warning exceeds
+ * mon_clock_drift_warn_backoff raised to the number of warnings so far.
+ *
+ * @param t    Timestamp carried by the message.
+ * @param from Sender of the message (used in the warning text).
+ */
+void Paxos::warn_on_future_time(utime_t t, entity_name_t from)
+{
+  utime_t now = ceph_clock_now();
+  if (!(t > now))
+    return;
+
+  utime_t ahead = t - now;
+  if (!(ahead > g_conf()->mon_clock_drift_allowed))
+    return;
+
+  utime_t since_last_warn = now - last_clock_drift_warn;
+  if (!(since_last_warn >
+        pow(g_conf()->mon_clock_drift_warn_backoff, clock_drift_warned)))
+    return;
+
+  mon.clog->warn() << "message from " << from << " was stamped " << ahead
+                   << "s in the future, clocks not synchronized";
+  last_clock_drift_warn = ceph_clock_now();
+  ++clock_drift_warned;
+}
+
+/**
+ * Ask the monitor to reload its state from paxos and record how long it took.
+ *
+ * @return false if the refresh requested a bootstrap (which is started here
+ *         before returning); true otherwise.
+ */
+bool Paxos::do_refresh()
+{
+  bool bootstrap_requested = false;
+
+  // make sure we have the latest state loaded up
+  const auto t0 = ceph::coarse_mono_clock::now();
+  mon.refresh_from_paxos(&bootstrap_requested);
+  const auto t1 = ceph::coarse_mono_clock::now();
+
+  logger->inc(l_paxos_refresh);
+  logger->tinc(l_paxos_refresh_latency, to_timespan(t1 - t0));
+
+  if (!bootstrap_requested)
+    return true;
+
+  dout(10) << " doing requested bootstrap" << dendl;
+  mon.bootstrap();
+  return false;
+}
+
+/**
+ * Complete the finishers attached to the proposal that just committed.
+ *
+ * Requires that we are the leader and in STATE_REFRESH (both asserted).
+ */
+void Paxos::commit_proposal()
+{
+  dout(10) << __func__ << dendl;
+  ceph_assert(mon.is_leader());
+  ceph_assert(state == STATE_REFRESH);
+
+  finish_contexts(g_ceph_context, committing_finishers);
+}
+
+/**
+ * Finish a paxos round on the leader and return to STATE_ACTIVE.
+ *
+ * Must be called on the leader (asserted). After switching to STATE_ACTIVE
+ * it completes, in this order, the waiting_for_active, waiting_for_readable
+ * and waiting_for_writeable callbacks, trims old versions if should_trim()
+ * says so, and finally proposes any pending transaction if we are still
+ * active at that point.
+ */
+void Paxos::finish_round()
+{
+  dout(10) << __func__ << dendl;
+  ceph_assert(mon.is_leader());
+
+  // ok, now go active!
+  state = STATE_ACTIVE;
+
+  // the log line names the list actually being finished (waiting_for_active)
+  dout(20) << __func__ << " waiting_for_active" << dendl;
+  finish_contexts(g_ceph_context, waiting_for_active);
+  dout(20) << __func__ << " waiting_for_readable" << dendl;
+  finish_contexts(g_ceph_context, waiting_for_readable);
+  dout(20) << __func__ << " waiting_for_writeable" << dendl;
+  finish_contexts(g_ceph_context, waiting_for_writeable);
+
+  dout(10) << __func__ << " done w/ waiters, state " << get_statename(state) << dendl;
+
+  if (should_trim()) {
+    trim();
+  }
+
+  // state is re-checked: presumably the waiters above may have moved us out
+  // of STATE_ACTIVE -- confirm against the callers' callbacks
+  if (is_active() && pending_proposal) {
+    propose_pending();
+  }
+}
+
+
+// peon
+/**
+ * Handle an OP_LEASE message (routed here by dispatch()).
+ *
+ * The message is ignored unless we are a peon and our last_committed equals
+ * the one in the message. Otherwise the lease expiry is moved forward (never
+ * backward), we switch to STATE_ACTIVE, reply with an OP_LEASE_ACK on the
+ * connection the lease arrived on, restart the lease timeout and wake up
+ * waiters.
+ *
+ * @param op Request wrapping the MMonPaxos lease message.
+ */
+void Paxos::handle_lease(MonOpRequestRef op)
+{
+ op->mark_paxos_event("handle_lease");
+ auto lease = op->get_req<MMonPaxos>();
+ // sanity
+ if (!mon.is_peon() ||
+ last_committed != lease->last_committed) {
+ dout(10) << "handle_lease i'm not a peon, or they're not the leader,"
+ << " or the last_committed doesn't match, dropping" << dendl;
+ op->mark_paxos_event("invalid lease, ignore");
+ return;
+ }
+
+ warn_on_future_time(lease->sent_timestamp, lease->get_source());
+
+ // extend lease
+ // only ever move the expiry forward; an older lease_timestamp is ignored
+ if (auto new_expire = lease->lease_timestamp.to_real_time();
+ lease_expire < new_expire) {
+ lease_expire = new_expire;
+
+ // a freshly received lease that is already expired is logged as an error
+ auto now = ceph::real_clock::now();
+ if (lease_expire < now) {
+ auto diff = now - lease_expire;
+ derr << "lease_expire from " << lease->get_source_inst() << " is " << diff << " seconds in the past; mons are probably laggy (or possibly clocks are too skewed)" << dendl;
+ }
+ }
+
+ state = STATE_ACTIVE;
+
+ dout(10) << "handle_lease on " << lease->last_committed
+ << " now " << lease_expire << dendl;
+
+ // ack
+ // carries our committed range, the current time and our encoded feature map
+ MMonPaxos *ack = new MMonPaxos(mon.get_epoch(), MMonPaxos::OP_LEASE_ACK,
+ ceph_clock_now());
+ ack->last_committed = last_committed;
+ ack->first_committed = first_committed;
+ ack->lease_timestamp = ceph_clock_now();
+ encode(mon.session_map.feature_map, ack->feature_map);
+ lease->get_connection()->send_message(ack);
+
+ // (re)set timeout event.
+ reset_lease_timeout();
+
+ // kick waiters
+ // readable waiters are only woken if is_readable() holds right now
+ finish_contexts(g_ceph_context, waiting_for_active);
+ if (is_readable())
+ finish_contexts(g_ceph_context, waiting_for_readable);
+}
+
+/**
+ * Handle an OP_LEASE_ACK message (routed here by dispatch()).
+ *
+ * Three cases, decided in this order:
+ *  - no lease-ack timeout is pending: the ack is treated as stray and only
+ *    logged;
+ *  - first ack from this sender: it is recorded in acked_lease, its feature
+ *    map (if any) is decoded into mon.quorum_feature_map, and once
+ *    acked_lease equals the quorum the lease-ack timeout is cancelled;
+ *  - sender already acked: logged as a duplicate.
+ * In every case the ack's sent_timestamp is checked for clock drift.
+ *
+ * @param op Request wrapping the MMonPaxos lease-ack message.
+ */
+void Paxos::handle_lease_ack(MonOpRequestRef op)
+{
+ op->mark_paxos_event("handle_lease_ack");
+ auto ack = op->get_req<MMonPaxos>();
+ int from = ack->get_source().num();
+
+ if (!lease_ack_timeout_event) {
+ dout(10) << "handle_lease_ack from " << ack->get_source()
+ << " -- stray (probably since revoked)" << dendl;
+
+ } else if (acked_lease.count(from) == 0) {
+ acked_lease.insert(from);
+ // an empty feature_map buffer means the sender supplied none
+ if (ack->feature_map.length()) {
+ auto p = ack->feature_map.cbegin();
+ FeatureMap& t = mon.quorum_feature_map[from];
+ decode(t, p);
+ }
+ if (acked_lease == mon.get_quorum()) {
+ // yay!
+ dout(10) << "handle_lease_ack from " << ack->get_source()
+ << " -- got everyone" << dendl;
+ mon.timer.cancel_event(lease_ack_timeout_event);
+ lease_ack_timeout_event = 0;
+
+
+ } else {
+ dout(10) << "handle_lease_ack from " << ack->get_source()
+ << " -- still need "
+ << mon.get_quorum().size() - acked_lease.size()
+ << " more" << dendl;
+ }
+ } else {
+ dout(10) << "handle_lease_ack from " << ack->get_source()
+ << " dup (lagging!), ignoring" << dendl;
+ }
+
+ warn_on_future_time(ack->sent_timestamp, ack->get_source());
+}
+
+/**
+ * Timer callback: the lease-ack timeout armed by extend_lease() fired.
+ *
+ * Requires that we are the leader and in STATE_ACTIVE (both asserted).
+ * Forgets the event handle and bootstraps the monitor.
+ */
+void Paxos::lease_ack_timeout()
+{
+  dout(1) << "lease_ack_timeout -- calling new election" << dendl;
+  ceph_assert(mon.is_leader());
+  ceph_assert(state == STATE_ACTIVE);
+  logger->inc(l_paxos_lease_ack_timeout);
+  lease_ack_timeout_event = nullptr;
+  mon.bootstrap();
+}
+
+/**
+ * (Re)arm the lease timeout.
+ *
+ * Any previously armed lease_timeout_event is cancelled first. The new event
+ * fires after mon_lease_ack_timeout_factor * mon_lease and calls
+ * lease_timeout() unless it was cancelled.
+ */
+void Paxos::reset_lease_timeout()
+{
+  dout(20) << "reset_lease_timeout - setting timeout event" << dendl;
+  if (lease_timeout_event != nullptr) {
+    mon.timer.cancel_event(lease_timeout_event);
+  }
+
+  auto *on_timeout = new C_MonContext{&mon, [this](int r) {
+      if (r != -ECANCELED)
+        lease_timeout();
+    }};
+  lease_timeout_event = mon.timer.add_event_after(
+    g_conf()->mon_lease_ack_timeout_factor * g_conf()->mon_lease,
+    on_timeout);
+}
+
+/**
+ * Timer callback: the lease timeout armed by reset_lease_timeout() fired.
+ *
+ * Requires that we are a peon (asserted). Forgets the event handle and
+ * bootstraps the monitor.
+ */
+void Paxos::lease_timeout()
+{
+  dout(1) << "lease_timeout -- calling new election" << dendl;
+  ceph_assert(mon.is_peon());
+  logger->inc(l_paxos_lease_timeout);
+  lease_timeout_event = nullptr;
+  mon.bootstrap();
+}
+
+/**
+ * Timer callback: the renew event scheduled by extend_lease() fired.
+ *
+ * Forgets the event handle and extends the lease again.
+ */
+void Paxos::lease_renew_timeout()
+{
+  lease_renew_event = nullptr;
+  extend_lease();
+}
+
+
+/*
+ * trim old states
+ *
+ * Queues, on the pending transaction, the removal of versions
+ * [first_committed, trim_to) and the new "first_committed" value, where
+ * trim_to is the smaller of (get_version() - paxos_min) and
+ * (get_first_committed() + paxos_trim_max). Optionally queues a compaction
+ * of the trimmed key range. Requires should_trim() (asserted).
+ */
+void Paxos::trim()
+{
+  ceph_assert(should_trim());
+  version_t trim_to = std::min(get_version() - g_conf()->paxos_min,
+                               get_first_committed() + g_conf()->paxos_trim_max);
+
+  // nothing below trim_to left to remove
+  if (!(first_committed < trim_to))
+    return;
+
+  dout(10) << "trim to " << trim_to << " (was " << first_committed << ")" << dendl;
+
+  MonitorDBStore::TransactionRef txn = get_pending_transaction();
+
+  version_t v = first_committed;
+  while (v < trim_to) {
+    dout(10) << "trim " << v << dendl;
+    txn->erase(get_name(), v);
+    ++v;
+  }
+  txn->put(get_name(), "first_committed", trim_to);
+
+  if (g_conf()->mon_compact_on_trim) {
+    dout(10) << " compacting trimmed range" << dendl;
+    txn->compact_range(get_name(), stringify(first_committed - 1), stringify(trim_to));
+  }
+
+  // flag stays set until the queued C_Trimmed finisher runs (see C_Trimmed)
+  trimming = true;
+  queue_pending_finisher(new C_Trimmed(this));
+}
+
+/*
+ * return a globally unique, monotonically increasing proposal number
+ *
+ * The result is the next multiple of 100 above max(last_pn, gt), plus our
+ * rank, and is persisted under the "last_pn" key before being returned.
+ * NOTE(review): uniqueness across monitors presumably relies on ranks being
+ * below 100 -- confirm.
+ */
+version_t Paxos::get_new_proposal_number(version_t gt)
+{
+  // never fall behind the floor supplied by the caller
+  last_pn = (last_pn < gt) ? gt : last_pn;
+
+  // update. make it unique among all monitors.
+  last_pn = (last_pn / 100 + 1) * 100 + static_cast<version_t>(mon.rank);
+
+  // write
+  auto txn = std::make_shared<MonitorDBStore::Transaction>();
+  txn->put(get_name(), "last_pn", last_pn);
+
+  dout(30) << __func__ << " transaction dump:\n";
+  JSONFormatter fmt(true);
+  txn->dump(&fmt);
+  fmt.flush(*_dout);
+  *_dout << dendl;
+
+  logger->inc(l_paxos_new_pn);
+
+  // time the synchronous store write
+  const auto t0 = ceph::coarse_mono_clock::now();
+  get_store()->apply_transaction(txn);
+  const auto t1 = ceph::coarse_mono_clock::now();
+
+  logger->tinc(l_paxos_new_pn_latency, to_timespan(t1 - t0));
+
+  dout(10) << "get_new_proposal_number = " << last_pn << dendl;
+  return last_pn;
+}
+
+
+/**
+ * Cancel every timer event this Paxos instance may have armed
+ * (collect, accept, lease renew, lease ack and lease timeouts) and clear
+ * the corresponding handles.
+ */
+void Paxos::cancel_events()
+{
+  // cancel the event behind `ev`, if armed, and forget the handle
+  auto disarm = [this](Context *&ev) {
+    if (ev) {
+      mon.timer.cancel_event(ev);
+      ev = 0;
+    }
+  };
+
+  disarm(collect_timeout_event);
+  disarm(accept_timeout_event);
+  disarm(lease_renew_event);
+  disarm(lease_ack_timeout_event);
+  disarm(lease_timeout_event);
+}
+
+/**
+ * Shut this Paxos instance down.
+ *
+ * Switches to STATE_SHUTDOWN, discards the pending proposal, waits until no
+ * commits are in flight, then completes every waiter and finisher list with
+ * -ECANCELED and unregisters the perf counters (if any).
+ *
+ * The use of std::adopt_lock below implies the caller already holds
+ * mon.lock; the lock is still held on return.
+ */
+void Paxos::shutdown()
+{
+ dout(10) << __func__ << " cancel all contexts" << dendl;
+
+ state = STATE_SHUTDOWN;
+
+ // discard pending transaction
+ pending_proposal.reset();
+
+ // Let store finish commits in progress
+ // XXX: I assume I can't use finish_contexts() because the store
+ // is going to trigger
+ // adopt the already-held mon.lock so it can be used with the condvar wait
+ unique_lock l{mon.lock, std::adopt_lock};
+ shutdown_cond.wait(l, [this] { return commits_started <= 0; });
+ // Monitor::shutdown() will unlock it
+ // release() detaches the mutex so l's destructor does not unlock it
+ l.release();
+
+ finish_contexts(g_ceph_context, waiting_for_writeable, -ECANCELED);
+ finish_contexts(g_ceph_context, waiting_for_readable, -ECANCELED);
+ finish_contexts(g_ceph_context, waiting_for_active, -ECANCELED);
+ finish_contexts(g_ceph_context, pending_finishers, -ECANCELED);
+ finish_contexts(g_ceph_context, committing_finishers, -ECANCELED);
+ if (logger)
+ g_ceph_context->get_perfcounters_collection()->remove(logger);
+}
+
+/**
+ * Reset Paxos state for a monitor that has become leader.
+ *
+ * Cancels all timer events, drops the in-flight value and the pending
+ * proposal, and completes pending/committing finishers with -EAGAIN.
+ * With a quorum of one we go straight to STATE_ACTIVE; otherwise we enter
+ * STATE_RECOVERING, clear the lease expiry and start a collect phase.
+ */
+void Paxos::leader_init()
+{
+ cancel_events();
+ new_value.clear();
+
+ // discard pending transaction
+ pending_proposal.reset();
+
+ reset_pending_committing_finishers();
+
+ logger->inc(l_paxos_start_leader);
+
+ // a single-member quorum skips recovery entirely
+ if (mon.get_quorum().size() == 1) {
+ state = STATE_ACTIVE;
+ return;
+ }
+
+ state = STATE_RECOVERING;
+ // value-initialized time_point, i.e. no lease is held
+ lease_expire = {};
+ dout(10) << "leader_init -- starting paxos recovery" << dendl;
+ collect(0);
+}
+
+/**
+ * Reset Paxos state for a monitor that has become a peon.
+ *
+ * Cancels all timer events, drops the in-flight value, enters
+ * STATE_RECOVERING with no lease, arms the lease timeout, discards the
+ * pending proposal and completes pending/committing finishers and
+ * waiting_for_writeable callbacks with -EAGAIN.
+ */
+void Paxos::peon_init()
+{
+ cancel_events();
+ new_value.clear();
+
+ state = STATE_RECOVERING;
+ // value-initialized time_point, i.e. no lease is held
+ lease_expire = {};
+ dout(10) << "peon_init -- i am a peon" << dendl;
+
+ // start a timer, in case the leader never manages to issue a lease
+ reset_lease_timeout();
+
+ // discard pending transaction
+ pending_proposal.reset();
+
+ // no chance to write now!
+ reset_pending_committing_finishers();
+ finish_contexts(g_ceph_context, waiting_for_writeable, -EAGAIN);
+
+ logger->inc(l_paxos_start_peon);
+}
+
+/**
+ * Restart Paxos: cancel timers, drop in-flight work and go back to
+ * STATE_RECOVERING.
+ *
+ * If a write is in progress (STATE_WRITING or STATE_WRITING_PREVIOUS) the
+ * store is flushed first, with mon.lock dropped for the duration of the
+ * flush. Pending/committing finishers and waiting_for_active callbacks are
+ * completed with -EAGAIN.
+ */
+void Paxos::restart()
+{
+ dout(10) << "restart -- canceling timeouts" << dendl;
+ cancel_events();
+ new_value.clear();
+
+ if (is_writing() || is_writing_previous()) {
+ dout(10) << __func__ << " flushing" << dendl;
+ // mon.lock is released around the flush and re-taken afterwards
+ mon.lock.unlock();
+ mon.store->flush();
+ mon.lock.lock();
+ dout(10) << __func__ << " flushed" << dendl;
+ }
+ state = STATE_RECOVERING;
+
+ // discard pending transaction
+ pending_proposal.reset();
+
+ reset_pending_committing_finishers();
+ finish_contexts(g_ceph_context, waiting_for_active, -EAGAIN);
+
+ logger->inc(l_paxos_restart);
+}
+
+/**
+ * Complete all committing and pending finishers with -EAGAIN.
+ *
+ * The pending finishers are first moved to the end of committing_finishers
+ * (leaving pending_finishers empty), so committing ones are triggered before
+ * pending ones.
+ */
+void Paxos::reset_pending_committing_finishers()
+{
+  auto tail = committing_finishers.end();
+  committing_finishers.splice(tail, pending_finishers);
+  finish_contexts(g_ceph_context, committing_finishers, -EAGAIN);
+}
+
+/**
+ * Entry point for incoming Paxos messages: route an MMonPaxos to the
+ * handler matching its op code.
+ *
+ * Aborts on a message that is not MSG_MON_PAXOS or that carries an unknown
+ * op. Messages are dropped while we are neither leader nor peon. A peon
+ * additionally asserts that the sender is the current leader.
+ *
+ * @param op Request wrapping the message; must be of paxos type (asserted).
+ */
+void Paxos::dispatch(MonOpRequestRef op)
+{
+  ceph_assert(op->is_type_paxos());
+  op->mark_paxos_event("dispatch");
+
+  if (op->get_req()->get_type() != MSG_MON_PAXOS) {
+    dout(0) << "Got unexpected message type " << op->get_req()->get_type()
+            << " in Paxos::dispatch, aborting!" << dendl;
+    ceph_abort();
+  }
+
+  auto *msg = op->get_req<MMonPaxos>();
+
+  // election in progress?
+  if (!(mon.is_leader() || mon.is_peon())) {
+    dout(5) << "election in progress, dropping " << *msg << dendl;
+    return;
+  }
+
+  // check sanity
+  ceph_assert(mon.is_leader() ||
+              (mon.is_peon() && msg->get_source().num() == mon.get_leader()));
+
+  // NOTE: these ops are defined in messages/MMonPaxos.h
+  switch (msg->op) {
+  // learner
+  case MMonPaxos::OP_COLLECT:
+    handle_collect(op);
+    break;
+  case MMonPaxos::OP_LAST:
+    handle_last(op);
+    break;
+  case MMonPaxos::OP_BEGIN:
+    handle_begin(op);
+    break;
+  case MMonPaxos::OP_ACCEPT:
+    handle_accept(op);
+    break;
+  case MMonPaxos::OP_COMMIT:
+    handle_commit(op);
+    break;
+  case MMonPaxos::OP_LEASE:
+    handle_lease(op);
+    break;
+  case MMonPaxos::OP_LEASE_ACK:
+    handle_lease_ack(op);
+    break;
+  default:
+    ceph_abort();
+  }
+}
+
+
+// -----------------
+// service interface
+
+// -- READ --
+
+/**
+ * Check whether version @p v can be read from this monitor right now.
+ *
+ * That requires v <= last_committed, that we are a peon or the leader, that
+ * we are in the active, updating or writing state, that at least one
+ * version has been committed, and that the lease is valid.
+ *
+ * @param v Version the caller wants to read.
+ * @return true if readable; the decision is also logged at level 5.
+ */
+bool Paxos::is_readable(version_t v)
+{
+  bool readable = false;
+  if (!(v > last_committed)) {
+    const bool in_quorum = mon.is_peon() || mon.is_leader();
+    readable = in_quorum &&
+      (is_active() || is_updating() || is_writing()) &&
+      last_committed > 0 &&
+      is_lease_valid(); // must have a value alone, or have lease
+  }
+  dout(5) << __func__ << " = " << static_cast<int>(readable)
+          << " - now=" << ceph_clock_now()
+          << " lease_expire=" << lease_expire
+          << " has v" << v << " lc " << last_committed
+          << dendl;
+  return readable;
+}
+
+/**
+ * Fetch version @p v of this Paxos' data from the store into @p bl.
+ *
+ * @param v  Version to fetch.
+ * @param bl Buffer handed to the store's get().
+ * @return true when the store's get() yields a non-zero (truthy) result,
+ *         false otherwise.
+ *
+ * NOTE(review): if the store's get() follows the 0-on-success convention,
+ * this return value is inverted -- confirm against MonitorDBStore::get().
+ */
+bool Paxos::read(version_t v, bufferlist &bl)
+{
+  if (get_store()->get(get_name(), v, bl))
+    return true;
+  return false;
+}
+
+/**
+ * Read the value stored for last_committed into @p bl.
+ *
+ * @param bl Buffer passed through to read().
+ * @return last_committed if read() reported success, 0 otherwise.
+ */
+version_t Paxos::read_current(bufferlist &bl)
+{
+  const bool ok = read(last_committed, bl);
+  if (!ok)
+    return 0;
+  return last_committed;
+}
+
+
+/**
+ * Check whether our lease is currently valid.
+ *
+ * @return true if the quorum has exactly one member, or if the current
+ *         real-clock time is before lease_expire.
+ */
+bool Paxos::is_lease_valid()
+{
+  // with a single-member quorum the lease is always considered valid
+  if (mon.get_quorum().size() == 1)
+    return true;
+  return ceph::real_clock::now() < lease_expire;
+}
+
+// -- WRITE --
+
+/**
+ * Check whether a write may be started right now.
+ *
+ * @return true only if we are the leader, in STATE_ACTIVE, and the lease is
+ *         valid.
+ */
+bool Paxos::is_writeable()
+{
+  if (!mon.is_leader())
+    return false;
+  if (!is_active())
+    return false;
+  return is_lease_valid();
+}
+
+/**
+ * Turn the pending transaction into a proposal and start a paxos round.
+ *
+ * Requires STATE_ACTIVE and a non-null pending_proposal (both asserted).
+ * Cancels all timer events, encodes the pending transaction, hands its
+ * finishers over to committing_finishers, switches to STATE_UPDATING and
+ * calls begin() with the encoded value.
+ */
+void Paxos::propose_pending()
+{
+ ceph_assert(is_active());
+ ceph_assert(pending_proposal);
+
+ cancel_events();
+
+ bufferlist bl;
+ pending_proposal->encode(bl);
+
+ dout(10) << __func__ << " " << (last_committed + 1)
+ << " " << bl.length() << " bytes" << dendl;
+ // the dout(30) statement is left open on purpose: the JSON dump is written
+ // to *_dout and the entry is terminated by the dendl below
+ dout(30) << __func__ << " transaction dump:\n";
+ JSONFormatter f(true);
+ pending_proposal->dump(&f);
+ f.flush(*_dout);
+ *_dout << dendl;
+
+ // the encoded copy in bl is all we need from here on
+ pending_proposal.reset();
+
+ // finishers queued for the pending transaction now wait for this commit
+ committing_finishers.swap(pending_finishers);
+ state = STATE_UPDATING;
+ begin(bl);
+}
+
+/**
+ * Append a callback to the finishers of the pending transaction.
+ *
+ * @param onfinished Callback to queue; must not be null (asserted).
+ */
+void Paxos::queue_pending_finisher(Context *onfinished)
+{
+  dout(5) << __func__ << " " << onfinished << dendl;
+  ceph_assert(onfinished);
+  pending_finishers.emplace_back(onfinished);
+}
+
+/**
+ * Return the transaction currently being built for the next proposal,
+ * creating an empty one on first use.
+ *
+ * Must be called on the leader (asserted). When a new transaction is
+ * created, pending_finishers is asserted to be empty.
+ *
+ * @return The pending proposal transaction.
+ */
+MonitorDBStore::TransactionRef Paxos::get_pending_transaction()
+{
+  ceph_assert(mon.is_leader());
+  if (pending_proposal)
+    return pending_proposal;
+
+  pending_proposal.reset(new MonitorDBStore::Transaction);
+  ceph_assert(pending_finishers.empty());
+  return pending_proposal;
+}
+
+/**
+ * Propose the pending transaction now, if allowed.
+ *
+ * Nothing is proposed while `plugged` is set or while we are not in
+ * STATE_ACTIVE.
+ *
+ * @return true if propose_pending() was called, false otherwise.
+ */
+bool Paxos::trigger_propose()
+{
+  if (plugged) {
+    dout(10) << __func__ << " plugged, not proposing now" << dendl;
+    return false;
+  }
+  if (!is_active()) {
+    dout(10) << __func__ << " not active, will propose later" << dendl;
+    return false;
+  }
+
+  dout(10) << __func__ << " active, proposing now" << dendl;
+  propose_pending();
+  return true;
+}
+
+/**
+ * Sanity check on the committed range.
+ *
+ * @return true if first_committed does not exceed last_committed.
+ */
+bool Paxos::is_consistent()
+{
+  return !(first_committed > last_committed);
+}
+
diff --git a/src/mon/Paxos.h b/src/mon/Paxos.h
new file mode 100644
index 000000000..c197f26f7
--- /dev/null
+++ b/src/mon/Paxos.h
@@ -0,0 +1,1384 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+/*
+time---->
+
+cccccccccccccccccca????????????????????????????????????????
+cccccccccccccccccca????????????????????????????????????????
+cccccccccccccccccca???????????????????????????????????????? leader
+cccccccccccccccccc?????????????????????????????????????????
+ccccc??????????????????????????????????????????????????????
+
+last_committed
+
+pn_from
+pn
+
+a 12v
+b 12v
+c 14v
+d
+e 12v
+*/
+
+/**
+ * Paxos storage layout and behavior
+ *
+ * Currently, we use a key/value store to hold all the Paxos-related data, but
+ * it can logically be depicted as this:
+ *
+ * paxos:
+ * first_committed -> 1
+ * last_committed -> 4
+ * 1 -> value_1
+ * 2 -> value_2
+ * 3 -> value_3
+ * 4 -> value_4
+ *
+ * Since we are relying on a k/v store supporting atomic transactions, we can
+ * guarantee that if 'last_committed' has a value of '4', then we have up to
+ * version 4 on the store, and no more than that; the same applies to
+ * 'first_committed', which, when holding '1', strictly means that our lowest
+ * version is 1.
+ *
+ * Each version's value (value_1, value_2, ..., value_n) is a blob of data,
+ * incomprehensible to the Paxos. These values are proposed to the Paxos on
+ * propose_new_value() and each one is a transaction encoded in a ceph::buffer::list.
+ *
+ * The Paxos will write the value to disk, associating it with its version,
+ * but will take a step further: the value shall be decoded, and the operations
+ * on that transaction shall be applied during the same transaction that will
+ * write the value's encoded ceph::buffer::list to disk. This behavior ensures that
+ * whatever is being proposed will only be available on the store when it is
+ * applied by Paxos, which will then be aware of such new values, guaranteeing
+ * the store state is always consistent without requiring shady workarounds.
+ *
+ * So, let's say that FooMonitor proposes the following transaction, neatly
+ * encoded on a ceph::buffer::list of course:
+ *
+ * Tx_Foo
+ * put(foo, last_committed, 3)
+ * put(foo, 3, foo_value_3)
+ * erase(foo, 2)
+ * erase(foo, 1)
+ * put(foo, first_committed, 3)
+ *
+ * And knowing that Tx_Foo is proposed to the Paxos as a ceph::buffer::list, once it is
+ * ready to commit, and assuming we are now committing version 5 of the Paxos,
+ * we will do something along the lines of:
+ *
+ * Tx proposed_tx;
+ * proposed_tx.decode(Tx_foo_ceph::buffer::list);
+ *
+ * Tx our_tx;
+ * our_tx.put(paxos, last_committed, 5);
+ * our_tx.put(paxos, 5, Tx_foo_ceph::buffer::list);
+ * our_tx.append(proposed_tx);
+ *
+ * store_apply(our_tx);
+ *
+ * And the store should look like this after we apply 'our_tx':
+ *
+ * paxos:
+ * first_committed -> 1
+ * last_committed -> 5
+ * 1 -> value_1
+ * 2 -> value_2
+ * 3 -> value_3
+ * 4 -> value_4
+ * 5 -> Tx_foo_ceph::buffer::list
+ * foo:
+ * first_committed -> 3
+ * last_committed -> 3
+ * 3 -> foo_value_3
+ *
+ */
+
+#ifndef CEPH_MON_PAXOS_H
+#define CEPH_MON_PAXOS_H
+
+#include "include/types.h"
+#include "mon_types.h"
+#include "include/buffer.h"
+#include "msg/msg_types.h"
+#include "include/Context.h"
+#include "common/perf_counters.h"
+#include <errno.h>
+
+#include "MonitorDBStore.h"
+#include "mon/MonOpRequest.h"
+
+class Monitor;
+class MMonPaxos;
+
+// Identifiers of the Paxos perf counters updated through `logger`
+// (e.g. logger->inc(l_paxos_commit), logger->tinc(l_paxos_refresh_latency, ...)).
+// Values are consecutive, starting at l_paxos_first = 45800; l_paxos_first
+// and l_paxos_last bracket the range and are not counters themselves --
+// presumably they serve as bounds when the counters are registered (confirm
+// in init_logger()).
+enum {
+ l_paxos_first = 45800,
+ l_paxos_start_leader,
+ l_paxos_start_peon,
+ l_paxos_restart,
+ l_paxos_refresh,
+ l_paxos_refresh_latency,
+ l_paxos_begin,
+ l_paxos_begin_keys,
+ l_paxos_begin_bytes,
+ l_paxos_begin_latency,
+ l_paxos_commit,
+ l_paxos_commit_keys,
+ l_paxos_commit_bytes,
+ l_paxos_commit_latency,
+ l_paxos_collect,
+ l_paxos_collect_keys,
+ l_paxos_collect_bytes,
+ l_paxos_collect_latency,
+ l_paxos_collect_uncommitted,
+ l_paxos_collect_timeout,
+ l_paxos_accept_timeout,
+ l_paxos_lease_ack_timeout,
+ l_paxos_lease_timeout,
+ l_paxos_store_state,
+ l_paxos_store_state_keys,
+ l_paxos_store_state_bytes,
+ l_paxos_store_state_latency,
+ l_paxos_share_state,
+ l_paxos_share_state_keys,
+ l_paxos_share_state_bytes,
+ l_paxos_new_pn,
+ l_paxos_new_pn_latency,
+ l_paxos_last,
+};
+
+
+// i am one state machine.
+/**
+ * This library is based on the Paxos algorithm, but varies in a few key ways:
+ * 1- Only a single new value is generated at a time, simplifying the recovery logic.
+ * 2- Nodes track "committed" values, and share them generously (and trustingly)
+ * 3- A 'leasing' mechanism is built-in, allowing nodes to determine when it is
+ * safe to "read" their copy of the last committed value.
+ *
+ * This provides a simple replication substrate that services can be built on top of.
+ * See PaxosService.h
+ */
+class Paxos {
+ /**
+ * @defgroup Paxos_h_class Paxos
+ * @{
+ */
+ /**
+ * The Monitor with which this Paxos instance is associated.
+ */
+ Monitor &mon;
+
+ /// perf counter for internal instrumentations
+ PerfCounters *logger;
+
+ void init_logger();
+
+ // my state machine info
+ const std::string paxos_name;
+
+ friend class Monitor;
+ friend class PaxosService;
+
+ std::list<std::string> extra_state_dirs;
+
+ // LEADER+PEON
+
+ // -- generic state --
+public:
+ /**
+ * @defgroup Paxos_h_states States on which the leader/peon may be.
+ * @{
+ */
+ enum {
+ /**
+ * Leader/Peon is in Paxos' Recovery state
+ */
+ STATE_RECOVERING,
+ /**
+ * Leader/Peon is idle, and the Peon may or may not have a valid lease.
+ */
+ STATE_ACTIVE,
+ /**
+ * Leader/Peon is updating to a new value.
+ */
+ STATE_UPDATING,
+ /*
+ * Leader proposing an old value
+ */
+ STATE_UPDATING_PREVIOUS,
+ /*
+ * Leader/Peon is writing a new commit. readable, but not
+ * writeable.
+ */
+ STATE_WRITING,
+ /*
+ * Leader/Peon is writing a new commit from a previous round.
+ */
+ STATE_WRITING_PREVIOUS,
+ // leader: refresh following a commit
+ STATE_REFRESH,
+ // Shutdown after WRITING or WRITING_PREVIOUS
+ STATE_SHUTDOWN
+ };
+
+ /**
+ * Obtain state name from constant value.
+ *
+ * @note This function will raise a fatal error if @p s is not
+ * a valid state value.
+ *
+ * @param s State value.
+ * @return The state's name.
+ */
+ static const std::string get_statename(int s) {
+ switch (s) {
+ case STATE_RECOVERING:
+ return "recovering";
+ case STATE_ACTIVE:
+ return "active";
+ case STATE_UPDATING:
+ return "updating";
+ case STATE_UPDATING_PREVIOUS:
+ return "updating-previous";
+ case STATE_WRITING:
+ return "writing";
+ case STATE_WRITING_PREVIOUS:
+ return "writing-previous";
+ case STATE_REFRESH:
+ return "refresh";
+ case STATE_SHUTDOWN:
+ return "shutdown";
+ default:
+ return "UNKNOWN";
+ }
+ }
+
+private:
+ /**
+ * The state we are in.
+ */
+ int state;
+ /**
+ * @}
+ */
+ int commits_started = 0;
+
+ ceph::condition_variable shutdown_cond;
+
+public:
+ /**
+ * Check if we are recovering.
+ *
+ * @return 'true' if we are on the Recovering state; 'false' otherwise.
+ */
+ bool is_recovering() const { return (state == STATE_RECOVERING); }
+ /**
+ * Check if we are active.
+ *
+ * @return 'true' if we are on the Active state; 'false' otherwise.
+ */
+ bool is_active() const { return state == STATE_ACTIVE; }
+ /**
+ * Check if we are updating.
+ *
+ * @return 'true' if we are on the Updating state; 'false' otherwise.
+ */
+ bool is_updating() const { return state == STATE_UPDATING; }
+
+ /**
+ * Check if we are updating/proposing a previous value from a
+ * previous quorum
+ */
+ bool is_updating_previous() const { return state == STATE_UPDATING_PREVIOUS; }
+
+ /// @return 'true' if we are writing an update to disk
+ bool is_writing() const { return state == STATE_WRITING; }
+
+ /// @return 'true' if we are writing an update-previous to disk
+ bool is_writing_previous() const { return state == STATE_WRITING_PREVIOUS; }
+
+ /// @return 'true' if we are refreshing an update just committed
+ bool is_refresh() const { return state == STATE_REFRESH; }
+
+ /// @return 'true' if we are in the process of shutting down
+ bool is_shutdown() const { return state == STATE_SHUTDOWN; }
+
+private:
+ /**
+ * @defgroup Paxos_h_recovery_vars Common recovery-related member variables
+ * @note These variables are common to both the Leader and the Peons.
+ * @{
+ */
+ /**
+ *
+ */
+ version_t first_committed;
+ /**
+ * Last Proposal Number
+ *
+ * @todo Expand description
+ */
+ version_t last_pn;
+ /**
+ * Last committed value's version.
+ *
+ * On both the Leader and the Peons, this is the last value's version that
+ * was accepted by a given quorum and thus committed, that this instance
+ * knows about.
+ *
+ * @note It may not be the last committed value's version throughout the
+ * system. If we are a Peon, we may have not been part of the quorum
+ * that accepted the value, and for this very same reason we may still
+ * be a (couple of) version(s) behind, until we learn about the most
+ * recent version. This should only happen if we are not active (i.e.,
+ * part of the quorum), which should not happen if we are up, running
+ * and able to communicate with others -- thus able to be part of the
+ * monmap and trigger new elections.
+ */
+ version_t last_committed;
+ /**
+ * Last committed value's time.
+ *
+ * When the commit finished.
+ */
+ utime_t last_commit_time;
+ /**
+ * The last Proposal Number we have accepted.
+ *
+ * On the Leader, it will be the Proposal Number picked by the Leader
+ * itself. On the Peon, however, it will be the proposal sent by the Leader
+ * and it will only be updated if its value is higher than the one
+ * already known by the Peon.
+ */
+ version_t accepted_pn;
+ /**
+ * The last_committed epoch of the leader at the time we accepted the last pn.
+ *
+ * This has NO SEMANTIC MEANING, and is there only for the debug output.
+ */
+ version_t accepted_pn_from;
+ /**
+ * Map holding the first committed version by each quorum member.
+ *
+ * The versions kept in this map are updated during the collect phase.
+ * When the Leader starts the collect phase, each Peon will reply with its
+ * first committed version, which will then be kept in this map.
+ */
+ std::map<int,version_t> peer_first_committed;
+ /**
+ * Map holding the last committed version by each quorum member.
+ *
+ * The versions kept in this map are updated during the collect phase.
+ * When the Leader starts the collect phase, each Peon will reply with its
+ * last committed version, which will then be kept in this map.
+ */
+ std::map<int,version_t> peer_last_committed;
+ /**
+ * @}
+ */
+
+ // active (phase 2)
+ /**
+ * @defgroup Paxos_h_active_vars Common active-related member variables
+ * @{
+ */
+ /**
+ * When our read lease expires.
+ *
+ * Instead of performing a full commit each time a read is requested, we
+ * keep leases. Each lease will have an expiration date, which may or may
+ * not be extended.
+ */
+ ceph::real_clock::time_point lease_expire;
+ /**
+ * List of callbacks waiting for our state to change into STATE_ACTIVE.
+ */
+ std::list<Context*> waiting_for_active;
+ /**
+ * List of callbacks waiting for the chance to read a version from us.
+ *
+ * Each entry on the list may result from an attempt to read a version that
+ * wasn't available at the time, or an attempt made during a period during
+ * which we could not satisfy the read request. The first case happens if
+ * the requested version is greater than our last committed version. The
+ * second scenario may happen if we are recovering, or if we don't have a
+ * valid lease.
+ *
+ * The list will be woken up once we change to STATE_ACTIVE with an extended
+ * lease -- which can be achieved if we have everyone on the quorum on board
+ * with the latest proposal, or if we don't really care about the remaining
+ * uncommitted values --, or if we're on a quorum of one.
+ */
+ std::list<Context*> waiting_for_readable;
+ /**
+ * @}
+ */
+
+ // -- leader --
+ // recovery (paxos phase 1)
+ /**
+ * @defgroup Paxos_h_leader_recovery Leader-specific Recovery-related vars
+ * @{
+ */
+ /**
+ * Number of replies to the collect phase we've received so far.
+ *
+ * This variable is reset to 1 each time we start a collect phase; it is
+ * incremented each time we receive a reply to the collect message, and
+ * is used to determine whether or not we have received replies from the
+ * whole quorum.
+ */
+ unsigned num_last;
+ /**
+ * Uncommitted value's version.
+ *
+ * If we have, or end up knowing about, an uncommitted value, then its
+ * version will be kept in this variable.
+ *
+ * @note If this version equals @p last_committed+1 when we reach the final
+ * steps of recovery, then the algorithm will assume this is a value
+ * the Leader does not know about, and trustingly the Leader will
+ * propose this version's value.
+ */
+ version_t uncommitted_v;
+ /**
+ * Uncommitted value's Proposal Number.
+ *
+ * We use this variable to assess if the Leader should take into consideration
+ * an uncommitted value sent by a Peon. Given that the Peon will send back to
+ * the Leader the last Proposal Number it accepted, the Leader will be able
+ * to infer if this value is more recent than the one the Leader has, thus
+ * more relevant.
+ */
+ version_t uncommitted_pn;
+ /**
+ * Uncommitted Value.
+ *
+ * If the system fails in-between the accept replies from the Peons and the
+ * instruction to commit from the Leader, then we may end up with accepted
+ * but yet-uncommitted values. During the Leader's recovery, it will attempt
+ * to bring the whole system to the latest state, and that means committing
+ * past accepted but uncommitted values.
+ *
+ * This variable will hold an uncommitted value, which may originate either
+ * on the Leader, or learnt by the Leader from a Peon during the collect
+ * phase.
+ */
+ ceph::buffer::list uncommitted_value;
+ /**
+ * Used to specify when an on-going collect phase times out.
+ */
+ Context *collect_timeout_event;
+ /**
+ * @}
+ */
+
+ // active
+ /**
+ * @defgroup Paxos_h_leader_active Leader-specific Active-related vars
+ * @{
+ */
+ /**
+ * Set of participants (Leader & Peons) that have acked a lease extension.
+ *
+ * Each Peon that acknowledges a lease extension will have its place in this
+ * set, which will be used to account for all the acks from all the quorum
+ * members, guaranteeing that we trigger new elections if some don't ack in
+ * the expected timeframe.
+ */
+ std::set<int> acked_lease;
+ /**
+ * Callback responsible for extending the lease periodically.
+ */
+ Context *lease_renew_event;
+ /**
+ * Callback to trigger new elections once the time for acks is out.
+ */
+ Context *lease_ack_timeout_event;
+ /**
+ * @}
+ */
+ /**
+ * @defgroup Paxos_h_peon_active Peon-specific Active-related vars
+ * @{
+ */
+ /**
+ * Callback to trigger new elections when the Peon's lease times out.
+ *
+ * If the Peon's lease is extended, this callback will be reset (i.e.,
+ * we cancel the event and schedule a new one, starting again from the
+ * beginning).
+ */
+ Context *lease_timeout_event;
+ /**
+ * @}
+ */
+
+ // updating (paxos phase 2)
+ /**
+ * @defgroup Paxos_h_leader_updating Leader-specific Updating-related vars
+ * @{
+ */
+ /**
+ * New Value being proposed to the Peons.
+ *
+ * This ceph::buffer::list holds the value the Leader is proposing to the Peons, and
+ * that will be committed if the Peons do accept the proposal.
+ */
+ ceph::buffer::list new_value;
+ /**
+ * Set of participants (Leader & Peons) that accepted the new proposed value.
+ *
+ * This set is used to keep track of those who have accepted the proposed
+ * value, so the leader may know when to issue a commit (when a majority of
+ * participants has accepted the proposal), and when to extend the lease
+ * (when all the quorum members have accepted the proposal).
+ */
+ std::set<int> accepted;
+ /**
+ * Callback to trigger a new election if the proposal is not accepted by the
+ * full quorum within a given timeframe.
+ *
+ * If the full quorum does not accept the proposal, then it means that the
+ * Leader may no longer be recognized as the leader, or that the quorum has
+ * changed, and the value may have not reached all the participants. Thus,
+ * the leader must call new elections, and go through a recovery phase in
+ * order to propagate the new value throughout the system.
+ *
+ * This does not mean that we won't commit. We will commit as soon as we
+ * have a majority of acceptances. But if we do not have full acceptance
+ * from the quorum, then we cannot extend the lease, as some participants
+ * may not have the latest committed value.
+ */
+ Context *accept_timeout_event;
+
+ /**
+ * List of callbacks waiting for it to be possible to write again.
+ *
+ * @remarks It is not possible to write if we are not the Leader, or we are
+ * not on the active state, or if the lease has expired.
+ */
+ std::list<Context*> waiting_for_writeable;
+
+ /**
+ * Pending proposal transaction
+ *
+ * This is the transaction that is under construction and pending
+ * proposal. We will add operations to it until we decide it is
+ * time to start a paxos round.
+ */
+ MonitorDBStore::TransactionRef pending_proposal;
+
+ /**
+ * Finishers for pending transaction
+ *
+ * These are waiting for updates in the pending proposal/transaction
+ * to be committed.
+ */
+ std::list<Context*> pending_finishers;
+
+ /**
+ * Finishers for committing transaction
+ *
+ * When the pending_proposal is submitted, pending_finishers move to
+ * this list. When it commits, these finishers are notified.
+ */
+ std::list<Context*> committing_finishers;
+ /**
+ * This function re-triggers pending_ and committing_finishers
+ * safely, so as to maintain existing system invariants. In particular
+ * we maintain ordering by triggering committing before pending, and
+ * we clear out pending_finishers prior to any triggers so that
+ * we don't trigger asserts on them being empty. You should
+ * use it instead of sending -EAGAIN to them with finish_contexts.
+ */
+ void reset_pending_committing_finishers();
+
+ /**
+ * @defgroup Paxos_h_sync_warns Synchronization warnings
+ * @todo Describe these variables
+ * @{
+ */
+ utime_t last_clock_drift_warn;
+ int clock_drift_warned;
+ /**
+ * @}
+ */
+
+ /**
+ * Should be true if we have proposed to trim, or are in the middle of
+ * trimming; false otherwise.
+ */
+ bool trimming;
+
+ /**
+ * true if we want trigger_propose to *not* propose (yet)
+ */
+ bool plugged = false;
+
+ /**
+ * @defgroup Paxos_h_callbacks Callback classes.
+ * @{
+ */
+ /**
+ * Callback class responsible for handling a Collect Timeout.
+ */
+ class C_CollectTimeout;
+ /**
+ * Callback class responsible for handling an Accept Timeout.
+ */
+ class C_AcceptTimeout;
+ /**
+ * Callback class responsible for handling a Lease Ack Timeout.
+ */
+ class C_LeaseAckTimeout;
+
+ /**
+ * Callback class responsible for handling a Lease Timeout.
+ */
+ class C_LeaseTimeout;
+
+ /**
+ * Callback class responsible for handling a Lease Renew Timeout.
+ */
+ class C_LeaseRenew;
+
+ class C_Trimmed;
+public:
+ /**
+ * Context wrapping a single proposal.
+ *
+ * It keeps its own copy of the encoded proposal value together with some
+ * bookkeeping (whether it was proposed, and when it was created), and
+ * forwards its completion code to the proposer's callback.
+ */
+ class C_Proposal : public Context {
+   // Callback supplied by the proposer (may be null). It is cleared after
+   // being completed so that it can never be completed a second time.
+   Context *proposer_context;
+ public:
+   ceph::buffer::list bl;
+   // for debug purposes. Will go away. Soon.
+   bool proposed;
+   utime_t proposal_time;
+
+   /**
+   * @param c Callback to complete when this proposal finishes (may be null)
+   * @param proposal_bl Encoded proposal value; copied into @c bl
+   */
+   C_Proposal(Context *c, ceph::buffer::list& proposal_bl)
+     : proposer_context(c),
+       bl(proposal_bl),
+       proposed(false),
+       proposal_time(ceph_clock_now())
+   { }
+
+   /// Hand the result code @p r to the proposer's callback, at most once.
+   void finish(int r) override {
+     if (!proposer_context)
+       return;
+     proposer_context->complete(r);
+     proposer_context = nullptr;
+   }
+ };
+ /**
+ * @}
+ */
+private:
+ /**
+ * @defgroup Paxos_h_election_triggered Steps triggered by an election.
+ *
+ * @note All these functions play a significant role in the Recovery Phase,
+ * which is triggered right after an election once someone becomes
+ * the Leader.
+ * @{
+ */
+ /**
+ * Create a new Proposal Number and propose it to the Peons.
+ *
+ * This function starts the Recovery Phase, which can be directly mapped
+ * onto the original Paxos' Prepare phase. Basically, we'll generate a
+ * Proposal Number, taking @p oldpn into consideration, and we will send
+ * it to a quorum, along with our first and last committed versions. By
+ * sending this information in a message to the quorum, we expect to
+ * obtain acceptances from a majority, allowing us to commit, or be
+ * informed of a higher Proposal Number known by one or more of the Peons
+ * in the quorum.
+ *
+ * @pre We are the Leader.
+ * @post Recovery Phase initiated by sending messages to the quorum.
+ *
+ * @param oldpn A proposal number taken as the highest known so far, that
+ * should be taken into consideration when generating a new
+ * Proposal Number for the Recovery Phase.
+ */
+ void collect(version_t oldpn);
+ /**
+ * Handle the reception of a collect message from the Leader and reply
+ * accordingly.
+ *
+ * Once a Peon receives a collect message from the Leader it will reply
+ * with its first and last committed versions, as well as information so
+ * the Leader may know if its Proposal Number was, or was not, accepted by
+ * the Peon. The Peon will accept the Leader's Proposal Number if it is
+ * higher than the Peon's currently accepted Proposal Number. The Peon may
+ * also inform the Leader of accepted but uncommitted values.
+ *
+ * @invariant The message is an operation of type OP_COLLECT.
+ * @pre We are a Peon.
+ * @post Replied to the Leader, accepting or not accepting its PN.
+ *
+ * @param op The request carrying the collect message sent by the Leader
+ * to the Peon.
+ */
+ void handle_collect(MonOpRequestRef op);
+ /**
+ * Handle a response from a Peon to the Leader's collect phase.
+ *
+ * The received message will state the Peon's last committed version, as
+ * well as its last proposal number. This will lead to one of the following
+ * scenarios: if the replied Proposal Number is equal to the one we proposed,
+ * then the Peon has accepted our proposal, and if all the Peons do accept
+ * our Proposal Number, then we are allowed to proceed with the commit;
+ * however, if a Peon replies with a higher Proposal Number, we assume it
+ * knows something we don't and the Leader will have to abort the current
+ * proposal in order to retry with the Proposal Number specified by the Peon.
+ * It may also occur that the Peon replied with a lower Proposal Number, in
+ * which case we assume it is a reply to an older value and we'll simply
+ * drop it.
+ * This function will also check if the Peon replied with an accepted but
+ * yet uncommitted value. In this case, if its version is higher than our
+ * last committed value by one, we assume that the Peon knows a value from a
+ * previous proposal that has never been committed, and we should try to
+ * commit that value by proposing it next. On the other hand, if that is
+ * not the case, we'll assume it is an old, uncommitted value, we do not
+ * care about and we'll consider the system active by extending the leases.
+ *
+ * @invariant The message is an operation of type OP_LAST.
+ * @pre We are the Leader.
+ * @post We initiate a commit, or we retry with a higher Proposal Number,
+ * or we drop the message.
+ * @post We move from STATE_RECOVERING to STATE_ACTIVE.
+ *
+ * @param op The request carrying the message sent by the Peon to the
+ * Leader.
+ */
+ void handle_last(MonOpRequestRef op);
+ /**
+ * The Recovery Phase timed out, meaning that a significant part of the
+ * quorum does not believe we are the Leader, and we thus should trigger new
+ * elections.
+ *
+ * @pre We believe to be the Leader.
+ * @post Trigger new elections.
+ */
+ void collect_timeout();
+ /**
+ * @}
+ */
+
+ /**
+ * @defgroup Paxos_h_updating_funcs Functions used during the Updating State
+ *
+ * These functions may easily be mapped to the original Paxos Algorithm's
+ * phases.
+ *
+ * Taking into account the algorithm can be divided in 4 phases (Prepare,
+ * Promise, Accept Request and Accepted), we can easily map Paxos::begin to
+ * both the Prepare and Accept Request phases; the Paxos::handle_begin to
+ * the Promise phase; and the Paxos::handle_accept to the Accepted phase.
+ * @{
+ */
+ /**
+ * Start a new proposal with the intent of committing @p value.
+ *
+ * If we are alone on the system (i.e., a quorum of one), then we will
+ * simply commit the value, but if we are not alone, then we need to propose
+ * the value to the quorum.
+ *
+ * @pre We are the Leader
+ * @pre We are on STATE_ACTIVE
+ * @post We commit, if we are alone, or we send a message to each quorum
+ * member
+ * @post We are on STATE_ACTIVE, if we are alone, or on
+ * STATE_UPDATING otherwise
+ *
+ * @param value The value being proposed to the quorum
+ */
+ void begin(ceph::buffer::list& value);
+ /**
+ * Accept or decline (by ignoring) a proposal from the Leader.
+ *
+ * We will decline the proposal (by ignoring it) if we have promised to
+ * accept a higher numbered proposal. If that is not the case, we will
+ * accept it and accordingly reply to the Leader.
+ *
+ * @pre We are a Peon
+ * @pre We are on STATE_ACTIVE
+ * @post We are on STATE_UPDATING if we accept the Leader's proposal
+ * @post We send a reply message to the Leader if we accept its proposal
+ *
+ * @invariant The received message is an operation of type OP_BEGIN
+ *
+ * @param op The request carrying the message sent by the Leader to the
+ * Peon during the Paxos::begin function
+ *
+ */
+ void handle_begin(MonOpRequestRef op);
+ /**
+ * Handle an Accept message sent by a Peon.
+ *
+ * In order to commit, the Leader has to receive accepts from a majority of
+ * the quorum. If that does happen, then the Leader may proceed with the
+ * commit. However, the Leader needs the accepts from all the quorum members
+ * in order to extend the lease and move on to STATE_ACTIVE.
+ *
+ * This function handles these two situations, accounting for the amount of
+ * received accepts.
+ *
+ * @pre We are the Leader
+ * @pre We are on STATE_UPDATING
+ * @post We are on STATE_ACTIVE if we received accepts from the full quorum
+ * @post We extended the lease if we moved on to STATE_ACTIVE
+ * @post We are on STATE_UPDATING if we didn't receive accepts from the
+ * full quorum
+ * @post We have committed if we received accepts from a majority
+ *
+ * @invariant The received message is an operation of type OP_ACCEPT
+ *
+ * @param op The request carrying the message sent by the Peons to the
+ * Leader during the Paxos::handle_begin function
+ */
+ void handle_accept(MonOpRequestRef op);
+ /**
+ * Trigger a fresh election.
+ *
+ * During Paxos::begin we set a Callback of type Paxos::C_AcceptTimeout in
+ * order to limit the amount of time we spend waiting for Accept replies.
+ * This callback will call Paxos::accept_timeout when it is fired.
+ *
+ * This is essential to the algorithm because there may be the chance that
+ * we are no longer the Leader (i.e., others don't believe in us) and we
+ * are getting ignored, or we dropped out of the quorum and haven't realised
+ * it. So, our only option is to trigger fresh elections.
+ *
+ * @pre We are the Leader
+ * @pre We are on STATE_UPDATING
+ * @post Triggered fresh elections
+ */
+ void accept_timeout();
+ /**
+ * @}
+ */
+
+
+ utime_t commit_start_stamp;
+ friend struct C_Committed;
+
+ /**
+ * Commit a value throughout the system.
+ *
+ * The Leader will cancel the current lease (as it was for the old value),
+ * and will store the committed value locally. It will then instruct every
+ * quorum member to do so as well.
+ *
+ * @pre We are the Leader
+ * @pre We are on STATE_UPDATING
+ * @pre A majority of quorum members accepted our proposal
+ * @post Value locally stored
+ * @post Quorum members instructed to commit the new value.
+ */
+ void commit_start();
+ void commit_finish(); ///< finish a commit after txn becomes durable
+ void abort_commit(); ///< Handle commit finish after shutdown started
+ /**
+ * Commit the new value to stable storage as being the latest available
+ * version.
+ *
+ * @pre We are a Peon
+ * @post The new value is locally stored
+ * @post Fire up the callbacks waiting on waiting_for_commit
+ *
+ * @invariant The received message is an operation of type OP_COMMIT
+ *
+ * @param op The request carrying the message sent by the Leader to the
+ * Peon during Paxos::commit
+ */
+ void handle_commit(MonOpRequestRef op);
+ /**
+ * Extend the system's lease.
+ *
+ * This means that the Leader considers that it should now be safe to read
+ * from any node on the system, since every quorum member is now in
+ * possession of the latest version. Therefore, the Leader will send a
+ * message stating just this to each quorum member, and will impose a
+ * limited timeframe during which acks will be accepted. If there aren't as
+ * many acks as expected (i.e., if at least one quorum member does not ack
+ * the lease) during this timeframe, then we will force fresh elections.
+ *
+ * @pre We are the Leader
+ * @pre We are on STATE_ACTIVE
+ * @post A message extending the lease is sent to each quorum member
+ * @post A timeout callback is set to limit the amount of time we will wait
+ * for lease acks.
+ * @post A timer is set in order to renew the lease after a certain amount
+ * of time.
+ */
+ void extend_lease();
+ /**
+ * Update the lease on the Peon's side of things.
+ *
+ * Once a Peon receives a Lease message, it will update its lease_expire
+ * variable, reply to the Leader acknowledging the lease update and set a
+ * timeout callback to be fired upon the lease's expiration. Finally, the
+ * Peon will fire up all the callbacks waiting for it to become active,
+ * which it just did, and all those waiting for it to become readable,
+ * which should be true if the Peon's lease didn't expire in the mean time.
+ *
+ * @pre We are a Peon
+ * @post We update the lease accordingly
+ * @post A lease timeout callback is set
+ * @post Move to STATE_ACTIVE
+ * @post Fire up all the callbacks waiting for STATE_ACTIVE
+ * @post Fire up all the callbacks waiting for readable if we are readable
+ * @post Ack the lease to the Leader
+ *
+ * @invariant The received message is an operation of type OP_LEASE
+ *
+ * @param op The request carrying the message sent by the Leader to the
+ * Peon during the Paxos::extend_lease function
+ */
+ void handle_lease(MonOpRequestRef op);
+ /**
+ * Account for all the Lease Acks the Leader receives from the Peons.
+ *
+ * Once the Leader receives all the Lease Acks from the Peons, it will be
+ * able to cancel the Lease Ack timeout callback, thus avoiding calling
+ * fresh elections.
+ *
+ * @pre We are the Leader
+ * @post Cancel the Lease Ack timeout callback if we receive acks from all
+ * the quorum members
+ *
+ * @invariant The received message is an operation of type OP_LEASE_ACK
+ *
+ * @param op The request carrying the message sent by a Peon to the Leader
+ * during the Paxos::handle_lease function
+ */
+ void handle_lease_ack(MonOpRequestRef op);
+ /**
+ * Call fresh elections because at least one Peon didn't ack our lease.
+ *
+ * @pre We are the Leader
+ * @pre We are on STATE_ACTIVE
+ * @post Trigger fresh elections
+ */
+ void lease_ack_timeout();
+ /**
+ * Extend lease since we haven't had new committed values meanwhile.
+ *
+ * @pre We are the Leader
+ * @pre We are on STATE_ACTIVE
+ * @post Go through with Paxos::extend_lease
+ */
+ void lease_renew_timeout();
+ /**
+ * Call fresh elections because the Peon's lease expired without being
+ * renewed or receiving a fresh lease.
+ *
+ * This means that the Peon is no longer assumed as being in the quorum
+ * (or there is no Leader to speak of), so just trigger fresh elections
+ * to circumvent this issue.
+ *
+ * @pre We are a Peon
+ * @post Trigger fresh elections
+ */
+ void lease_timeout(); // on peon, if lease isn't extended
+
+ /// restart the lease timeout timer
+ void reset_lease_timeout();
+
+ /**
+ * Cancel all of Paxos' timeout/renew events.
+ */
+ void cancel_events();
+ /**
+ * Shutdown this Paxos machine
+ */
+ void shutdown();
+
+ /**
+ * Generate a new Proposal Number based on @p gt
+ *
+ * @todo Check what @p gt actually means and what its usage entails
+ * @param gt A hint for the generation of the Proposal Number
+ * @return A globally unique, monotonically increasing Proposal Number
+ */
+ version_t get_new_proposal_number(version_t gt=0);
+
+ /**
+ * @todo document sync function
+ */
+ void warn_on_future_time(utime_t t, entity_name_t from);
+
+ /**
+ * Begin proposing the pending_proposal.
+ */
+ void propose_pending();
+
+ /**
+ * refresh state from store
+ *
+ * Called when we have new state for the mon to consume. If we return false,
+ * abort (we triggered a bootstrap).
+ *
+ * @returns true on success, false if we are now bootstrapping
+ */
+ bool do_refresh();
+
+ void commit_proposal();
+ void finish_round();
+
+public:
+ /**
+ * Construct a Paxos instance.
+ *
+ * The instance starts in STATE_RECOVERING with all version and proposal
+ * number bookkeeping zeroed, no perf-counter logger, no timer events
+ * scheduled and no trim in progress.
+ *
+ * @param m A monitor
+ * @param name A name for the paxos service. It serves as the naming space
+ * of the underlying persistent storage for this service.
+ */
+ Paxos(Monitor &m, const std::string &name)
+   : mon(m),
+     logger(nullptr),
+     paxos_name(name),
+     state(STATE_RECOVERING),
+     first_committed(0),
+     last_pn(0),
+     last_committed(0),
+     accepted_pn(0),
+     accepted_pn_from(0),
+     num_last(0),
+     uncommitted_v(0), uncommitted_pn(0),
+     // Timer events are pointers; use nullptr rather than a literal 0 so the
+     // intent (no event scheduled) is explicit and type-checked.
+     // NOTE(review): collect_timeout_event, lease_renew_event and
+     // lease_ack_timeout_event are declared outside this excerpt and are
+     // assumed to be Context* like their siblings -- confirm.
+     collect_timeout_event(nullptr),
+     lease_renew_event(nullptr),
+     lease_ack_timeout_event(nullptr),
+     lease_timeout_event(nullptr),
+     accept_timeout_event(nullptr),
+     clock_drift_warned(0),
+     trimming(false) { }
+
+ /// Releases the logger, which this object deletes on destruction.
+ ~Paxos() {
+   delete logger;
+ }
+
+ /**
+ * Get the name of this paxos instance.
+ *
+ * The name is returned by value. The return type is deliberately not
+ * const-qualified: a const prvalue cannot be moved from, so the qualifier
+ * only forced an extra copy at call sites that store the result.
+ *
+ * @return a copy of the paxos name
+ */
+ std::string get_name() const {
+   return paxos_name;
+ }
+
+ void dispatch(MonOpRequestRef op);
+
+ void read_and_prepare_transactions(MonitorDBStore::TransactionRef tx,
+ version_t from, version_t last);
+
+ void init();
+
+ /**
+ * dump state info to a formatter
+ */
+ void dump_info(ceph::Formatter *f);
+
+ /**
+ * This function runs basic consistency checks. Importantly, if
+ * it is inconsistent and shouldn't be, it asserts out.
+ *
+ * @return True if consistent, false if not.
+ */
+ bool is_consistent();
+
+ void restart();
+ /**
+ * Initiate the Leader after it wins an election.
+ *
+ * Once an election is won, the Leader will be initiated and there are two
+ * possible outcomes of this method: the Leader directly jumps to the active
+ * state (STATE_ACTIVE) if it believes to be the only one in the quorum, or
+ * will start recovering (STATE_RECOVERING) by initiating the collect phase.
+ *
+ * @pre Our monitor is the Leader.
+ * @post We are either on STATE_ACTIVE if we're the only one in the quorum,
+ * or on STATE_RECOVERING otherwise.
+ */
+ void leader_init();
+ /**
+ * Initiate a Peon after it loses an election.
+ *
+ * If we are a Peon, then there must be a Leader and we are not alone in the
+ * quorum, thus automatically assume we are on STATE_RECOVERING, which means
+ * we will soon be enrolled into the Leader's collect phase.
+ *
+ * @pre There is a Leader, and it's about to start the collect phase.
+ * @post We are on STATE_RECOVERING and will soon receive collect phase's
+ * messages.
+ */
+ void peon_init();
+
+ /**
+ * Include an incremental state of values, ranging from peer_first_committed
+ * to the last committed value, on the message m
+ *
+ * @param m A message
+ * @param peer_first_committed Lowest version to take into account
+ * @param peer_last_committed Highest version to take into account
+ */
+ void share_state(MMonPaxos *m, version_t peer_first_committed,
+ version_t peer_last_committed);
+ /**
+ * Store on disk a state that was shared with us
+ *
+ * Basically, we received a set of versions. Or just one. It doesn't matter.
+ * What matters is that we have to stash it in the store. So, we will simply
+ * write every single ceph::buffer::list into their own versions on our side (i.e.,
+ * onto paxos-related keys), and then we will decode those same ceph::buffer::lists
+ * we just wrote and apply the transactions they hold. We will also update
+ * our first and last committed values to point to the new values, if need
+ * be. All this is done tightly wrapped in a transaction to ensure we
+ * enjoy the atomicity guarantees given by our awesome k/v store.
+ *
+ * @param m A message
+ * @returns true if we stored something new; false otherwise
+ */
+ bool store_state(MMonPaxos *m);
+ void _sanity_check_store();
+
+ /**
+ * Decode an encoded transaction out of a ceph::buffer::list and append its
+ * operations to an existing transaction.
+ *
+ * This helper is used during the Leader's commit and during
+ * Paxos::store_state in order to apply the ceph::buffer::list's transaction
+ * onto the store.
+ *
+ * @param t The transaction to which we will append the operations
+ * @param bl A ceph::buffer::list containing an encoded transaction
+ */
+ static void decode_append_transaction(MonitorDBStore::TransactionRef t,
+                                       ceph::buffer::list& bl) {
+   // Decode into a scratch transaction first, then splice it onto t.
+   auto decoded = std::make_shared<MonitorDBStore::Transaction>();
+   auto cursor = bl.cbegin();
+   decoded->decode(cursor);
+   t->append(decoded);
+ }
+
+ /**
+ * @todo This appears to be used only by the OSDMonitor, and I would say
+ * its objective is to allow a third-party to have a "private"
+ * state dir. -JL
+ */
+ void add_extra_state_dir(std::string s) {
+ extra_state_dirs.push_back(s);
+ }
+
+ // -- service interface --
+ /**
+ * Add c to the list of callbacks waiting for us to become active.
+ *
+ * @param op Request to tag with a "paxos:wait_for_active" event; may be
+ * null, in which case nothing is tagged
+ * @param c A callback
+ */
+ void wait_for_active(MonOpRequestRef op, Context *c) {
+   if (op) {
+     op->mark_event("paxos:wait_for_active");
+   }
+   waiting_for_active.push_back(c);
+ }
+ /// Overload for callers that have no request to tag.
+ void wait_for_active(Context *c) {
+   wait_for_active(MonOpRequestRef(), c);
+ }
+
+ /**
+ * Trim the Paxos state as much as we can.
+ */
+ void trim();
+
+ /**
+ * Check if we should trim.
+ *
+ * We never ask for a trim while one has already been proposed or is in
+ * progress. Otherwise we trim only once the number of versions between the
+ * first and the last committed one exceeds paxos_min + paxos_trim_min.
+ *
+ * @returns true if we should trim; false otherwise.
+ */
+ bool should_trim() {
+   const int held_versions = get_version() - get_first_committed();
+   const int trim_threshold = g_conf()->paxos_min + g_conf()->paxos_trim_min;
+
+   return !trimming && held_versions > trim_threshold;
+ }
+
+ bool is_plugged() const {
+ return plugged;
+ }
+ void plug() {
+ ceph_assert(plugged == false);
+ plugged = true;
+ }
+ void unplug() {
+ ceph_assert(plugged == true);
+ plugged = false;
+ }
+
+ // read
+ /**
+ * @defgroup Paxos_h_read_funcs Read-related functions
+ * @{
+ */
+ /**
+ * Get latest committed version
+ *
+ * Read-only accessor; const-qualified so it can be called through a const
+ * Paxos reference, like is_plugged() and get_name().
+ *
+ * @return latest committed version
+ */
+ version_t get_version() const { return last_committed; }
+ /**
+ * Get first committed version
+ *
+ * Read-only accessor; const-qualified so it can be called through a const
+ * Paxos reference, like is_plugged() and get_name().
+ *
+ * @return the first committed version
+ */
+ version_t get_first_committed() const { return first_committed; }
+ /**
+ * Check if a given version is readable.
+ *
+ * A version may not be readable for a myriad of reasons:
+ * @li the version @e seen is higher than the last committed version
+ * @li we are not the Leader nor a Peon (election may be on-going)
+ * @li we do not have a committed value yet
+ * @li we do not have a valid lease
+ *
+ * @param seen The version we want to check if it is readable.
+ * @return 'true' if the version is readable; 'false' otherwise.
+ */
+ bool is_readable(version_t seen=0);
+ /**
+ * Read version @e v and store its value in @e bl
+ *
+ * @param[in] v The version we want to read
+ * @param[out] bl The version's value
+ * @return 'true' if we successfully read the value; 'false' otherwise
+ */
+ bool read(version_t v, ceph::buffer::list &bl);
+ /**
+ * Read the latest committed version
+ *
+ * @param[out] bl The version's value
+ * @return the latest committed version if we successfully read the value;
+ * or 0 (zero) otherwise.
+ */
+ version_t read_current(ceph::buffer::list &bl);
+ /**
+ * Add onreadable to the list of callbacks waiting for us to become readable.
+ *
+ * Must only be called while we are not readable; this is asserted.
+ *
+ * @param op Request to tag with a "paxos:wait_for_readable" event; may be
+ * null, in which case nothing is tagged
+ * @param onreadable A callback
+ */
+ void wait_for_readable(MonOpRequestRef op, Context *onreadable) {
+   ceph_assert(!is_readable());
+   if (op) {
+     op->mark_event("paxos:wait_for_readable");
+   }
+   waiting_for_readable.push_back(onreadable);
+ }
+ /// Overload for callers that have no request to tag.
+ void wait_for_readable(Context *onreadable) {
+   wait_for_readable(MonOpRequestRef(), onreadable);
+ }
+ /**
+ * @}
+ */
+
+ /**
+ * Check if we have a valid lease.
+ *
+ * @returns true if the lease is still valid; false otherwise.
+ */
+ bool is_lease_valid();
+ // write
+ /**
+ * @defgroup Paxos_h_write_funcs Write-related functions
+ * @{
+ */
+ /**
+ * Check if we are writeable.
+ *
+ * We are writeable if we are alone (i.e., a quorum of one), or if we match
+ * all the following conditions:
+ * @li We are the Leader
+ * @li We are on STATE_ACTIVE
+ * @li We have a valid lease
+ *
+ * @return 'true' if we are writeable; 'false' otherwise.
+ */
+ bool is_writeable();
+ /**
+ * Add c to the list of callbacks waiting for us to become writeable.
+ *
+ * Must only be called while we are not writeable; this is asserted.
+ *
+ * @param op Request to tag with a "paxos:wait_for_writeable" event; may be
+ * null, in which case nothing is tagged
+ * @param c A callback
+ */
+ void wait_for_writeable(MonOpRequestRef op, Context *c) {
+   ceph_assert(!is_writeable());
+   if (op) {
+     op->mark_event("paxos:wait_for_writeable");
+   }
+   waiting_for_writeable.push_back(c);
+ }
+ /// Overload for callers that have no request to tag.
+ void wait_for_writeable(Context *c) {
+   wait_for_writeable(MonOpRequestRef(), c);
+ }
+
+ /**
+ * Get a transaction to submit operations to propose against
+ *
+ * Apply operations to this transaction. It will eventually be proposed
+ * to paxos.
+ */
+ MonitorDBStore::TransactionRef get_pending_transaction();
+
+ /**
+ * Queue a completion for the pending proposal
+ *
+ * This completion will get triggered when the pending proposal
+ * transaction commits.
+ */
+ void queue_pending_finisher(Context *onfinished);
+
+ /**
+ * (try to) trigger a proposal
+ *
+ * Tell paxos that it should submit the pending proposal. Note that if it
+ * is not active (e.g., because it is already in the midst of committing
+ * something) that will be deferred (e.g., until the current round finishes).
+ */
+ bool trigger_propose();
+ /**
+ * @}
+ */
+
+ /**
+ * @}
+ */
+ protected:
+ MonitorDBStore *get_store();
+};
+
+/**
+ * Print a human-readable description of a proposal: whether it has been
+ * proposed, how long ago it was created, and a JSON dump of the transaction
+ * decoded from its payload.
+ *
+ * @param out Stream to write to
+ * @param p Proposal to describe
+ * @return @p out, for chaining
+ */
+inline std::ostream& operator<<(std::ostream& out, Paxos::C_Proposal& p)
+{
+  // Header: status and age of the proposal.
+  out << " " << (p.proposed ? "proposed" : "unproposed")
+      << " queued " << (ceph_clock_now() - p.proposal_time)
+      << " tx dump:\n";
+
+  // Decode the payload into a scratch transaction so it can be dumped.
+  auto tx = std::make_shared<MonitorDBStore::Transaction>();
+  auto cursor = p.bl.cbegin();
+  tx->decode(cursor);
+
+  ceph::JSONFormatter formatter(true);
+  tx->dump(&formatter);
+  formatter.flush(out);
+  return out;
+}
+
+#endif
diff --git a/src/mon/PaxosFSMap.h b/src/mon/PaxosFSMap.h
new file mode 100644
index 000000000..4312d7e1f
--- /dev/null
+++ b/src/mon/PaxosFSMap.h
@@ -0,0 +1,101 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_PAXOS_FSMAP_H
+#define CEPH_PAXOS_FSMAP_H
+
+#include <chrono>
+
+#include "mds/FSMap.h"
+#include "mds/MDSMap.h"
+
+#include "include/ceph_assert.h"
+
+/**
+ * Holder for the FSMap state of a paxos-backed service: the current map,
+ * the pending (next epoch) map, and a history of maps keyed by epoch that
+ * is bounded by a time threshold.
+ */
+class PaxosFSMap {
+public:
+  virtual ~PaxosFSMap() {}
+
+  /// Read-only view of the pending map; asserts that we are the leader.
+  const FSMap &get_pending_fsmap() const {
+    ceph_assert(is_leader());
+    return pending_fsmap;
+  }
+  /// Read-only view of the current map.
+  const FSMap &get_fsmap() const {
+    return fsmap;
+  }
+
+  virtual bool is_leader() const = 0;
+
+protected:
+  /// Mutable view of the pending map; asserts that we are the leader.
+  FSMap &get_pending_fsmap_writeable() {
+    ceph_assert(is_leader());
+    return pending_fsmap;
+  }
+
+  /**
+   * Start a new pending map as a copy of the current one with its epoch
+   * incremented. Asserts that we are the leader.
+   *
+   * @return the freshly created pending map
+   */
+  FSMap &create_pending() {
+    ceph_assert(is_leader());
+    pending_fsmap = fsmap;
+    pending_fsmap.inc_epoch();
+    return pending_fsmap;
+  }
+
+  /**
+   * Drop expired entries from the low-epoch end of the history.
+   *
+   * An entry is erased only while both it and its successor are older than
+   * the prune threshold. That keeps the newest map that is already past the
+   * threshold:
+   *   [ e-1 (lifetime > history_prune_time) | e (lifetime 1s) ]
+   * If an mds was removed in (e), then we want to be able to say it was
+   * last seen 1 second ago.
+   */
+  void prune_fsmap_history() {
+    const auto now = real_clock::now();
+    auto oldest = history.begin();
+    while (oldest != history.end()) {
+      const auto successor = std::next(oldest);
+      if (successor == history.end()) {
+        /* Be sure to not make the map empty */
+        break;
+      }
+      const auto oldest_age = now - oldest->second.get_btime();
+      const auto successor_age = now - successor->second.get_btime();
+      const bool both_expired = oldest_age > history_prune_time &&
+                                successor_age > history_prune_time;
+      if (!both_expired) {
+        break;
+      }
+      oldest = history.erase(oldest);
+    }
+  }
+
+  /**
+   * Record a copy of @p _fsmap in the history under its epoch, provided its
+   * age is still below the prune threshold. An entry already stored for
+   * that epoch is left untouched.
+   */
+  void put_fsmap_history(const FSMap& _fsmap) {
+    const auto age = real_clock::now() - _fsmap.get_btime();
+    if (!(age < history_prune_time)) {
+      return;
+    }
+    history.emplace(std::piecewise_construct,
+                    std::forward_as_tuple(_fsmap.get_epoch()),
+                    std::forward_as_tuple(_fsmap));
+  }
+
+  /// Set the age threshold used when recording and pruning history.
+  void set_fsmap_history_threshold(std::chrono::seconds t) {
+    history_prune_time = t;
+  }
+  /// Get the age threshold used when recording and pruning history.
+  std::chrono::seconds get_fsmap_history_threshold() const {
+    return history_prune_time;
+  }
+
+  /// Read-only access to the recorded history, keyed by epoch.
+  const auto& get_fsmap_history() const {
+    return history;
+  }
+
+  /**
+   * Decode the current map from @p bl, record it in the history, and reset
+   * the pending map.
+   */
+  void decode(ceph::buffer::list &bl) {
+    fsmap.decode(bl);
+    put_fsmap_history(fsmap);
+    pending_fsmap = FSMap(); /* nuke it to catch invalid access */
+  }
+
+private:
+  /* Keep these PRIVATE to prevent unprotected manipulation. */
+  std::map<epoch_t, FSMap> history;
+  std::chrono::seconds history_prune_time = std::chrono::seconds(0);
+  FSMap fsmap; /* the current epoch */
+  FSMap pending_fsmap; /* the next epoch */
+};
+
+
+#endif
diff --git a/src/mon/PaxosService.cc b/src/mon/PaxosService.cc
new file mode 100644
index 000000000..ce2c5977f
--- /dev/null
+++ b/src/mon/PaxosService.cc
@@ -0,0 +1,476 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "PaxosService.h"
+#include "common/Clock.h"
+#include "common/config.h"
+#include "include/stringify.h"
+#include "include/ceph_assert.h"
+#include "mon/MonOpRequest.h"
+
+using std::ostream;
+using std::string;
+
+using ceph::bufferlist;
+
+#define dout_subsys ceph_subsys_paxos
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, mon, paxos, service_name, get_first_committed(), get_last_committed())
+/**
+ * Write the log prefix used by dout_prefix in this file:
+ * "mon.<name>@<rank>(<state>).paxosservice(<service> <fc>..<lc>) ".
+ *
+ * service_name is taken by const reference so that emitting a log line
+ * does not copy the string. The paxos argument is not used by the body.
+ *
+ * @param _dout stream to write to
+ * @param mon monitor supplying name, rank and state name
+ * @param paxos unused
+ * @param service_name name of the service doing the logging
+ * @param fc first committed version
+ * @param lc last committed version
+ * @returns *_dout, to allow chaining
+ */
+static ostream& _prefix(std::ostream *_dout, Monitor &mon, Paxos &paxos,
+                        const string& service_name,
+                        version_t fc, version_t lc) {
+  return *_dout << "mon." << mon.name << "@" << mon.rank
+                << "(" << mon.get_state_name()
+                << ").paxosservice(" << service_name << " " << fc << ".." << lc << ") ";
+}
+
+bool PaxosService::dispatch(MonOpRequestRef op)
+{ // Run one request through the drop checks, preprocess_query() and, on a writeable leader, prepare_update() plus proposal; every path returns true.
+ ceph_assert(op->is_type_service() || op->is_type_command());
+ auto m = op->get_req<PaxosServiceMessage>();
+ op->mark_event("psvc:dispatch");
+
+ dout(10) << __func__ << " " << m << " " << *m
+ << " from " << m->get_orig_source_inst()
+ << " con " << m->get_connection() << dendl;
+
+ if (mon.is_shutdown()) {
+ return true; // monitor is shutting down: drop the request
+ }
+
+ // make sure this message isn't forwarded from a previous election epoch (a zero rx_election_epoch skips the check)
+ if (m->rx_election_epoch &&
+ m->rx_election_epoch < mon.get_epoch()) {
+ dout(10) << " discarding forwarded message from previous election epoch "
+ << m->rx_election_epoch << " < " << mon.get_epoch() << dendl;
+ return true;
+ }
+
+ // make sure the client is still connected. note that a proxied
+ // connection will be disconnected with a null message; don't drop
+ // those. also ignore loopback (e.g., log) messages.
+ if (m->get_connection() &&
+ !m->get_connection()->is_connected() &&
+ m->get_connection() != mon.con_self &&
+ m->get_connection()->get_messenger() != NULL) {
+ dout(10) << " discarding message from disconnected client "
+ << m->get_source_inst() << " " << *m << dendl;
+ return true;
+ }
+
+ // make sure our map is readable and up to date; otherwise queue a retry of the whole dispatch
+ if (!is_readable(m->version)) {
+ dout(10) << " waiting for paxos -> readable (v" << m->version << ")" << dendl;
+ wait_for_readable(op, new C_RetryMessage(this, op), m->version); // C_RetryMessage calls dispatch(op) again on r >= 0 or -EAGAIN
+ return true;
+ }
+
+ // preprocess: the service may handle the request without touching the pending state
+ if (preprocess_query(op))
+ return true; // easy!
+
+ // leader? anything that got this far on a non-leader is handed to forward_request_leader()
+ if (!mon.is_leader()) {
+ mon.forward_request_leader(op);
+ return true;
+ }
+
+ // writeable? if not, queue a retry of the whole dispatch
+ if (!is_writeable()) {
+ dout(10) << " waiting for paxos -> writeable" << dendl;
+ wait_for_writeable(op, new C_RetryMessage(this, op));
+ return true;
+ }
+
+ // update: let the service apply the request; a false return means nothing to propose
+ if (!prepare_update(op)) {
+ // no changes made.
+ return true;
+ }
+
+ if (need_immediate_propose) { // set by force_immediate_propose(); reset inside propose_pending()
+ dout(10) << __func__ << " forced immediate propose" << dendl;
+ propose_pending();
+ return true;
+ }
+
+ double delay = 0.0;
+ if (!should_propose(delay)) { // should_propose() also fills in delay
+ dout(10) << " not proposing" << dendl;
+ return true;
+ }
+
+ if (delay == 0.0) {
+ propose_pending();
+ return true;
+ }
+
+ // delay a bit: arm proposal_timer unless one is already set
+ if (!proposal_timer) {
+ /**
+ * Callback used to propose the pending value once the proposal_timer
+ * fires. It clears proposal_timer first; -ECANCELED and -EAGAIN are ignored and any other negative r aborts.
+ */
+ auto do_propose = new C_MonContext{&mon, [this](int r) {
+ proposal_timer = 0;
+ if (r >= 0) {
+ propose_pending();
+ } else if (r == -ECANCELED || r == -EAGAIN) {
+ return;
+ } else {
+ ceph_abort_msg("bad return value for proposal_timer");
+ }
+ }};
+ dout(10) << " setting proposal_timer " << do_propose
+ << " with delay of " << delay << dendl;
+ proposal_timer = mon.timer.add_event_after(delay, do_propose);
+ } else {
+ dout(10) << " proposal_timer already set" << dendl;
+ }
+ return true;
+}
+
+/**
+ * Re-read the committed version range and format_version from the store,
+ * then let the implementation refresh itself via update_from_paxos().
+ *
+ * Contexts in waiting_for_commit are completed with 0 when the stored
+ * last_committed is ahead of the cached one; this happens before the
+ * cached values are overwritten.
+ */
+void PaxosService::refresh(bool *need_bootstrap)
+{
+  dout(10) << __func__ << dendl;
+
+  // update cached versions
+  const auto stored_first = mon.store->get(get_service_name(), first_committed_name);
+  const auto stored_last = mon.store->get(get_service_name(), last_committed_name);
+  const bool committed_advanced = stored_last > cached_last_committed;
+  if (committed_advanced) {
+    finish_contexts(g_ceph_context, waiting_for_commit, 0);
+  }
+  cached_first_committed = stored_first;
+  cached_last_committed = stored_last;
+
+  // on_upgrade() runs while format_version still holds the old value
+  const version_t stored_format = get_value("format_version");
+  if (stored_format != format_version) {
+    dout(1) << __func__ << " upgraded, format " << format_version << " -> " << stored_format << dendl;
+    on_upgrade();
+  }
+  format_version = stored_format;
+
+  update_from_paxos(need_bootstrap);
+}
+
+/**
+ * Run the post_paxos_update() hook; on a peon, additionally complete every
+ * queued proposal/commit waiter with -EAGAIN.
+ */
+void PaxosService::post_refresh()
+{
+  dout(10) << __func__ << dendl;
+
+  post_paxos_update();
+
+  if (!mon.is_peon()) {
+    return;
+  }
+  finish_contexts(g_ceph_context, waiting_for_finished_proposal, -EAGAIN);
+  finish_contexts(g_ceph_context, waiting_for_commit, -EAGAIN);
+}
+
+/**
+ * Default proposal policy: quick startup, then some damping.
+ *
+ * While last_committed <= 1 the delay is zero. Afterwards the delay is
+ * paxos_min_wait if more than paxos_propose_interval has passed since the
+ * last paxos commit, else the remainder of that interval.
+ *
+ * @param[out] delay how long to wait before proposing
+ * @returns always true in this default implementation
+ */
+bool PaxosService::should_propose(double& delay)
+{
+  if (get_last_committed() <= 1) {
+    delay = 0.0;
+    return true;
+  }
+
+  utime_t now = ceph_clock_now();
+  const bool interval_elapsed =
+    (now - paxos.last_commit_time) > g_conf()->paxos_propose_interval;
+  delay = interval_elapsed
+    ? (double)g_conf()->paxos_min_wait
+    : (double)(g_conf()->paxos_propose_interval + paxos.last_commit_time
+               - now);
+  return true;
+}
+
+
+/**
+ * Encode the pending state into the shared Paxos transaction and trigger
+ * a proposal.
+ *
+ * Asserts: have_pending, not already proposing, monitor is leader, service
+ * is active. Cancels proposal_timer if one is armed.
+ */
+void PaxosService::propose_pending()
+{
+  /**
+   * Callback class used to mark us as active once a proposal finishes going
+   * through Paxos.
+   *
+   * We should wake people up *only* *after* we inform the service we
+   * just went active. And we should wake people up only once we finish
+   * going active. This is why we first go active, avoiding to wake up the
+   * wrong people at the wrong time, such as waking up a C_RetryMessage
+   * before waking up a C_Active, thus ending up without a pending value.
+   */
+  class C_Committed : public Context {
+    PaxosService *service;
+  public:
+    explicit C_Committed(PaxosService *p) : service(p) { }
+    void finish(int r) override {
+      service->proposing = false;
+      if (r >= 0) {
+        service->_active();
+        return;
+      }
+      if (r == -ECANCELED || r == -EAGAIN) {
+        return;
+      }
+      ceph_abort_msg("bad return value for C_Committed");
+    }
+  };
+
+  dout(10) << __func__ << dendl;
+  ceph_assert(have_pending);
+  ceph_assert(!proposing);
+  ceph_assert(mon.is_leader());
+  ceph_assert(is_active());
+
+  if (proposal_timer) {
+    dout(10) << " canceling proposal_timer " << proposal_timer << dendl;
+    mon.timer.cancel_event(proposal_timer);
+    proposal_timer = nullptr;
+  }
+
+  /**
+   * @note Our contribution to the pending Paxos transaction comes from
+   *       encode_pending(), which the implementing class must provide.
+   */
+  MonitorDBStore::TransactionRef txn = paxos.get_pending_transaction();
+
+  if (should_stash_full()) {
+    encode_full(txn);
+  }
+
+  encode_pending(txn);
+  have_pending = false;
+
+  if (format_version > 0) {
+    txn->put(get_service_name(), "format_version", format_version);
+  }
+
+  // apply to paxos
+  proposing = true;
+  /* reset whenever we propose */
+  need_immediate_propose = false;
+
+  paxos.queue_pending_finisher(new C_Committed(this));
+  paxos.trigger_propose();
+}
+
+/**
+ * Decide whether propose_pending() should also encode a full version.
+ *
+ * True when there is no latest full version, when it is at or below
+ * get_trim_to(), or when last_committed is more than
+ * paxos_stash_full_interval versions ahead of it.
+ */
+bool PaxosService::should_stash_full()
+{
+  const version_t latest_full = get_version_latest_full();
+
+  /* @note This first check is moot and is here just for clarity's sake.
+   *       The second check would end up returning true nonetheless
+   *       because, in that event, latest_full == get_trim_to() == 0.
+   */
+  if (!latest_full) {
+    return true;
+  }
+  if (latest_full <= get_trim_to()) {
+    return true;
+  }
+  return get_last_committed() - latest_full > (version_t)g_conf()->paxos_stash_full_interval;
+}
+
+/**
+ * Reset the service: cancel the proposal timer, complete all waiters with
+ * -EAGAIN, discard any pending state, clear the proposing flag and call
+ * on_restart().
+ */
+void PaxosService::restart()
+{
+  dout(10) << __func__ << dendl;
+
+  if (proposal_timer != nullptr) {
+    dout(10) << " canceling proposal_timer " << proposal_timer << dendl;
+    mon.timer.cancel_event(proposal_timer);
+    proposal_timer = nullptr;
+  }
+
+  finish_contexts(g_ceph_context, waiting_for_finished_proposal, -EAGAIN);
+  finish_contexts(g_ceph_context, waiting_for_commit, -EAGAIN);
+
+  if (have_pending) {
+    discard_pending();
+    have_pending = false;
+  }
+  proposing = false;
+
+  on_restart();
+}
+
+/**
+ * Called once an election is over: complete every waiter with -EAGAIN and
+ * then run _active() to refresh our state.
+ */
+void PaxosService::election_finished()
+{
+  dout(10) << __func__ << dendl;
+
+  // proposal waiters first, commit waiters second
+  finish_contexts(g_ceph_context, waiting_for_finished_proposal, -EAGAIN);
+  finish_contexts(g_ceph_context, waiting_for_commit, -EAGAIN);
+
+  // make sure we update our state
+  _active();
+}
+
+/**
+ * Bring the service to the active state.
+ *
+ * Returns early while a proposal is in flight; if the service is not
+ * active it queues itself to be called again. On the leader it creates the
+ * pending state (and proposes the initial state when nothing has been
+ * committed yet). Finally it wakes proposal waiters and calls on_active().
+ */
+void PaxosService::_active()
+{
+  /**
+   * Callback used to make sure we call the PaxosService::_active function
+   * whenever a condition is fulfilled.
+   *
+   * This is used in multiple situations, from waiting for the Paxos to commit
+   * our proposed value, to waiting for the Paxos to become active once an
+   * election is finished.
+   */
+  class C_Active : public Context {
+    PaxosService *service;
+  public:
+    explicit C_Active(PaxosService *s) : service(s) {}
+    void finish(int r) override {
+      if (r < 0) {
+        return;
+      }
+      service->_active();
+    }
+  };
+
+  if (is_proposing()) {
+    dout(10) << __func__ << " - proposing" << dendl;
+    return;
+  }
+  if (!is_active()) {
+    dout(10) << __func__ << " - not active" << dendl;
+    wait_for_active_ctx(new C_Active(this));
+    return;
+  }
+  dout(10) << __func__ << dendl;
+
+  // create pending state?
+  if (!mon.is_leader()) {
+    dout(7) << __func__ << " we are not the leader, hence we propose nothing!" << dendl;
+  } else {
+    dout(7) << __func__ << " creating new pending" << dendl;
+    if (!have_pending) {
+      create_pending();
+      have_pending = true;
+    }
+
+    if (get_last_committed() == 0) {
+      // create initial state
+      create_initial();
+      propose_pending();
+      return;
+    }
+  }
+
+  // wake up anyone who came in while we were proposing. note that
+  // anyone waiting for the previous proposal to commit is no longer
+  // on this list; it is on Paxos's.
+  finish_contexts(g_ceph_context, waiting_for_finished_proposal, 0);
+
+  if (mon.is_leader()) {
+    upgrade_format();
+  }
+
+  // NOTE: it's possible that this will get called twice if we commit
+  // an old paxos value. Implementations should be mindful of that.
+  on_active();
+}
+
+
+/**
+ * Shut the service down: cancel Paxos events and the proposal timer,
+ * complete all waiters with -EAGAIN, then call on_shutdown().
+ */
+void PaxosService::shutdown()
+{
+  cancel_events();
+
+  if (proposal_timer != nullptr) {
+    dout(10) << " canceling proposal_timer " << proposal_timer << dendl;
+    mon.timer.cancel_event(proposal_timer);
+    proposal_timer = nullptr;
+  }
+
+  // commit waiters first, proposal waiters second
+  finish_contexts(g_ceph_context, waiting_for_commit, -EAGAIN);
+  finish_contexts(g_ceph_context, waiting_for_finished_proposal, -EAGAIN);
+
+  on_shutdown();
+}
+
+/**
+ * Trim old service versions if the service is writeable and there is
+ * enough to trim.
+ *
+ * The target comes from get_trim_to(). The number of versions removed is
+ * bounded below by paxos_service_trim_min (skip if fewer) and above by
+ * paxos_service_trim_max, softened by paxos_service_trim_max_multiplier
+ * when the backlog is large. The trim is added to the pending Paxos
+ * transaction and a proposal is triggered.
+ */
+void PaxosService::maybe_trim()
+{
+  if (!is_writeable())
+    return;
+
+  const version_t first_committed = get_first_committed();
+  version_t trim_to = get_trim_to();
+  dout(20) << __func__ << " " << first_committed << "~" << trim_to << dendl;
+
+  if (trim_to < first_committed) {
+    dout(10) << __func__ << " trim_to " << trim_to << " < first_committed "
+             << first_committed << dendl;
+    return;
+  }
+
+  version_t to_remove = trim_to - first_committed;
+  if (to_remove == 0) {
+    // Nothing to trim. Without this guard, paxos_service_trim_min == 0
+    // would let us reach trim() with from == to, which it asserts against.
+    dout(20) << __func__ << " trim_to " << trim_to
+             << " == first_committed, nothing to trim" << dendl;
+    return;
+  }
+  const version_t trim_min = g_conf().get_val<version_t>("paxos_service_trim_min");
+  if (trim_min > 0 &&
+      to_remove < trim_min) {
+    dout(10) << __func__ << " trim_to " << trim_to << " would only trim " << to_remove
+             << " < paxos_service_trim_min " << trim_min << dendl;
+    return;
+  }
+
+  // Apply the upper bound: no cap when trim_max is 0 or not reached; a hard
+  // cap of trim_max for backlogs below 1.5x trim_max; otherwise the midpoint
+  // of trim_max and the backlog, optionally limited to
+  // trim_max * paxos_service_trim_max_multiplier.
+  to_remove = [to_remove, trim_to, this] {
+    const version_t trim_max = g_conf().get_val<version_t>("paxos_service_trim_max");
+    if (trim_max == 0 || to_remove < trim_max) {
+      return to_remove;
+    }
+    if (to_remove < trim_max * 1.5) {
+      dout(10) << __func__ << " trim to " << trim_to << " would only trim " << to_remove
+               << " > paxos_service_trim_max, limiting to " << trim_max
+               << dendl;
+      return trim_max;
+    }
+    const version_t new_trim_max = (trim_max + to_remove) / 2;
+    const uint64_t trim_max_multiplier = g_conf().get_val<uint64_t>("paxos_service_trim_max_multiplier");
+    if (trim_max_multiplier) {
+      return std::min(new_trim_max, trim_max * trim_max_multiplier);
+    } else {
+      return new_trim_max;
+    }
+  }();
+  trim_to = first_committed + to_remove;
+
+  dout(10) << __func__ << " trimming to " << trim_to << ", " << to_remove << " states" << dendl;
+  MonitorDBStore::TransactionRef t = paxos.get_pending_transaction();
+  trim(t, first_committed, trim_to);
+  put_first_committed(t, trim_to);
+  cached_first_committed = trim_to;
+
+  // let the service add any extra stuff
+  encode_trim_extra(t, trim_to);
+
+  paxos.trigger_propose();
+}
+
+/**
+ * Queue erasure of versions [from, to) on transaction t, together with any
+ * stored full versions in that range. When mon_compact_on_trim is set,
+ * compaction of the affected key ranges is queued as well.
+ *
+ * Asserts that from != to.
+ */
+void PaxosService::trim(MonitorDBStore::TransactionRef t,
+                        version_t from, version_t to)
+{
+  dout(10) << __func__ << " from " << from << " to " << to << dendl;
+  ceph_assert(from != to);
+
+  version_t v = from;
+  while (v < to) {
+    dout(20) << __func__ << " " << v << dendl;
+    t->erase(get_service_name(), v);
+
+    // only queue the full-version erase when that key is present
+    const string full_key = mon.store->combine_strings("full", v);
+    if (mon.store->exists(get_service_name(), full_key)) {
+      dout(20) << __func__ << " " << full_key << dendl;
+      t->erase(get_service_name(), full_key);
+    }
+    ++v;
+  }
+
+  if (!g_conf()->mon_compact_on_trim) {
+    return;
+  }
+  dout(20) << " compacting prefix " << get_service_name() << dendl;
+  t->compact_range(get_service_name(), stringify(from - 1), stringify(to));
+  t->compact_range(get_service_name(),
+                   mon.store->combine_strings(full_prefix_name, from - 1),
+                   mon.store->combine_strings(full_prefix_name, to));
+}
+
+/**
+ * Load this service's health checks from the "health" prefix of the store.
+ * health_checks is left untouched when nothing is stored.
+ */
+void PaxosService::load_health()
+{
+  bufferlist encoded;
+  mon.store->get("health", service_name, encoded);
+  if (encoded.length() == 0) {
+    return;
+  }
+  using ceph::decode;
+  auto cursor = encoded.cbegin();
+  decode(health_checks, cursor);
+}
diff --git a/src/mon/PaxosService.h b/src/mon/PaxosService.h
new file mode 100644
index 000000000..cf149ae69
--- /dev/null
+++ b/src/mon/PaxosService.h
@@ -0,0 +1,916 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_PAXOSSERVICE_H
+#define CEPH_PAXOSSERVICE_H
+
+#include "include/Context.h"
+#include "Paxos.h"
+#include "Monitor.h"
+#include "MonitorDBStore.h"
+
+/**
+ * A Paxos Service is an abstraction that easily allows one to obtain an
+ * association between a Monitor and a Paxos class, in order to implement any
+ * service.
+ */
+class PaxosService {
+ /**
+ * @defgroup PaxosService_h_class Paxos Service
+ * @{
+ */
+ public:
+ /**
+ * The Monitor to which this class is associated with
+ */
+ Monitor &mon;
+ /**
+ * The Paxos instance to which this class is associated with
+ */
+ Paxos &paxos;
+ /**
+ * Our name. This will be associated with the class implementing us, and will
+ * be used mainly for store-related operations.
+ */
+ std::string service_name;
+ /**
+ * If we are or have queued anything for proposal, this variable will be true
+ * until our proposal has been finished.
+ */
+ bool proposing;
+
+ bool need_immediate_propose = false;
+
+protected:
+ /**
+ * Services implementing us used to depend on the Paxos version, back when
+ * each service would have a Paxos instance for itself. However, now we only
+ * have a single Paxos instance, shared by all the services. Each service now
+ * must keep its own version, if so they wish. This variable should be used
+ * for that purpose.
+ */
+ version_t service_version;
+
+ private:
+ /**
+ * Event callback responsible for proposing our pending value once a timer
+ * runs out and fires.
+ */
+ Context *proposal_timer;
+ /**
+ * If the implementation class has anything pending to be proposed to Paxos,
+ * then have_pending should be true; otherwise, false.
+ */
+ bool have_pending;
+
+ /**
+ * health checks for this service
+ *
+ * Child must populate this during encode_pending() by calling encode_health().
+ */
+ health_check_map_t health_checks;
+protected:
+ /**
+ * format of our state in RocksDB, 0 for default
+ */
+ version_t format_version;
+
+public:
+ // Read-only access to this service's health checks.
+ const health_check_map_t& get_health_checks() const
+ {
+   return health_checks;
+ }
+
+ /**
+ * @defgroup PaxosService_h_callbacks Callback classes
+ * @{
+ */
+ /**
+ * Retry dispatching a given service message
+ *
+ * This callback class is used when we had to wait for some condition to
+ * become true while we were dispatching it.
+ *
+ * For instance, if the message's version isn't readable, according to Paxos,
+ * then we must wait for it to become readable. So, we just queue an
+ * instance of this class onto the Paxos::wait_for_readable function, and
+ * we will retry the whole dispatch again once the callback is fired.
+ */
+ class C_RetryMessage : public C_MonOp {
+   PaxosService *svc;  // service whose dispatch() is retried
+ public:
+   C_RetryMessage(PaxosService *s, MonOpRequestRef op_)
+     : C_MonOp(op_),
+       svc(s)
+   { }
+
+   // Re-dispatch on success or -EAGAIN, do nothing on -ECANCELED, and
+   // abort on any other value.
+   void _finish(int r) override {
+     if (r == -ECANCELED) {
+       return;
+     }
+     if (r >= 0 || r == -EAGAIN) {
+       svc->dispatch(op);
+       return;
+     }
+     ceph_abort_msg("bad C_RetryMessage return value");
+   }
+ };
+
+ /**
+  * Send a prepared reply for an op once the awaited condition completes.
+  *
+  * The reply is sent only when the callback finishes with r >= 0; for any
+  * negative r nothing is sent.
+  *
+  * The op is kept by the C_MonOp base (the same inherited member that
+  * C_RetryMessage uses), so no second copy is stored here.
+  */
+ class C_ReplyOp : public C_MonOp {
+   Monitor &mon;
+   MessageRef reply;
+ public:
+   C_ReplyOp(PaxosService *s, MonOpRequestRef o, MessageRef r) :
+     C_MonOp(o), mon(s->mon), reply(r) { }
+   void _finish(int r) override {
+     if (r >= 0) {
+       mon.send_reply(op, reply.detach());
+     }
+   }
+ };
+
+ /**
+ * @}
+ */
+
+ /**
+ * @param mn A Monitor instance
+ * @param p A Paxos instance
+ * @param name Our service's name.
+ */
+ PaxosService(Monitor &mn, Paxos &p, std::string name)
+   : mon(mn),
+     paxos(p),
+     service_name(name),
+     proposing(false),
+     service_version(0),
+     proposal_timer(nullptr),
+     have_pending(false),
+     format_version(0),
+     // standard store key names
+     last_committed_name("last_committed"),
+     first_committed_name("first_committed"),
+     full_prefix_name("full"),
+     full_latest_name("latest"),
+     // version cache starts empty
+     cached_first_committed(0),
+     cached_last_committed(0)
+ {
+ }
+
+ virtual ~PaxosService() {}
+
+ /**
+ * Get the service's name.
+ *
+ * @returns The service's name.
+ */
+ const std::string& get_service_name() const
+ {
+   return service_name;
+ }
+
+ /**
+ * Get the store prefixes we utilize
+ */
+ virtual void get_store_prefixes(std::set<std::string>& s) const
+ {
+   // by default the only prefix is our own service name
+   s.insert(service_name);
+ }
+
+ // i implement and you ignore
+ /**
+ * Informs this instance that it should consider itself restarted.
+ *
+ * This means that we will cancel our proposal_timer event, if any exists.
+ */
+ void restart();
+ /**
+ * Informs this instance that an election has finished.
+ *
+ * This means that we will complete any callbacks still waiting on a
+ * proposal or a commit with -EAGAIN and then make sure we obtain a new
+ * state. (Discarding the pending state is done by PaxosService::restart.)
+ *
+ * Our state shall be updated by PaxosService::_active if the Paxos is
+ * active; otherwise, we will wait for it to become active by adding a
+ * PaxosService::C_Active callback to it.
+ */
+ void election_finished();
+ /**
+ * Informs this instance that it is supposed to shutdown.
+ *
+ * Basically, it will instruct Paxos to cancel all events/callbacks and then
+ * will cancel the proposal_timer event if any exists.
+ */
+ void shutdown();
+
+private:
+ /**
+ * Update our state by updating it from Paxos, and then creating a new
+ * pending state if need be.
+ *
+ * @remarks We only create a pending state when our Monitor is the Leader.
+ *
+ * @pre Paxos is active
+ * @post have_pending is true if our Monitor is the Leader and Paxos is
+ * active
+ */
+ void _active();
+
+public:
+ /**
+ * Propose a new value through Paxos.
+ *
+ * This function should be called by the classes implementing
+ * PaxosService, in order to propose a new value through Paxos.
+ *
+ * @pre The implementation class implements the encode_pending function.
+ * @pre have_pending is true
+ * @pre Our monitor is the Leader
+ * @pre Paxos is active
+ * @post Cancel the proposal timer, if any
+ * @post have_pending is false
+ * @post propose pending value through Paxos
+ *
+ * @note This function depends on the implementation of encode_pending on
+ * the class that is implementing PaxosService
+ */
+ void propose_pending();
+
+ /**
+ * Let others request us to propose.
+ *
+ * At the moment, this is just a wrapper to propose_pending() with an
+ * extra check for is_writeable(), but it's a good practice to dissociate
+ * requests for proposals from direct usage of propose_pending() for
+ * future use -- we might want to perform additional checks or put a
+ * request on hold, for instance.
+ */
+ void request_proposal()
+ {
+   // callers must only ask for a proposal while we are writeable
+   ceph_assert(is_writeable());
+   propose_pending();
+ }
+ /**
+ * Request service @p other to perform a proposal.
+ *
+ * We could simply use the function above, requesting @p other directly,
+ * but we might eventually want to do something to the request -- say,
+ * set a flag stating we're waiting on a cross-proposal to be finished.
+ */
+ void request_proposal(PaxosService *other)
+ {
+   // the target must exist and be writeable before we delegate to it
+   ceph_assert(other != nullptr);
+   ceph_assert(other->is_writeable());
+   other->request_proposal();
+ }
+
+ /**
+ * Dispatch a message by passing it to several different functions that are
+ * either implemented directly by this service, or that should be implemented
+ * by the class implementing this service.
+ *
+ * @param m A message
+ * @returns always 'true': the message was answered, dropped, forwarded, queued for retry, or applied.
+ */
+ bool dispatch(MonOpRequestRef op);
+
+ void refresh(bool *need_bootstrap);
+ void post_refresh();
+
+ /**
+ * @defgroup PaxosService_h_override_funcs Functions that should be
+ * overridden.
+ *
+ * These functions should be overridden at will by the class implementing
+ * this service.
+ * @{
+ */
+ /**
+ * Create the initial state for your system.
+ *
+ * In some of ours the state is actually set up elsewhere so this does
+ * nothing.
+ */
+ virtual void create_initial() = 0;
+
+ /**
+ * Query the Paxos system for the latest state and apply it if it's newer
+ * than the current Monitor state.
+ */
+ virtual void update_from_paxos(bool *need_bootstrap) = 0;
+
+ /**
+ * Hook called after all services have refreshed their state from paxos
+ *
+ * This is useful for doing any update work that depends on other
+ * service's having up-to-date state.
+ */
+ virtual void post_paxos_update() {}
+
+ /**
+ * Init on startup
+ *
+ * This is called on mon startup, after all of the PaxosService instances'
+ * update_from_paxos() methods have been called
+ */
+ virtual void init() {}
+
+ /**
+ * Create the pending state.
+ *
+ * @invariant This function is only called on a Leader.
+ * @remarks This created state is then modified by incoming messages.
+ * @remarks Called at startup and after every Paxos ratification round.
+ */
+ virtual void create_pending() = 0;
+
+ /**
+ * Encode the pending state into a ceph::buffer::list for ratification and
+ * transmission as the next state.
+ *
+ * @invariant This function is only called on a Leader.
+ *
+ * @param t The transaction to hold all changes.
+ */
+ virtual void encode_pending(MonitorDBStore::TransactionRef t) = 0;
+
+ /**
+ * Discard the pending state
+ *
+ * @invariant This function is only called on a Leader.
+ *
+ * @remarks This function is NOT overridden in any of our code, but it is
+ * called in PaxosService::restart if have_pending is
+ * true.
+ */
+ virtual void discard_pending() { }
+
+ /**
+ * Look at the query; if the query can be handled without changing state,
+ * do so.
+ *
+ * @param m A query message
+ * @returns 'true' if the query was handled (e.g., was a read that got
+ * answered, was a state change that has no effect); 'false'
+ * otherwise.
+ */
+ virtual bool preprocess_query(MonOpRequestRef op) = 0;
+
+ /**
+ * Apply the message to the pending state.
+ *
+ * @invariant This function is only called on a Leader.
+ *
+ * @param m An update message
+ * @returns 'true' if the pending state should be proposed; 'false' otherwise.
+ */
+ virtual bool prepare_update(MonOpRequestRef op) = 0;
+ /**
+ * @}
+ */
+
+ /**
+ * Determine if the Paxos system should vote on pending, and if so how long
+ * it should wait to vote.
+ *
+ * @param[out] delay The wait time, used so we can limit the update traffic
+ * spamming.
+ * @returns 'true' if the Paxos system should propose; 'false' otherwise.
+ */
+ virtual bool should_propose(double &delay);
+
+ /**
+ * force an immediate propose.
+ *
+ * This is meant to be called from prepare_update(op).
+ */
+ void force_immediate_propose()
+ {
+   // dispatch() checks this flag; propose_pending() resets it
+   need_immediate_propose = true;
+ }
+
+ /**
+ * @defgroup PaxosService_h_courtesy Courtesy functions
+ *
+ * Courtesy functions, in case the class implementing this service has
+ * anything it wants/needs to do at these times.
+ * @{
+ */
+ /**
+ * This is called when the Paxos state goes to active.
+ *
+ * On the peon, this is after each election.
+ * On the leader, this is after each election, *and* after each completed
+ * proposal.
+ *
+ * @note This function may get called twice in certain recovery cases.
+ */
+ virtual void on_active() { }
+
+ /**
+ * This is called when we are shutting down
+ */
+ virtual void on_shutdown() {}
+
+ /**
+ * this is called when activating on the leader
+ *
+ * it should conditionally upgrade the on-disk format by proposing a transaction
+ */
+ virtual void upgrade_format() { }
+
+ /**
+ * this is called when we detect the store has just upgraded underneath us
+ */
+ virtual void on_upgrade() {}
+
+ /**
+ * Called when the Paxos system enters a Leader election.
+ *
+ * @remarks It's a courtesy method, in case the class implementing this
+ * service has anything it wants/needs to do at that time.
+ */
+ virtual void on_restart() { }
+ /**
+ * @}
+ */
+
+ /**
+ * Tick.
+ */
+ virtual void tick() {}
+
+ // Encode the next set of health checks into t under the "health" prefix
+ // (keyed by service name) and hand the new and current sets to
+ // mon.log_health().
+ void encode_health(const health_check_map_t& next,
+                    MonitorDBStore::TransactionRef t)
+ {
+   ceph::buffer::list encoded;
+   {
+     using ceph::encode;
+     encode(next, encoded);
+   }
+   t->put("health", service_name, encoded);
+   mon.log_health(next, health_checks, t);
+ }
+ void load_health();
+
+ /**
+ * @defgroup PaxosService_h_store_keys Set of keys that are usually used on
+ * all the services implementing this
+ * class, and, being almost the only keys
+ * used, should be standardized to avoid
+ * mistakes.
+ * @{
+ */
+ const std::string last_committed_name;
+ const std::string first_committed_name;
+ const std::string full_prefix_name;
+ const std::string full_latest_name;
+ /**
+ * @}
+ */
+
+ private:
+ /**
+ * @defgroup PaxosService_h_version_cache Variables holding cached values
+ * for the most used versions (first
+ * and last committed); we only have
+ * to read them when the store is
+ * updated, so in-between updates we
+ * may very well use cached versions
+ * and avoid the overhead.
+ * @{
+ */
+ version_t cached_first_committed;
+ version_t cached_last_committed;
+ /**
+ * @}
+ */
+
+ /**
+ * Callback list to be used for waiting for the next proposal to commit.
+ */
+ std::vector<Context*> waiting_for_commit;
+
+ /**
+ * Callback list to be used whenever we are running a proposal through
+ * Paxos. These callbacks will be awaken whenever the said proposal
+ * finishes **and** the PaxosService is active.
+ */
+ std::vector<Context*> waiting_for_finished_proposal;
+
+ public:
+
+ /**
+ * Check if we are proposing a value through Paxos
+ *
+ * @returns true if we are proposing; false otherwise.
+ */
+ bool is_proposing() const
+ {
+   // plain read of the proposing flag
+   return proposing;
+ }
+
+ /**
+ * Check if we are in the Paxos ACTIVE state.
+ *
+ * @note This function is a wrapper for Paxos::is_active
+ *
+ * @returns true if in state ACTIVE; false otherwise.
+ */
+ bool is_active() const
+ {
+   // never active while one of our own proposals is in flight
+   if (is_proposing()) {
+     return false;
+   }
+   return paxos.is_active() || paxos.is_updating() || paxos.is_writing();
+ }
+
+ /**
+ * Check if we are readable.
+ *
+ * This mirrors the paxos check, except that we also verify that
+ *
+ * - the client hasn't seen the future relative to this PaxosService
+ * - this service isn't proposing.
+ * - we have committed our initial state (last_committed > 0)
+ *
+ * @param ver The version we want to check if is readable
+ * @returns true if it is readable; false otherwise
+ */
+ bool is_readable(version_t ver = 0) const
+ {
+   // readable iff ver is not ahead of us, paxos is readable, and we have
+   // committed at least one version (checks kept in the original order)
+   return ver <= get_last_committed() &&
+          paxos.is_readable(0) &&
+          get_last_committed() != 0;
+ }
+
+ /**
+ * Check if we are writeable.
+ *
+ * We consider to be writeable iff:
+ *
+ * - we are not proposing a new version;
+ * - we are ready to be written to -- i.e., we have a pending value.
+ * - paxos is (active or updating or writing)
+ *
+ * @returns true if writeable; false otherwise
+ */
+ bool is_writeable() const
+ {
+   if (!is_active()) {
+     return false;
+   }
+   return have_pending;
+ }
+
+ /**
+ * Wait for a proposal to commit.
+ *
+ * Note: the proposal may not be signaled yet. This simply adds a context to
+ * be completed when the next proposal commits.
+ *
+ * @param c The callback to be awaken once the proposal is committed.
+ */
+ void wait_for_commit(MonOpRequestRef op, Context *c)
+ {
+   // tag the op (if there is one) before queueing the callback
+   if (op) {
+     op->mark_event(service_name + ":wait_for_commit");
+   }
+   waiting_for_commit.push_back(c);
+ }
+
+ /**
+ * Wait for a proposal to finish and PaxosService to become active.
+ *
+ * Add a callback to be awaken whenever our current proposal finishes being
+ * proposed through Paxos.
+ *
+ * @param c The callback to be awaken once the proposal is finished.
+ */
+ void wait_for_finished_proposal(MonOpRequestRef op, Context *c)
+ {
+   // tag the op (if there is one) before queueing the callback
+   if (op) {
+     op->mark_event(service_name + ":wait_for_finished_proposal");
+   }
+   waiting_for_finished_proposal.push_back(c);
+ }
+
+
+ /**
+ * Wait for us to become active
+ *
+ * @param c The callback to be awaken once we become active.
+ */
+ void wait_for_active(MonOpRequestRef op, Context *c)
+ {
+   if (op) {
+     op->mark_event(service_name + ":wait_for_active");
+   }
+
+   // while proposing we wait on our own list; otherwise on Paxos
+   if (is_proposing()) {
+     wait_for_finished_proposal(op, c);
+     return;
+   }
+   paxos.wait_for_active(op, c);
+ }
+ // Same as wait_for_active(), for callers that have no op to tag.
+ void wait_for_active_ctx(Context *c)
+ {
+   MonOpRequestRef no_op;
+   wait_for_active(no_op, c);
+ }
+
+ /**
+ * Wait for us to become readable
+ *
+ * @param c The callback to be awaken once we become readable.
+ * @param ver The version we want to wait on.
+ */
+ void wait_for_readable(MonOpRequestRef op, Context *c, version_t ver = 0)
+ {
+   /* This is somewhat of a hack. A version is only checked for readability
+    * in PaxosService::dispatch(); if that check is why we are not readable,
+    * we must wait on PaxosService and not on Paxos. Otherwise we may assert
+    * in Paxos::wait_for_readable() if Paxos happens to be readable at that
+    * specific point in time.
+    */
+   if (op) {
+     op->mark_event(service_name + ":wait_for_readable");
+   }
+
+   const bool wait_on_service =
+     is_proposing() ||
+     ver > get_last_committed() ||
+     get_last_committed() == 0;
+   if (wait_on_service) {
+     wait_for_finished_proposal(op, c);
+     return;
+   }
+
+   if (op) {
+     op->mark_event(service_name + ":wait_for_readable/paxos");
+   }
+   paxos.wait_for_readable(op, c);
+ }
+
+ // Same as wait_for_readable(), for callers that have no op to tag.
+ void wait_for_readable_ctx(Context *c, version_t ver = 0)
+ {
+   MonOpRequestRef no_op; // will initialize the shared_ptr to NULL
+   wait_for_readable(no_op, c, ver);
+ }
+
+ /**
+ * Wait for us to become writeable
+ *
+ * @param c The callback to be awaken once we become writeable.
+ */
+ void wait_for_writeable(MonOpRequestRef op, Context *c)
+ {
+   if (op) {
+     op->mark_event(service_name + ":wait_for_writeable");
+   }
+
+   // pick the wait list that matches the reason we are not writeable
+   if (is_proposing()) {
+     wait_for_finished_proposal(op, c);
+     return;
+   }
+   if (!is_writeable()) {
+     wait_for_active(op, c);
+     return;
+   }
+   paxos.wait_for_writeable(op, c);
+ }
+ // Same as wait_for_writeable(), for callers that have no op to tag.
+ void wait_for_writeable_ctx(Context *c)
+ {
+   MonOpRequestRef no_op;
+   wait_for_writeable(no_op, c);
+ }
+
+
+ /**
+ * @defgroup PaxosService_h_Trim Functions for trimming states
+ * @{
+ */
+ /**
+ * trim service states if appropriate
+ *
+ * Called at same interval as tick()
+ */
+ void maybe_trim();
+
+ /**
+ * Auxiliary function to trim our state from version @p from to version
+ * @p to, not including; i.e., the interval [from, to[
+ *
+ * @param t The transaction to which we will add the trim operations.
+ * @param from the lower limit of the interval to be trimmed
+ * @param to the upper limit of the interval to be trimmed (not including)
+ */
+ void trim(MonitorDBStore::TransactionRef t, version_t from, version_t to);
+
+ /**
+ * encode service-specific extra bits into trim transaction
+ *
+ * @param tx transaction
+ * @param first new first_committed value
+ */
+ // Default implementation is a no-op: nothing is added to @p tx and
+ // @p first is ignored.
+ virtual void encode_trim_extra(MonitorDBStore::TransactionRef tx,
+ version_t first) {}
+
+ /**
+ * Get the version we should trim to.
+ *
+ * Should be overloaded by service if it wants to trim states.
+ *
+ * @returns the version we should trim to; if we return zero, it should be
+ * assumed that there's no version to trim to.
+ */
+ virtual version_t get_trim_to() const {
+ // Default: report zero, i.e. there is no version to trim to.
+ return 0;
+ }
+
+ /**
+ * @}
+ */
+ /**
+ * @defgroup PaxosService_h_Stash_Full
+ * @{
+ */
+ virtual bool should_stash_full();
+ /**
+ * Encode a full version on @p t
+ *
+ * @note We force every service to implement this function, since we strongly
+ * desire the encoding of full versions.
+ * @note Services that do not trim their state, will be bound to only create
+ * one full version. Full version stashing is determined/controlled by
+ * trimming: we stash a version each time a trim is bound to erase the
+ * latest full version.
+ *
+ * @param t Transaction on which the full version shall be encoded.
+ */
+ virtual void encode_full(MonitorDBStore::TransactionRef t) = 0;
+
+ /**
+ * @}
+ */
+
+ /**
+ * Cancel events.
+ *
+ * @note This function is a wrapper for Paxos::cancel_events
+ */
+ void cancel_events() {
+ // Pure forwarding call; no service-level state is touched here.
+ paxos.cancel_events();
+ }
+
+ /**
+ * @defgroup PaxosService_h_store_funcs Back storage interface functions
+ * @{
+ */
+ /**
+ * @defgroup PaxosService_h_store_modify Wrapper function interface to access
+ * the back store for modification
+ * purposes
+ * @{
+ */
+ /**
+ * Set the first committed version to @p ver
+ *
+ * @param t A transaction to which we add this put operation
+ * @param ver The first committed version number being put
+ */
+ void put_first_committed(MonitorDBStore::TransactionRef t, version_t ver) {
+ t->put(get_service_name(), first_committed_name, ver);
+ }
+ /**
+ * Set the last committed version to @p ver
+ *
+ * @param t A transaction to which we add this put operation
+ * @param ver The last committed version number being put
+ */
+ void put_last_committed(MonitorDBStore::TransactionRef t, version_t ver) {
+   t->put(get_service_name(), last_committed_name, ver);
+
+   /* This is only needed once, when we are about to make our first
+    * proposal. Some services rely on first_committed being set -- and it
+    * should be! -- so we must guarantee that it is, especially because the
+    * services do not set it themselves. They rely on it, but expect us to
+    * deal with it, and so we shall.
+    */
+   if (get_first_committed() == 0) {
+     put_first_committed(t, ver);
+   }
+ }
+ /**
+ * Put the contents of @p bl into version @p ver
+ *
+ * @param t A transaction to which we will add this put operation
+ * @param ver The version to which we will add the value
+ * @param bl A ceph::buffer::list containing the version's value
+ */
+ void put_version(MonitorDBStore::TransactionRef t, version_t ver,
+ ceph::buffer::list& bl) {
+ // The put is only queued on @p t, under this service's name and keyed
+ // by the numeric version.
+ t->put(get_service_name(), ver, bl);
+ }
+ /**
+ * Put the contents of @p bl into a full version key for this service, that
+ * will be created with @p ver in mind.
+ *
+ * @param t The transaction to which we will add this put operation
+ * @param ver A version number
+ * @param bl A ceph::buffer::list containing the version's value
+ */
+ void put_version_full(MonitorDBStore::TransactionRef t,
+                       version_t ver, ceph::buffer::list& bl) {
+   // The key is built by combining the full-version prefix with @p ver.
+   const std::string full_key =
+     mon.store->combine_strings(full_prefix_name, ver);
+   t->put(get_service_name(), full_key, bl);
+ }
+ /**
+ * Put the version number in @p ver into the key pointing to the latest full
+ * version of this service.
+ *
+ * @param t The transaction to which we will add this put operation
+ * @param ver A version number
+ */
+ void put_version_latest_full(MonitorDBStore::TransactionRef t, version_t ver) {
+   // The key is the full-version prefix combined with the "latest" name;
+   // the value stored under it is the version number itself.
+   const std::string latest_key =
+     mon.store->combine_strings(full_prefix_name, full_latest_name);
+   t->put(get_service_name(), latest_key, ver);
+ }
+ /**
+ * Put the contents of @p bl into the key @p key.
+ *
+ * @param t A transaction to which we will add this put operation
+ * @param key The key to which we will add the value
+ * @param bl A ceph::buffer::list containing the value
+ */
+ void put_value(MonitorDBStore::TransactionRef t,
+ const std::string& key, ceph::buffer::list& bl) {
+ // Queued on @p t under this service's name; @p key is used as given.
+ t->put(get_service_name(), key, bl);
+ }
+
+ /**
+ * Put integer value @p v into the key @p key.
+ *
+ * @param t A transaction to which we will add this put operation
+ * @param key The key to which we will add the value
+ * @param v An integer
+ */
+ void put_value(MonitorDBStore::TransactionRef t,
+ const std::string& key, version_t v) {
+ // Integer overload: queued on @p t under this service's name.
+ t->put(get_service_name(), key, v);
+ }
+
+ /**
+ * @}
+ */
+
+ /**
+ * @defgroup PaxosService_h_store_get Wrapper function interface to access
+ * the back store for reading purposes
+ * @{
+ */
+
+ /**
+ * @defgroup PaxosService_h_version_cache Obtain cached versions for this
+ * service.
+ * @{
+ */
+ /**
+ * Get the first committed version
+ *
+ * @returns Our first committed version (that is available)
+ */
+ version_t get_first_committed() const{
+ // Returns the cached member; this accessor does not read the store.
+ return cached_first_committed;
+ }
+ /**
+ * Get the last committed version
+ *
+ * @returns Our last committed version
+ */
+ version_t get_last_committed() const{
+ // Returns the cached member; this accessor does not read the store.
+ return cached_last_committed;
+ }
+
+ /**
+ * @}
+ */
+
+ /**
+ * Get the contents of a given version @p ver
+ *
+ * @param ver The version being obtained
+ * @param bl The ceph::buffer::list to be populated
+ * @return 0 on success; <0 otherwise
+ */
+ virtual int get_version(version_t ver, ceph::buffer::list& bl) {
+ // Read from the monitor store under this service's name, keyed by @p ver;
+ // the store's return code is passed through unchanged.
+ return mon.store->get(get_service_name(), ver, bl);
+ }
+ /**
+ * Get the contents of a given full version of this service.
+ *
+ * @param ver A version number
+ * @param bl The ceph::buffer::list to be populated
+ * @returns 0 on success; <0 otherwise
+ */
+ virtual int get_version_full(version_t ver, ceph::buffer::list& bl) {
+   // Same key construction as used when a full version is written:
+   // the full-version prefix combined with @p ver.
+   const std::string full_key =
+     mon.store->combine_strings(full_prefix_name, ver);
+   return mon.store->get(get_service_name(), full_key, bl);
+ }
+ /**
+ * Get the latest full version number
+ *
+ * @returns A version number
+ */
+ version_t get_version_latest_full() {
+   // Look up the integer stored under the "latest full version" key.
+   const std::string latest_key =
+     mon.store->combine_strings(full_prefix_name, full_latest_name);
+   return mon.store->get(get_service_name(), latest_key);
+ }
+
+ /**
+ * Get a value from a given key.
+ *
+ * @param[in] key The key
+ * @param[out] bl The ceph::buffer::list to be populated with the value
+ */
+ int get_value(const std::string& key, ceph::buffer::list& bl) {
+ // Returns whatever the store's get() returns for this service/key pair.
+ return mon.store->get(get_service_name(), key, bl);
+ }
+ /**
+ * Get an integer value from a given key.
+ *
+ * @param[in] key The key
+ */
+ version_t get_value(const std::string& key) {
+ // Integer overload: the value comes straight from the store's two-argument get().
+ return mon.store->get(get_service_name(), key);
+ }
+
+ /**
+ * @}
+ */
+ /**
+ * @}
+ */
+};
+
+#endif
diff --git a/src/mon/Session.h b/src/mon/Session.h
new file mode 100644
index 000000000..e090e7424
--- /dev/null
+++ b/src/mon/Session.h
@@ -0,0 +1,295 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MON_SESSION_H
+#define CEPH_MON_SESSION_H
+
+#include <string>
+#include <string_view>
+
+#include "include/utime.h"
+#include "include/xlist.h"
+
+#include "global/global_context.h"
+#include "msg/msg_types.h"
+#include "mon/mon_types.h"
+
+#include "auth/AuthServiceHandler.h"
+#include "osd/OSDMap.h"
+
+#include "MonCap.h"
+
+struct MonSession;
+
+ /// One subscription of a monitor session to a named map/type.
+ struct Subscription {
+   MonSession *session;                  ///< session that owns this subscription
+   std::string type;                     ///< what is being subscribed to
+   xlist<Subscription*>::item type_item; ///< hook into the per-type list
+   version_t next = 0;
+   bool onetime = false;
+   bool incremental_onetime = false;     // has CEPH_FEATURE_INCSUBOSDMAP
+
+   Subscription(MonSession *s, const std::string& t)
+     : session(s),
+       type(t),
+       type_item(this) {}
+ };
+
+ /// Per-connection session state kept by the monitor.
+ struct MonSession : public RefCountedObject {
+   ConnectionRef con;
+   int con_type = 0;
+   uint64_t con_features = 0; // zero if AnonConnection
+   entity_name_t name;
+   entity_addrvec_t addrs;
+   entity_addr_t socket_addr;
+   utime_t session_timeout;
+   bool closed = false;
+   xlist<MonSession*>::item item;
+   std::set<uint64_t> routed_request_tids;
+   MonCap caps;
+   bool validated_stretch_connection = false;
+
+   bool authenticated = false; ///< true if auth handshake is complete
+
+   std::map<std::string, Subscription*> sub_map;
+   epoch_t osd_epoch = 0; ///< the osdmap epoch sent to the mon client
+
+   AuthServiceHandler *auth_handler = nullptr;
+   EntityName entity_name;
+   uint64_t global_id = 0;
+   global_id_status_t global_id_status = global_id_status_t::NONE;
+
+   ConnectionRef proxy_con;
+   uint64_t proxy_tid = 0;
+
+   std::string remote_host; ///< remote host name
+   std::map<std::string,std::string,std::less<>> last_config; ///< most recently shared config
+   bool any_config = false;
+
+   MonSession(Connection *c)
+     : RefCountedObject(g_ceph_context),
+       con(c),
+       item(this) { }
+
+   /// Record who the peer is, taking type, socket address and features from
+   /// the connection this session was created with.
+   void _ident(const entity_name_t& n, const entity_addrvec_t& av) {
+     name = n;
+     addrs = av;
+     con_type = con->get_peer_type();
+     socket_addr = con->get_peer_socket_addr();
+
+     // only fill in features if this is a non-anonymous connection
+     const bool has_messenger = con->get_messenger();
+     if (has_messenger) {
+       con_features = con->get_features();
+     }
+   }
+
+   ~MonSession() override {
+     //generic_dout(0) << "~MonSession " << this << dendl;
+     // By the time we are destroyed we must already be off the session list
+     // and hold no subscriptions; see MonSessionMap::remove_session().
+     ceph_assert(!item.is_on_list());
+     ceph_assert(sub_map.empty());
+     delete auth_handler;
+   }
+
+   /// Check @p mask (MON_CAP_R / MON_CAP_W / MON_CAP_X bits) for @p service
+   /// against this session's caps, with an empty command and no arguments.
+   bool is_capable(std::string service, int mask) {
+     std::map<std::string,std::string> no_args;
+     return caps.is_capable(
+       g_ceph_context,
+       entity_name,
+       service, "", no_args,
+       mask & MON_CAP_R, mask & MON_CAP_W, mask & MON_CAP_X,
+       get_peer_socket_addr());
+   }
+
+   std::vector<std::string> get_allowed_fs_names() const {
+     return caps.allowed_fs_names();
+   }
+
+   bool fs_name_capable(std::string_view fsname, __u8 mask) {
+     return caps.fs_name_capable(entity_name, fsname, mask);
+   }
+
+   /// Socket address captured by _ident().
+   const entity_addr_t& get_peer_socket_addr() {
+     return socket_addr;
+   }
+
+   /// Emit the session's fields to @p f; key names and order are part of
+   /// the output and are kept as-is.
+   void dump(ceph::Formatter *f) const {
+     f->dump_stream("name") << name;
+     f->dump_stream("entity_name") << entity_name;
+     f->dump_object("addrs", addrs);
+     f->dump_object("socket_addr", socket_addr);
+     f->dump_string("con_type", ceph_entity_type_name(con_type));
+
+     f->dump_unsigned("con_features", con_features);
+     f->dump_stream("con_features_hex") << std::hex << con_features << std::dec;
+     const auto release = ceph_release_from_features(con_features);
+     f->dump_string("con_features_release", ceph_release_name(release));
+
+     f->dump_bool("open", !closed);
+     f->dump_object("caps", caps);
+     f->dump_bool("authenticated", authenticated);
+     f->dump_unsigned("global_id", global_id);
+     f->dump_stream("global_id_status") << global_id_status;
+     f->dump_unsigned("osd_epoch", osd_epoch);
+     f->dump_string("remote_host", remote_host);
+   }
+ };
+
+
+ /// Index of all open monitor sessions, their subscriptions (by type) and
+ /// the OSD sessions (by OSD id), plus a feature histogram of the peers.
+ struct MonSessionMap {
+   xlist<MonSession*> sessions;
+   std::map<std::string, xlist<Subscription*>* > subs;
+   std::multimap<int, MonSession*> by_osd;
+   FeatureMap feature_map; // type -> features -> count
+
+   MonSessionMap() {}
+   ~MonSessionMap() {
+     // Every per-type list must already be empty; only the list objects
+     // themselves are freed here.
+     while (!subs.empty()) {
+       ceph_assert(subs.begin()->second->empty());
+       delete subs.begin()->second;
+       subs.erase(subs.begin());
+     }
+   }
+
+   unsigned get_size() const {
+     return sessions.size();
+   }
+
+   /**
+    * Detach @p s from every index, free its subscriptions, mark it closed
+    * and drop the reference taken by add_session().
+    *
+    * @param s an open session (asserted)
+    */
+   void remove_session(MonSession *s) {
+     ceph_assert(!s->closed);
+     for (auto& [type, sub] : s->sub_map) {
+       sub->type_item.remove_myself();
+       delete sub;
+     }
+     s->sub_map.clear();
+     s->item.remove_myself();
+     if (s->name.is_osd() &&
+         s->name.num() >= 0) {
+       // Bound the walk with equal_range so that we never dereference
+       // by_osd.end(), neither when the id is absent nor when its entries
+       // are the last ones in the multimap.
+       auto range = by_osd.equal_range(s->name.num());
+       for (auto p = range.first; p != range.second; ++p) {
+         if (p->second == s) {
+           by_osd.erase(p);
+           break;
+         }
+       }
+     }
+     if (s->con_features) {
+       feature_map.rm(s->con_type, s->con_features);
+     }
+     s->closed = true;
+     s->put();
+   }
+
+   /**
+    * Allocate a session for connection @p c, identify it and register it.
+    *
+    * @returns the new session (also referenced by this map)
+    */
+   MonSession *new_session(const entity_name_t& n,
+                           const entity_addrvec_t& av,
+                           Connection *c) {
+     MonSession *s = new MonSession(c);
+     ceph_assert(s);
+     s->_ident(n, av);
+     add_session(s);
+     return s;
+   }
+
+   /// Register @p s: set its timeout from mon_session_timeout, take a
+   /// reference, and add it to the session list and the secondary indexes.
+   void add_session(MonSession *s) {
+     s->session_timeout = ceph_clock_now();
+     s->session_timeout += g_conf()->mon_session_timeout;
+
+     sessions.push_back(&s->item);
+     s->get();
+     if (s->name.is_osd() &&
+         s->name.num() >= 0) {
+       by_osd.insert(std::pair<int,MonSession*>(s->name.num(), s));
+     }
+     if (s->con_features) {
+       feature_map.add(s->con_type, s->con_features);
+     }
+   }
+
+   /**
+    * Pick an OSD session starting from a pseudo-random position.
+    *
+    * @param osdmap if non-null, only sessions of OSDs that are up in this
+    *               map are considered
+    * @returns a session, or nullptr if there is none to return
+    */
+   MonSession *get_random_osd_session(OSDMap *osdmap) {
+     // ok, this isn't actually random, but close enough.
+     if (by_osd.empty())
+       return nullptr;
+     int n = by_osd.rbegin()->first + 1;
+     int r = rand() % n;
+
+     auto p = by_osd.lower_bound(r);
+     if (p == by_osd.end())
+       --p;
+
+     if (!osdmap) {
+       return p->second;
+     }
+
+     MonSession *s = nullptr;
+
+     // Search outwards from p in both directions, one step each per round.
+     auto b = p;
+     auto f = p;
+     bool backward = true, forward = true;
+     while (backward || forward) {
+       if (backward) {
+         if (osdmap->is_up(b->first) &&
+             osdmap->get_addrs(b->first) == b->second->con->get_peer_addrs()) {
+           s = b->second;
+           break;
+         }
+         if (b != by_osd.begin())
+           --b;
+         else
+           backward = false;
+       }
+
+       forward = (f != by_osd.end());
+       if (forward) {
+         // NOTE(review): unlike the backward branch, this one does not
+         // compare the osdmap addrs with the connection's peer addrs --
+         // confirm whether that asymmetry is intended.
+         if (osdmap->is_up(f->first)) {
+           s = f->second;
+           break;
+         }
+         ++f;
+       }
+     }
+
+     return s;
+   }
+
+   /**
+    * Create the subscription of @p s to @p what, or update the existing one.
+    *
+    * @param start                version stored as the subscription's next
+    * @param onetime              stored as-is
+    * @param incremental_onetime  only takes effect together with @p onetime
+    */
+   void add_update_sub(MonSession *s, const std::string& what, version_t start, bool onetime, bool incremental_onetime) {
+     Subscription *sub = nullptr;
+     auto existing = s->sub_map.find(what);
+     if (existing != s->sub_map.end()) {
+       sub = existing->second;
+     } else {
+       sub = new Subscription(s, what);
+       s->sub_map[what] = sub;
+
+       if (!subs.count(what))
+         subs[what] = new xlist<Subscription*>;
+       subs[what]->push_back(&sub->type_item);
+     }
+     sub->next = start;
+     sub->onetime = onetime;
+     sub->incremental_onetime = onetime && incremental_onetime;
+   }
+
+   /// Unlink @p sub from its session and from the per-type list, then free it.
+   void remove_sub(Subscription *sub) {
+     sub->session->sub_map.erase(sub->type);
+     sub->type_item.remove_myself();
+     delete sub;
+   }
+ };
+
+ /// One-line summary of a session: name, addresses, open/closed, caps and
+ /// the connection features (hex) with the release they map to.
+ inline std::ostream& operator<<(std::ostream& out, const MonSession& s)
+ {
+   const char *state = s.closed ? "closed" : "open";
+   out << "MonSession(" << s.name << " " << s.addrs;
+   out << " is " << state;
+   out << " " << s.caps;
+   out << ", features 0x" << std::hex << s.con_features << std::dec;
+   out << " (" << ceph_release_name(ceph_release_from_features(s.con_features));
+   out << "))";
+   return out;
+ }
+
+#endif
diff --git a/src/mon/error_code.cc b/src/mon/error_code.cc
new file mode 100644
index 000000000..a2cd39299
--- /dev/null
+++ b/src/mon/error_code.cc
@@ -0,0 +1,85 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 Red Hat <contact@redhat.com>
+ * Author: Adam C. Emerson <aemerson@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <string>
+
+#include "common/error_code.h"
+#include "common/errno.h"
+#include "error_code.h"
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wnon-virtual-dtor"
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wnon-virtual-dtor"
+
+namespace bs = boost::system;
+
+class mon_error_category : public ceph::converting_category {
+public:
+ mon_error_category(){}
+ const char* name() const noexcept override;
+ const char* message(int ev, char*, std::size_t) const noexcept override;
+ std::string message(int ev) const override;
+ bs::error_condition default_error_condition(int ev) const noexcept
+ override;
+ bool equivalent(int ev, const bs::error_condition& c) const
+ noexcept override;
+ using ceph::converting_category::equivalent;
+ int from_code(int ev) const noexcept override;
+};
+
+ const char* mon_error_category::name() const noexcept {
+ // Fixed category name string.
+ return "mon";
+ }
+
+ // Buffer-based message lookup. For ev == 0 a string literal is returned and
+ // @p buf is left untouched; otherwise the text for @p ev is copied into
+ // @p buf, truncated to len - 1 characters and NUL-terminated. With len == 0
+ // nothing is written and @p buf is returned as passed in.
+ const char* mon_error_category::message(int ev, char* buf,
+                                         std::size_t len) const noexcept {
+   if (ev == 0)
+     return "No error";
+
+   if (len == 0)
+     return buf;
+
+   const auto text = cpp_strerror(ev);
+   const auto copied = text.copy(buf, len - 1);
+   buf[copied] = '\0';
+   return buf;
+ }
+
+std::string mon_error_category::message(int ev) const {
+ if (ev == 0)
+ return "No error";
+
+ return cpp_strerror(ev);
+}
+
+ // The value is carried over unchanged into the generic category.
+ bs::error_condition
+ mon_error_category::default_error_condition(int ev) const noexcept {
+   return bs::error_condition(ev, bs::generic_category());
+ }
+
+ // A code is equivalent to @p c exactly when its default condition equals @p c.
+ bool mon_error_category::equivalent(int ev,
+                                     const bs::error_condition& c) const noexcept {
+   const bs::error_condition mine = default_error_condition(ev);
+   return mine == c;
+ }
+
+ int mon_error_category::from_code(int ev) const noexcept {
+ // Returns the arithmetic negation of the value.
+ return -ev;
+ }
+
+ // Accessor for the single mon_error_category object, held as a
+ // function-local static.
+ const bs::error_category& mon_category() noexcept {
+   static const mon_error_category instance;
+   return instance;
+ }
+#pragma GCC diagnostic pop
+#pragma clang diagnostic pop
diff --git a/src/mon/error_code.h b/src/mon/error_code.h
new file mode 100644
index 000000000..2a6e88061
--- /dev/null
+++ b/src/mon/error_code.h
@@ -0,0 +1,49 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 Red Hat <contact@redhat.com>
+ * Author: Adam C. Emerson <aemerson@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include <boost/system/error_code.hpp>
+
+#include "include/rados.h"
+
+const boost::system::error_category& mon_category() noexcept;
+
+// The Monitor, like the OSD, mostly replies with POSIX error codes.
+
+ // No enumerators are defined; the enum exists as the type that the trait
+ // specializations and make_error_code()/make_error_condition() in this
+ // header are declared for.
+ enum class mon_errc {
+ };
+
+namespace boost::system {
+template<>
+struct is_error_code_enum<::mon_errc> {
+ static const bool value = true;
+};
+
+template<>
+struct is_error_condition_enum<::mon_errc> {
+ static const bool value = false;
+};
+}
+
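+ // implicit conversion: mon_errc is registered as an error code enum, so
+ // boost::system::error_code can be constructed from it via this function.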
+ inline boost::system::error_code make_error_code(mon_errc e) noexcept {
+   // Pair the enum's integer value with the monitor category.
+   return boost::system::error_code(static_cast<int>(e), mon_category());
+ }
+
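+ // explicit conversion: mon_errc is not registered as an error condition
+ // enum, so this function has to be called by name.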
+ inline boost::system::error_condition make_error_condition(mon_errc e) noexcept {
+   // Pair the enum's integer value with the monitor category.
+   return boost::system::error_condition(static_cast<int>(e), mon_category());
+ }
diff --git a/src/mon/health_check.h b/src/mon/health_check.h
new file mode 100644
index 000000000..e6d1d7595
--- /dev/null
+++ b/src/mon/health_check.h
@@ -0,0 +1,199 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <string>
+#include <map>
+
+#include "include/health.h"
+#include "include/utime.h"
+#include "common/Formatter.h"
+
+ /// A single health check result: severity, one-line summary, optional
+ /// detail lines and a count.
+ struct health_check_t {
+   health_status_t severity;
+   std::string summary;
+   std::list<std::string> detail;
+   int64_t count = 0;
+
+   // Encoding v2 appended `count`; it is only processed for struct_v >= 2.
+   DENC(health_check_t, v, p) {
+     DENC_START(2, 1, p);
+     denc(v.severity, p);
+     denc(v.summary, p);
+     denc(v.detail, p);
+     if (struct_v >= 2) {
+       denc(v.count, p);
+     }
+     DENC_FINISH(p);
+   }
+
+   /// Field-by-field equality, compared in declaration order.
+   friend bool operator==(const health_check_t& l,
+                          const health_check_t& r) {
+     if (!(l.severity == r.severity))
+       return false;
+     if (!(l.summary == r.summary))
+       return false;
+     if (!(l.detail == r.detail))
+       return false;
+     return l.count == r.count;
+   }
+   friend bool operator!=(const health_check_t& l,
+                          const health_check_t& r) {
+     return !(l == r);
+   }
+
+   /// Dump severity and a "summary" object; the "detail" array is emitted
+   /// only when @p want_detail is set.
+   void dump(ceph::Formatter *f, bool want_detail=true) const {
+     f->dump_stream("severity") << severity;
+
+     f->open_object_section("summary");
+     f->dump_string("message", summary);
+     f->dump_int("count", count);
+     f->close_section();
+
+     if (!want_detail) {
+       return;
+     }
+     f->open_array_section("detail");
+     for (const auto& line : detail) {
+       f->open_object_section("detail_item");
+       f->dump_string("message", line);
+       f->close_section();
+     }
+     f->close_section();
+   }
+
+   static void generate_test_instances(std::list<health_check_t*>& ls) {
+     // A warning with everything else left at its default.
+     ls.push_back(new health_check_t);
+     health_check_t *warn = ls.back();
+     warn->severity = HEALTH_WARN;
+
+     // A fully populated error.
+     ls.push_back(new health_check_t);
+     health_check_t *err = ls.back();
+     err->severity = HEALTH_ERR;
+     err->summary = "summarization";
+     err->detail = {"one", "two", "three"};
+     err->count = 42;
+   }
+ };
+WRITE_CLASS_DENC(health_check_t)
+
+
+ /// A mute placed on a health check code.
+ struct health_mute_t {
+   std::string code;
+   utime_t ttl;
+   bool sticky = false;
+   std::string summary;
+   // Initialized so that a default-constructed mute (see
+   // generate_test_instances()) never dumps or encodes an indeterminate
+   // value; this mirrors health_check_t::count.
+   int64_t count = 0;
+
+   DENC(health_mute_t, v, p) {
+     DENC_START(1, 1, p);
+     denc(v.code, p);
+     denc(v.ttl, p);
+     denc(v.sticky, p);
+     denc(v.summary, p);
+     denc(v.count, p);
+     DENC_FINISH(p);
+   }
+
+   /// Dump all fields; "ttl" is omitted when it equals a default utime_t.
+   void dump(ceph::Formatter *f) const {
+     f->dump_string("code", code);
+     if (ttl != utime_t()) {
+       f->dump_stream("ttl") << ttl;
+     }
+     f->dump_bool("sticky", sticky);
+     f->dump_string("summary", summary);
+     f->dump_int("count", count);
+   }
+
+   static void generate_test_instances(std::list<health_mute_t*>& ls) {
+     // First instance: all defaults.
+     ls.push_back(new health_mute_t);
+     // Second instance: every field set.
+     ls.push_back(new health_mute_t);
+     ls.back()->code = "OSD_DOWN";
+     ls.back()->ttl = utime_t(1, 2);
+     ls.back()->sticky = true;
+     ls.back()->summary = "foo bar";
+     ls.back()->count = 2;
+   }
+ };
+WRITE_CLASS_DENC(health_mute_t)
+
+ /// Collection of health checks keyed by their code string.
+ struct health_check_map_t {
+   std::map<std::string,health_check_t> checks;
+
+   DENC(health_check_map_t, v, p) {
+     DENC_START(1, 1, p);
+     denc(v.checks, p);
+     DENC_FINISH(p);
+   }
+
+   /// Dump each check as an object named after its code.
+   void dump(ceph::Formatter *f) const {
+     for (const auto& [code, check] : checks) {
+       f->dump_object(code, check);
+     }
+   }
+
+   static void generate_test_instances(std::list<health_check_map_t*>& ls) {
+     // An empty map ...
+     ls.push_back(new health_check_map_t);
+     // ... and one holding two checks with some detail lines.
+     ls.push_back(new health_check_map_t);
+     health_check_map_t *filled = ls.back();
+     {
+       auto& foo = filled->add("FOO", HEALTH_WARN, "foo", 2);
+       foo.detail.push_back("a");
+       foo.detail.push_back("b");
+     }
+     {
+       auto& bar = filled->add("BAR", HEALTH_ERR, "bar!", 3);
+       bar.detail.push_back("c");
+       bar.detail.push_back("d");
+       bar.detail.push_back("e");
+     }
+   }
+
+   void clear() {
+     checks.clear();
+   }
+   bool empty() const {
+     return checks.empty();
+   }
+   void swap(health_check_map_t& other) {
+     checks.swap(other.checks);
+   }
+
+   /**
+    * Add a check that must not exist yet (asserted).
+    *
+    * @returns reference to the new entry, e.g. for appending detail lines
+    */
+   health_check_t& add(const std::string& code,
+                       health_status_t severity,
+                       const std::string& summary,
+                       int64_t count) {
+     ceph_assert(checks.find(code) == checks.end());
+     health_check_t& entry = checks[code];
+     entry.severity = severity;
+     entry.summary = summary;
+     entry.count = count;
+     return entry;
+   }
+
+   /**
+    * Like add(), but an existing entry is reused: severity and summary are
+    * overwritten and @p count is added to the entry's current count.
+    */
+   health_check_t& get_or_add(const std::string& code,
+                              health_status_t severity,
+                              const std::string& summary,
+                              int64_t count) {
+     health_check_t& entry = checks[code];
+     entry.severity = severity;
+     entry.summary = summary;
+     entry.count += count;
+     return entry;
+   }
+
+   /// Fold @p o into this map: unknown codes are copied over, known ones get
+   /// the other side's detail lines appended and its count added.
+   void merge(const health_check_map_t& o) {
+     for (const auto& [code, theirs] : o.checks) {
+       auto [pos, inserted] = checks.try_emplace(code, theirs);
+       if (inserted) {
+         continue;
+       }
+       // merge details, and hope the summary matches!
+       health_check_t& mine = pos->second;
+       mine.detail.insert(
+         mine.detail.end(),
+         theirs.detail.begin(),
+         theirs.detail.end());
+       mine.count += theirs.count;
+     }
+   }
+
+   friend bool operator==(const health_check_map_t& l,
+                          const health_check_map_t& r) {
+     return l.checks == r.checks;
+   }
+   friend bool operator!=(const health_check_map_t& l,
+                          const health_check_map_t& r) {
+     return !(l == r);
+   }
+ };
+WRITE_CLASS_DENC(health_check_map_t)
diff --git a/src/mon/mon_types.h b/src/mon/mon_types.h
new file mode 100644
index 000000000..3429a8e99
--- /dev/null
+++ b/src/mon/mon_types.h
@@ -0,0 +1,723 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MON_TYPES_H
+#define CEPH_MON_TYPES_H
+
+#include <map>
+
+#include "include/Context.h"
+#include "include/util.h"
+#include "include/utime.h"
+#include "common/Formatter.h"
+#include "common/bit_str.h"
+#include "common/ceph_releases.h"
+
+// use as paxos_service index
+ enum {
+ PAXOS_MDSMAP,
+ PAXOS_OSDMAP,
+ PAXOS_LOG,
+ PAXOS_MONMAP,
+ PAXOS_AUTH,
+ PAXOS_MGR,
+ PAXOS_MGRSTAT,
+ PAXOS_HEALTH,
+ PAXOS_CONFIG,
+ PAXOS_KV,
+ PAXOS_NUM // last enumerator: its value equals the number of entries above
+ };
+
+#define CEPH_MON_ONDISK_MAGIC "ceph mon volume v012"
+
+// map of entity_type -> features -> count
+ /// Histogram of peers: entity type -> feature bits -> number of peers.
+ struct FeatureMap {
+   std::map<uint32_t,std::map<uint64_t,uint64_t>> m;
+
+   /// Count one more peer of @p type with @p features. Monitor entries are
+   /// ignored here; add_mon() is the entry point for those.
+   void add(uint32_t type, uint64_t features) {
+     if (type != CEPH_ENTITY_TYPE_MON) {
+       m[type][features]++;
+     }
+   }
+
+   void add_mon(uint64_t features) {
+     m[CEPH_ENTITY_TYPE_MON][features]++;
+   }
+
+   /// Undo one add(): the (type, features) entry must exist (asserted).
+   /// Buckets that drop to zero are erased, as are types left without any.
+   void rm(uint32_t type, uint64_t features) {
+     if (type == CEPH_ENTITY_TYPE_MON) {
+       return;
+     }
+     auto by_type = m.find(type);
+     ceph_assert(by_type != m.end());
+     auto& counts = by_type->second;
+     auto bucket = counts.find(features);
+     ceph_assert(bucket != counts.end());
+     if (--bucket->second != 0) {
+       return;
+     }
+     counts.erase(bucket);
+     if (counts.empty()) {
+       m.erase(by_type);
+     }
+   }
+
+   /// Add every count of @p o onto the matching bucket of this map.
+   FeatureMap& operator+=(const FeatureMap& o) {
+     for (const auto& [type, their_counts] : o.m) {
+       auto& my_counts = m[type];
+       for (const auto& [features, num] : their_counts) {
+         my_counts[features] += num;
+       }
+     }
+     return *this;
+   }
+
+   void encode(ceph::buffer::list& bl) const {
+     ENCODE_START(1, 1, bl);
+     encode(m, bl);
+     ENCODE_FINISH(bl);
+   }
+
+   void decode(ceph::buffer::list::const_iterator& p) {
+     DECODE_START(1, p);
+     decode(m, p);
+     DECODE_FINISH(p);
+   }
+
+   /// One array per entity type, holding a "group" object per feature set
+   /// with the features in hex, the matching release name and the count.
+   void dump(ceph::Formatter *f) const {
+     for (const auto& [type, counts] : m) {
+       f->open_array_section(ceph_entity_type_name(type));
+       for (const auto& [features, num] : counts) {
+         f->open_object_section("group");
+         std::stringstream hex;
+         hex << "0x" << std::hex << features << std::dec;
+         f->dump_string("features", hex.str());
+         f->dump_string("release", ceph_release_name(
+                          ceph_release_from_features(features)));
+         f->dump_unsigned("num", num);
+         f->close_section();
+       }
+       f->close_section();
+     }
+   }
+
+   static void generate_test_instances(std::list<FeatureMap*>& ls) {
+     // An empty map, then one with four OSD feature buckets.
+     ls.push_back(new FeatureMap);
+     ls.push_back(new FeatureMap);
+     FeatureMap *filled = ls.back();
+     filled->add(CEPH_ENTITY_TYPE_OSD, CEPH_FEATURE_UID);
+     filled->add(CEPH_ENTITY_TYPE_OSD, CEPH_FEATURE_NOSRCADDR);
+     filled->add(CEPH_ENTITY_TYPE_OSD, CEPH_FEATURE_PGID64);
+     filled->add(CEPH_ENTITY_TYPE_OSD, CEPH_FEATURE_INCSUBOSDMAP);
+   }
+ };
+WRITE_CLASS_ENCODER(FeatureMap)
+
+/**
+ * monitor db store stats
+ */
+ struct MonitorDBStoreStats {
+   // Byte counters, all starting at zero.
+   uint64_t bytes_total = 0;
+   uint64_t bytes_sst = 0;
+   uint64_t bytes_log = 0;
+   uint64_t bytes_misc = 0;
+   utime_t last_update;
+
+   MonitorDBStoreStats() {}
+
+   /// Dump the four counters and the update time; @p f must not be null.
+   void dump(ceph::Formatter *f) const {
+     ceph_assert(f != NULL);
+     f->dump_int("bytes_total", bytes_total);
+     f->dump_int("bytes_sst", bytes_sst);
+     f->dump_int("bytes_log", bytes_log);
+     f->dump_int("bytes_misc", bytes_misc);
+     f->dump_stream("last_updated") << last_update;
+   }
+
+   // encode() and decode() process the fields in the same fixed order.
+   void encode(ceph::buffer::list &bl) const {
+     ENCODE_START(1, 1, bl);
+     encode(bytes_total, bl);
+     encode(bytes_sst, bl);
+     encode(bytes_log, bl);
+     encode(bytes_misc, bl);
+     encode(last_update, bl);
+     ENCODE_FINISH(bl);
+   }
+
+   void decode(ceph::buffer::list::const_iterator &p) {
+     DECODE_START(1, p);
+     decode(bytes_total, p);
+     decode(bytes_sst, p);
+     decode(bytes_log, p);
+     decode(bytes_misc, p);
+     decode(last_update, p);
+     DECODE_FINISH(p);
+   }
+
+   static void generate_test_instances(std::list<MonitorDBStoreStats*>& ls) {
+     // A default instance and a populated one.
+     ls.push_back(new MonitorDBStoreStats);
+     ls.push_back(new MonitorDBStoreStats);
+     MonitorDBStoreStats *filled = ls.back();
+     filled->bytes_total = 1024*1024;
+     filled->bytes_sst = 512*1024;
+     filled->bytes_log = 256*1024;
+     filled->bytes_misc = 256*1024;
+     filled->last_update = utime_t();
+   }
+ };
+WRITE_CLASS_ENCODER(MonitorDBStoreStats)
+
+// data stats
+
+ struct DataStats {
+   ceph_data_stats_t fs_stats;
+   // data dir
+   utime_t last_update;
+   MonitorDBStoreStats store_stats;
+
+   /// Dump the filesystem figures (byte values shown in KiB), the update
+   /// time and a nested "store_stats" object; @p f must not be null.
+   void dump(ceph::Formatter *f) const {
+     ceph_assert(f != NULL);
+     f->dump_int("kb_total", (fs_stats.byte_total/1024));
+     f->dump_int("kb_used", (fs_stats.byte_used/1024));
+     f->dump_int("kb_avail", (fs_stats.byte_avail/1024));
+     f->dump_int("avail_percent", fs_stats.avail_percent);
+     f->dump_stream("last_updated") << last_update;
+     f->open_object_section("store_stats");
+     store_stats.dump(f);
+     f->close_section();
+   }
+
+   static void generate_test_instances(std::list<DataStats*>& ls) {
+     // A default instance and a populated one.
+     ls.push_back(new DataStats);
+     ls.push_back(new DataStats);
+     DataStats *filled = ls.back();
+     filled->fs_stats.byte_total = 1024*1024;
+     filled->fs_stats.byte_used = 512*1024;
+     filled->fs_stats.byte_avail = 256*1024;
+     filled->fs_stats.avail_percent = 50;
+     filled->last_update = utime_t();
+     filled->store_stats.bytes_total = 1024*1024;
+     filled->store_stats.bytes_sst = 512*1024;
+     filled->store_stats.bytes_log = 256*1024;
+     filled->store_stats.bytes_misc = 256*1024;
+     filled->store_stats.last_update = utime_t();
+   }
+
+   void encode(ceph::buffer::list &bl) const {
+     ENCODE_START(3, 1, bl);
+     encode(fs_stats.byte_total, bl);
+     encode(fs_stats.byte_used, bl);
+     encode(fs_stats.byte_avail, bl);
+     encode(fs_stats.avail_percent, bl);
+     encode(last_update, bl);
+     encode(store_stats, bl);
+     ENCODE_FINISH(bl);
+   }
+
+   /// Decode, accepting older layouts: versions <= 2 carry the three sizes
+   /// in KiB, and version 1 has no store_stats.
+   void decode(ceph::buffer::list::const_iterator &p) {
+     // NOTE(review): encode() writes version 3 while DECODE_START is given
+     // 1 -- confirm that this mismatch is intended.
+     DECODE_START(1, p);
+     // we moved from having fields in kb to fields in byte
+     if (struct_v > 2) {
+       decode(fs_stats.byte_total, p);
+       decode(fs_stats.byte_used, p);
+       decode(fs_stats.byte_avail, p);
+     } else {
+       uint64_t kb;
+       decode(kb, p);
+       fs_stats.byte_total = kb*1024;
+       decode(kb, p);
+       fs_stats.byte_used = kb*1024;
+       decode(kb, p);
+       fs_stats.byte_avail = kb*1024;
+     }
+     decode(fs_stats.avail_percent, p);
+     decode(last_update, p);
+     if (struct_v > 1) {
+       decode(store_stats, p);
+     }
+
+     DECODE_FINISH(p);
+   }
+ };
+WRITE_CLASS_ENCODER(DataStats)
+
+ /// Result of scrubbing the store: a crc and a key count per prefix.
+ struct ScrubResult {
+   std::map<std::string,uint32_t> prefix_crc; ///< prefix -> crc
+   std::map<std::string,uint64_t> prefix_keys; ///< prefix -> key count
+
+   /// Two results differ if either map differs. Const-qualified so that
+   /// const results can be compared as well.
+   bool operator!=(const ScrubResult& other) const {
+     return prefix_crc != other.prefix_crc || prefix_keys != other.prefix_keys;
+   }
+
+   void encode(ceph::buffer::list& bl) const {
+     ENCODE_START(1, 1, bl);
+     encode(prefix_crc, bl);
+     encode(prefix_keys, bl);
+     ENCODE_FINISH(bl);
+   }
+   void decode(ceph::buffer::list::const_iterator& p) {
+     DECODE_START(1, p);
+     decode(prefix_crc, p);
+     decode(prefix_keys, p);
+     DECODE_FINISH(p);
+   }
+
+   /// Dump a "crc" and a "keys" object, each with one unsigned field per prefix.
+   void dump(ceph::Formatter *f) const {
+     f->open_object_section("crc");
+     for (const auto& [prefix, crc] : prefix_crc)
+       f->dump_unsigned(prefix.c_str(), crc);
+     f->close_section();
+     f->open_object_section("keys");
+     for (const auto& [prefix, num_keys] : prefix_keys)
+       f->dump_unsigned(prefix.c_str(), num_keys);
+     f->close_section();
+   }
+   static void generate_test_instances(std::list<ScrubResult*>& ls) {
+     // An empty result and one with a single entry in each map.
+     ls.push_back(new ScrubResult);
+     ls.push_back(new ScrubResult);
+     ls.back()->prefix_crc["foo"] = 123;
+     ls.back()->prefix_keys["bar"] = 456;
+   }
+ };
+WRITE_CLASS_ENCODER(ScrubResult)
+
+ /// Print both maps of a ScrubResult, key counts first.
+ inline std::ostream& operator<<(std::ostream& out, const ScrubResult& r) {
+   out << "ScrubResult(keys " << r.prefix_keys;
+   out << " crc " << r.prefix_crc << ")";
+   return out;
+ }
+
+/// for information like os, kernel, hostname, memory info, cpu model.
+typedef std::map<std::string, std::string> Metadata;
+
+ namespace ceph::features::mon {
+   /**
+    * Get a feature's name based on its value.
+    *
+    * @param b raw feature value
+    *
+    * @remarks
+    * Consumers should not assume this interface will never change.
+    * @remarks
+    * As the number of features increases, so may the internal representation
+    * of the raw features. When this happens, this interface will change
+    * accordingly. So should consumers of this interface.
+    */
+   static inline const char *get_feature_name(uint64_t b);
+ } // namespace ceph::features::mon
+
+
+/**
+ * Global-namespace shorthand for ceph::features::mon::get_feature_name().
+ *
+ * @param b raw feature value
+ * @returns whatever get_feature_name() returns for @p b
+ */
+inline const char *ceph_mon_feature_name(uint64_t b)
+{
+  const char *feature_name = ceph::features::mon::get_feature_name(b);
+  return feature_name;
+}
+
+/**
+ * A set of monitor-specific features, stored as a 64-bit mask.
+ *
+ * Besides the usual bitwise operators the class offers set-style helpers
+ * (diff, intersection, contains_all, contains_any), printing/dumping and
+ * versioned encode/decode.
+ */
+class mon_feature_t {
+
+  static constexpr int HEAD_VERSION = 1;
+  static constexpr int COMPAT_VERSION = 1;
+
+  // mon-specific features
+  uint64_t features;
+
+public:
+
+  /// Build a feature set from a raw mask.
+  explicit constexpr
+  mon_feature_t(const uint64_t f) : features(f) { }
+
+  /// Build an empty feature set.
+  constexpr
+  mon_feature_t() :
+    features(0) { }
+
+  // Copying is a plain member-wise copy. Both copy operations are defaulted
+  // together: a user-provided copy constructor without a matching copy
+  // assignment makes the implicit assignment operator deprecated.
+  mon_feature_t(const mon_feature_t &o) = default;
+  mon_feature_t& operator=(const mon_feature_t &o) = default;
+
+  /// Keep only the features also present in @p other.
+  mon_feature_t& operator&=(const mon_feature_t other) {
+    features &= other.features;
+    return (*this);
+  }
+
+  /**
+   * Obtain raw features
+   *
+   * @remarks
+   *    Consumers should not assume this interface will never change.
+   * @remarks
+   *    As the number of features increases, so may the internal representation
+   *    of the raw features. When this happens, this interface will change
+   *    accordingly. So should consumers of this interface.
+   */
+  uint64_t get_raw() const {
+    return features;
+  }
+
+  /// Bitwise AND of two feature sets.
+  constexpr
+  friend mon_feature_t operator&(const mon_feature_t a,
+                                 const mon_feature_t b) {
+    return mon_feature_t(a.features & b.features);
+  }
+
+  /// Add all the features of @p other.
+  mon_feature_t& operator|=(const mon_feature_t other) {
+    features |= other.features;
+    return (*this);
+  }
+
+  /// Bitwise OR of two feature sets.
+  constexpr
+  friend mon_feature_t operator|(const mon_feature_t a,
+                                 const mon_feature_t b) {
+    return mon_feature_t(a.features | b.features);
+  }
+
+  /// Bitwise XOR of two feature sets.
+  constexpr
+  friend mon_feature_t operator^(const mon_feature_t a,
+                                 const mon_feature_t b) {
+    return mon_feature_t(a.features ^ b.features);
+  }
+
+  /// Toggle every feature that is set in @p other.
+  mon_feature_t& operator^=(const mon_feature_t other) {
+    features ^= other.features;
+    return (*this);
+  }
+
+  /// @returns true if both sets hold exactly the same features
+  bool operator==(const mon_feature_t other) const {
+    return (features == other.features);
+  }
+
+  /// @returns true if the sets differ in at least one feature
+  bool operator!=(const mon_feature_t other) const {
+    return (features != other.features);
+  }
+
+  /// @returns true if no feature is set
+  bool empty() const {
+    return features == 0;
+  }
+
+  /**
+   * Set difference of our features in respect to @p other
+   *
+   * Returns all the elements in our features that are not in @p other
+   *
+   * @returns all the features not in @p other
+   */
+  mon_feature_t diff(const mon_feature_t other) const {
+    // same result as (features ^ other.features) & features
+    return mon_feature_t(features & ~other.features);
+  }
+
+  /**
+   * Set intersection of our features and @p other
+   *
+   * Returns all the elements common to both our features and the
+   * features of @p other
+   *
+   * @returns the features common to @p other and us
+   */
+  mon_feature_t intersection(const mon_feature_t other) const {
+    return mon_feature_t((features & other.features));
+  }
+
+  /**
+   * Checks whether we have all the features in @p other
+   *
+   * Returns true if we have all the features in @p other
+   *
+   * @returns true if we contain all the features in @p other
+   * @returns false if we do not contain some of the features in @p other
+   */
+  bool contains_all(const mon_feature_t other) const {
+    return intersection(other) == other;
+  }
+
+  /**
+   * Checks whether we contain any of the features in @p other.
+   *
+   * @returns true if we contain any of the features in @p other
+   * @returns false if we don't contain any of the features in @p other
+   */
+  bool contains_any(const mon_feature_t other) const {
+    return !intersection(other).empty();
+  }
+
+  /// Turn on every feature set in @p f.
+  void set_feature(const mon_feature_t f) {
+    features |= f.features;
+  }
+
+  /// Turn off every feature set in @p f.
+  void unset_feature(const mon_feature_t f) {
+    features &= ~(f.features);
+  }
+
+  /// Write the features to @p out between square brackets; the per-bit
+  /// formatting is delegated to print_bit_str() with get_feature_name().
+  void print(std::ostream& out) const {
+    out << "[";
+    print_bit_str(features, out, ceph::features::mon::get_feature_name);
+    out << "]";
+  }
+
+  /// Same as print(), but passes true as print_bit_str()'s last argument
+  /// (presumably to include each bit's numeric value -- confirm against
+  /// print_bit_str()).
+  void print_with_value(std::ostream& out) const {
+    out << "[";
+    print_bit_str(features, out, ceph::features::mon::get_feature_name, true);
+    out << "]";
+  }
+
+  /**
+   * Dump the features as an array section.
+   *
+   * @param f formatter to write to
+   * @param sec_name section name; "features" is used when null
+   */
+  void dump(ceph::Formatter *f, const char *sec_name = nullptr) const {
+    f->open_array_section((sec_name ? sec_name : "features"));
+    dump_bit_str(features, f, ceph::features::mon::get_feature_name);
+    f->close_section();
+  }
+
+  /// Same as dump(), but passes true as dump_bit_str()'s last argument
+  /// (presumably to include each bit's numeric value -- confirm against
+  /// dump_bit_str()).
+  void dump_with_value(ceph::Formatter *f, const char *sec_name = nullptr) const {
+    f->open_array_section((sec_name ? sec_name : "features"));
+    dump_bit_str(features, f, ceph::features::mon::get_feature_name, true);
+    f->close_section();
+  }
+
+  /// Encode the raw mask using HEAD_VERSION / COMPAT_VERSION.
+  void encode(ceph::buffer::list& bl) const {
+    ENCODE_START(HEAD_VERSION, COMPAT_VERSION, bl);
+    encode(features, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  /// Decode the raw mask written by encode().
+  void decode(ceph::buffer::list::const_iterator& p) {
+    DECODE_START(COMPAT_VERSION, p);
+    decode(features, p);
+    DECODE_FINISH(p);
+  }
+
+  /// Test instances with raw masks 0, 1 and 2. Ownership of the pointers
+  /// passes to @p ls.
+  static void generate_test_instances(std::list<mon_feature_t*>& ls) {
+    ls.push_back(new mon_feature_t);
+    ls.push_back(new mon_feature_t);
+    ls.back()->features = 1;
+    ls.push_back(new mon_feature_t);
+    ls.back()->features = 2;
+  }
+};
+WRITE_CLASS_ENCODER(mon_feature_t)
+
+namespace ceph {
+  namespace features {
+    namespace mon {
+      // Individual mon features; each one occupies a single bit.
+      constexpr mon_feature_t FEATURE_KRAKEN       (1ULL <<  0);
+      constexpr mon_feature_t FEATURE_LUMINOUS     (1ULL <<  1);
+      constexpr mon_feature_t FEATURE_MIMIC        (1ULL <<  2);
+      constexpr mon_feature_t FEATURE_OSDMAP_PRUNE (1ULL <<  3);
+      constexpr mon_feature_t FEATURE_NAUTILUS     (1ULL <<  4);
+      constexpr mon_feature_t FEATURE_OCTOPUS      (1ULL <<  5);
+      constexpr mon_feature_t FEATURE_PACIFIC      (1ULL <<  6);
+      // elector pinging and CONNECTIVITY mode:
+      constexpr mon_feature_t FEATURE_PINGING      (1ULL <<  7);
+      constexpr mon_feature_t FEATURE_QUINCY       (1ULL <<  8);
+      constexpr mon_feature_t FEATURE_REEF         (1ULL <<  9);
+      constexpr mon_feature_t FEATURE_SQUID        (1ULL << 10);
+
+      // Highest bit, and the empty set.
+      constexpr mon_feature_t FEATURE_RESERVED     (1ULL << 63);
+      constexpr mon_feature_t FEATURE_NONE         (0ULL);
+
+      /**
+       * All the features this monitor supports
+       *
+       * If there's a feature above, it should be OR'ed to this list.
+       */
+      constexpr mon_feature_t get_supported() {
+        return FEATURE_NONE
+          | FEATURE_KRAKEN
+          | FEATURE_LUMINOUS
+          | FEATURE_MIMIC
+          | FEATURE_OSDMAP_PRUNE
+          | FEATURE_NAUTILUS
+          | FEATURE_OCTOPUS
+          | FEATURE_PACIFIC
+          | FEATURE_PINGING
+          | FEATURE_QUINCY
+          | FEATURE_REEF
+          | FEATURE_SQUID;
+      }
+
+      /**
+       * All the features that, once set, cannot be removed.
+       *
+       * Features should only be added to this list if you want to make
+       * sure downgrades are not possible after a quorum supporting all
+       * these features has been formed.
+       *
+       * Any feature in this list will be automatically set on the monmap's
+       * features once all the monitors in the quorum support it.
+       */
+      constexpr mon_feature_t get_persistent() {
+        return FEATURE_NONE
+          | FEATURE_KRAKEN
+          | FEATURE_LUMINOUS
+          | FEATURE_MIMIC
+          | FEATURE_NAUTILUS
+          | FEATURE_OSDMAP_PRUNE
+          | FEATURE_OCTOPUS
+          | FEATURE_PACIFIC
+          | FEATURE_PINGING
+          | FEATURE_QUINCY
+          | FEATURE_REEF
+          | FEATURE_SQUID;
+      }
+
+      /**
+       * The features listed as optional; currently only
+       * FEATURE_OSDMAP_PRUNE.
+       */
+      constexpr mon_feature_t get_optional() {
+        return FEATURE_NONE
+          | FEATURE_OSDMAP_PRUNE;
+      }
+
+      static inline mon_feature_t get_feature_by_name(const std::string &n);
+    }
+  }
+}
+
+/**
+ * Infer a release from a set of mon features.
+ *
+ * The per-release feature bits are checked from squid down to kraken and
+ * the first one contained in @p f decides the result.
+ *
+ * @param f feature set to inspect
+ * @returns the release of the first matching feature, or
+ *          ceph_release_t::unknown if none of them is set
+ */
+static inline ceph_release_t infer_ceph_release_from_mon_features(mon_feature_t f)
+{
+  namespace mf = ceph::features::mon;
+
+  struct release_feature_t {
+    mon_feature_t feature;
+    ceph_release_t release;
+  };
+
+  // order matters: the first entry contained in f wins
+  const release_feature_t candidates[] = {
+    { mf::FEATURE_SQUID,    ceph_release_t::squid },
+    { mf::FEATURE_REEF,     ceph_release_t::reef },
+    { mf::FEATURE_QUINCY,   ceph_release_t::quincy },
+    { mf::FEATURE_PACIFIC,  ceph_release_t::pacific },
+    { mf::FEATURE_OCTOPUS,  ceph_release_t::octopus },
+    { mf::FEATURE_NAUTILUS, ceph_release_t::nautilus },
+    { mf::FEATURE_MIMIC,    ceph_release_t::mimic },
+    { mf::FEATURE_LUMINOUS, ceph_release_t::luminous },
+    { mf::FEATURE_KRAKEN,   ceph_release_t::kraken },
+  };
+
+  for (const auto& candidate : candidates) {
+    if (f.contains_all(candidate.feature)) {
+      return candidate.release;
+    }
+  }
+  return ceph_release_t::unknown;
+}
+
+/**
+ * Translate a raw feature value into its name.
+ *
+ * @param b raw feature value; it has to equal exactly one of the known
+ *          feature constants to be recognised
+ * @returns the feature's name, or "unknown" if @p b matches none of them
+ */
+static inline const char *ceph::features::mon::get_feature_name(uint64_t b) {
+  struct named_feature_t {
+    mon_feature_t feature;
+    const char *name;
+  };
+
+  const named_feature_t known_features[] = {
+    { FEATURE_KRAKEN,       "kraken" },
+    { FEATURE_LUMINOUS,     "luminous" },
+    { FEATURE_MIMIC,        "mimic" },
+    { FEATURE_OSDMAP_PRUNE, "osdmap-prune" },
+    { FEATURE_NAUTILUS,     "nautilus" },
+    { FEATURE_PINGING,      "elector-pinging" },
+    { FEATURE_OCTOPUS,      "octopus" },
+    { FEATURE_PACIFIC,      "pacific" },
+    { FEATURE_QUINCY,       "quincy" },
+    { FEATURE_REEF,         "reef" },
+    { FEATURE_SQUID,        "squid" },
+    { FEATURE_RESERVED,     "reserved" },
+  };
+
+  const mon_feature_t wanted(b);
+  for (const auto& entry : known_features) {
+    if (wanted == entry.feature) {
+      return entry.name;
+    }
+  }
+  return "unknown";
+}
+
+/**
+ * Look up a feature by its name.
+ *
+ * Accepts every name get_feature_name() produces. For FEATURE_PINGING that
+ * is "elector-pinging"; the spelling "feature-pinging", which this function
+ * recognised historically, is still accepted for backward compatibility.
+ *
+ * @param n feature name
+ * @returns the matching feature, or FEATURE_NONE if @p n is not a known name
+ */
+inline mon_feature_t ceph::features::mon::get_feature_by_name(const std::string &n) {
+
+  if (n == "kraken") {
+    return FEATURE_KRAKEN;
+  } else if (n == "luminous") {
+    return FEATURE_LUMINOUS;
+  } else if (n == "mimic") {
+    return FEATURE_MIMIC;
+  } else if (n == "osdmap-prune") {
+    return FEATURE_OSDMAP_PRUNE;
+  } else if (n == "nautilus") {
+    return FEATURE_NAUTILUS;
+  } else if (n == "elector-pinging" || n == "feature-pinging") {
+    // "elector-pinging" is what get_feature_name() emits; without it a
+    // name obtained there could not be mapped back to the feature.
+    return FEATURE_PINGING;
+  } else if (n == "octopus") {
+    return FEATURE_OCTOPUS;
+  } else if (n == "pacific") {
+    return FEATURE_PACIFIC;
+  } else if (n == "quincy") {
+    return FEATURE_QUINCY;
+  } else if (n == "reef") {
+    return FEATURE_REEF;
+  } else if (n == "squid") {
+    return FEATURE_SQUID;
+  } else if (n == "reserved") {
+    return FEATURE_RESERVED;
+  }
+  return FEATURE_NONE;
+}
+
+/// Stream a feature set as "mon_feature_t(" + f.print() output + ")".
+inline std::ostream& operator<<(std::ostream& out, const mon_feature_t& f) {
+  out << "mon_feature_t(";
+  f.print(out);
+  return out << ")";
+}
+
+
+/**
+ * A progress event: a message, a completion fraction and a flag.
+ */
+struct ProgressEvent {
+  std::string message;         ///< event description
+  float progress = 0.0f;       ///< [0..1]
+  /// presumably whether the event is shown in `ceph -s` output -- confirm
+  bool add_to_ceph_s = false;
+
+  /// Encode as struct version 2 (compat 1): message, progress, add_to_ceph_s.
+  void encode(ceph::buffer::list& bl) const {
+    ENCODE_START(2, 1, bl);
+    encode(message, bl);
+    encode(progress, bl);
+    encode(add_to_ceph_s, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  /// Decode up to struct version 2. Version 1 carries no flag; in that
+  /// case a non-empty message switches add_to_ceph_s on.
+  void decode(ceph::buffer::list::const_iterator& p) {
+    DECODE_START(2, p);
+    decode(message, p);
+    decode(progress, p);
+    if (struct_v < 2) {
+      if (!message.empty()) {
+        add_to_ceph_s = true;
+      }
+    } else {
+      decode(add_to_ceph_s, p);
+    }
+    DECODE_FINISH(p);
+  }
+
+  /// Dump the three fields under the keys "message", "progress" and
+  /// "add_to_ceph_s".
+  void dump(ceph::Formatter *f) const {
+    f->dump_string("message", message);
+    f->dump_float("progress", progress);
+    f->dump_bool("add_to_ceph_s", add_to_ceph_s);
+  }
+
+  /// Test instances: a default event and a fully populated one. Ownership
+  /// of the pointers passes to @p o.
+  static void generate_test_instances(std::list<ProgressEvent*>& o) {
+    o.push_back(new ProgressEvent);
+
+    o.push_back(new ProgressEvent);
+    ProgressEvent& populated = *o.back();
+    populated.message = "test message";
+    populated.progress = 0.5;
+    populated.add_to_ceph_s = true;
+  }
+};
+WRITE_CLASS_ENCODER(ProgressEvent)
+
+#endif