// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab /* * Ceph - scalable distributed file system * * Copyright (C) 2004-2006 Sage Weil * * This is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License version 2.1, as published by the Free Software * Foundation. See file COPYING. * */ #ifndef CEPH_MDSMAP_H #define CEPH_MDSMAP_H #include #include #include #include #include #include #include "include/types.h" #include "common/Clock.h" #include "include/health.h" #include "common/config.h" #include "include/CompatSet.h" #include "include/ceph_features.h" #include "common/Formatter.h" #include "mds/mdstypes.h" class CephContext; class health_check_map_t; #define MDS_FEATURE_INCOMPAT_BASE CompatSet::Feature(1, "base v0.20") #define MDS_FEATURE_INCOMPAT_CLIENTRANGES CompatSet::Feature(2, "client writeable ranges") #define MDS_FEATURE_INCOMPAT_FILELAYOUT CompatSet::Feature(3, "default file layouts on dirs") #define MDS_FEATURE_INCOMPAT_DIRINODE CompatSet::Feature(4, "dir inode in separate object") #define MDS_FEATURE_INCOMPAT_ENCODING CompatSet::Feature(5, "mds uses versioned encoding") #define MDS_FEATURE_INCOMPAT_OMAPDIRFRAG CompatSet::Feature(6, "dirfrag is stored in omap") #define MDS_FEATURE_INCOMPAT_INLINE CompatSet::Feature(7, "mds uses inline data") #define MDS_FEATURE_INCOMPAT_NOANCHOR CompatSet::Feature(8, "no anchor table") #define MDS_FEATURE_INCOMPAT_FILE_LAYOUT_V2 CompatSet::Feature(9, "file layout v2") #define MDS_FEATURE_INCOMPAT_SNAPREALM_V2 CompatSet::Feature(10, "snaprealm v2") #define MDS_FS_NAME_DEFAULT "cephfs" class MDSMap { public: /* These states are the union of the set of possible states of an MDS daemon, * and the set of possible states of an MDS rank. See * doc/cephfs/mds-states.rst for state descriptions, * doc/cephfs/mds-state-diagram.svg for a visual state diagram, and * doc/cephfs/mds-state-diagram.dot to update mds-state-diagram.svg. */ typedef enum { // States of an MDS daemon not currently holding a rank // ==================================================== STATE_NULL = CEPH_MDS_STATE_NULL, // null value for fns returning this type. STATE_BOOT = CEPH_MDS_STATE_BOOT, // up, boot announcement. destiny unknown. STATE_STANDBY = CEPH_MDS_STATE_STANDBY, // up, idle. waiting for assignment by monitor. STATE_STANDBY_REPLAY = CEPH_MDS_STATE_STANDBY_REPLAY, // up, replaying active node, ready to take over. // States of an MDS rank, and of any MDS daemon holding that rank // ============================================================== STATE_STOPPED = CEPH_MDS_STATE_STOPPED, // down, once existed, but no subtrees. empty log. may not be held by a daemon. STATE_CREATING = CEPH_MDS_STATE_CREATING, // up, creating MDS instance (new journal, idalloc..). STATE_STARTING = CEPH_MDS_STATE_STARTING, // up, starting prior stopped MDS instance. STATE_REPLAY = CEPH_MDS_STATE_REPLAY, // up, starting prior failed instance. scanning journal. STATE_RESOLVE = CEPH_MDS_STATE_RESOLVE, // up, disambiguating distributed operations (import, rename, etc.) STATE_RECONNECT = CEPH_MDS_STATE_RECONNECT, // up, reconnect to clients STATE_REJOIN = CEPH_MDS_STATE_REJOIN, // up, replayed journal, rejoining distributed cache STATE_CLIENTREPLAY = CEPH_MDS_STATE_CLIENTREPLAY, // up, active STATE_ACTIVE = CEPH_MDS_STATE_ACTIVE, // up, active STATE_STOPPING = CEPH_MDS_STATE_STOPPING, // up, exporting metadata (-> standby or out) STATE_DNE = CEPH_MDS_STATE_DNE, // down, rank does not exist // State which a daemon may send to MDSMonitor in its beacon // to indicate that offline repair is required. Daemon must stop // immediately after indicating this state. STATE_DAMAGED = CEPH_MDS_STATE_DAMAGED /* * In addition to explicit states, an MDS rank implicitly in state: * - STOPPED if it is not currently associated with an MDS daemon gid but it * is in MDSMap::stopped * - FAILED if it is not currently associated with an MDS daemon gid but it * is in MDSMap::failed * - DNE if it is not currently associated with an MDS daemon gid and it is * missing from both MDSMap::failed and MDSMap::stopped */ } DaemonState; struct mds_info_t { mds_gid_t global_id; std::string name; mds_rank_t rank; int32_t inc; MDSMap::DaemonState state; version_t state_seq; entity_addrvec_t addrs; utime_t laggy_since; mds_rank_t standby_for_rank; std::string standby_for_name; fs_cluster_id_t standby_for_fscid; bool standby_replay; std::set export_targets; uint64_t mds_features = 0; mds_info_t() : global_id(MDS_GID_NONE), rank(MDS_RANK_NONE), inc(0), state(STATE_STANDBY), state_seq(0), standby_for_rank(MDS_RANK_NONE), standby_for_fscid(FS_CLUSTER_ID_NONE), standby_replay(false) { } bool laggy() const { return !(laggy_since == utime_t()); } void clear_laggy() { laggy_since = utime_t(); } entity_addrvec_t get_addrs() const { return addrs; } void encode(bufferlist& bl, uint64_t features) const { if ((features & CEPH_FEATURE_MDSENC) == 0 ) encode_unversioned(bl); else encode_versioned(bl, features); } void decode(bufferlist::const_iterator& p); void dump(Formatter *f) const; void print_summary(ostream &out) const; // The long form name for use in cluster log messages` std::string human_name() const; static void generate_test_instances(list& ls); private: void encode_versioned(bufferlist& bl, uint64_t features) const; void encode_unversioned(bufferlist& bl) const; }; static CompatSet get_compat_set_all(); static CompatSet get_compat_set_default(); static CompatSet get_compat_set_base(); // pre v0.20 protected: // base map epoch_t epoch = 0; bool enabled = false; std::string fs_name = MDS_FS_NAME_DEFAULT; uint32_t flags = CEPH_MDSMAP_DEFAULTS; // flags epoch_t last_failure = 0; // mds epoch of last failure epoch_t last_failure_osd_epoch = 0; // osd epoch of last failure; any mds entering replay needs // at least this osdmap to ensure the blacklist propagates. utime_t created; utime_t modified; mds_rank_t tableserver = 0; // which MDS has snaptable mds_rank_t root = 0; // which MDS has root directory __u32 session_timeout = 60; __u32 session_autoclose = 300; uint64_t max_file_size = 1ULL<<40; /* 1TB */ int8_t min_compat_client = -1; std::vector data_pools; // file data pools available to clients (via an ioctl). first is the default. int64_t cas_pool = -1; // where CAS objects go int64_t metadata_pool = -1; // where fs metadata objects go /* * in: the set of logical mds #'s that define the cluster. this is the set * of mds's the metadata may be distributed over. * up: map from logical mds #'s to the addrs filling those roles. * failed: subset of @in that are failed. * stopped: set of nodes that have been initialized, but are not active. * * @up + @failed = @in. @in * @stopped = {}. */ mds_rank_t max_mds = 1; /* The maximum number of active MDSes. Also, the maximum rank. */ mds_rank_t old_max_mds = 0; /* Value to restore when MDS cluster is marked up */ mds_rank_t standby_count_wanted = -1; string balancer; /* The name/version of the mantle balancer (i.e. the rados obj name) */ std::set in; // currently defined cluster // which ranks are failed, stopped, damaged (i.e. not held by a daemon) std::set failed, stopped, damaged; std::map up; // who is in those roles std::map mds_info; uint8_t ever_allowed_features = 0; //< bitmap of features the cluster has allowed uint8_t explicitly_allowed_features = 0; //< bitmap of features explicitly enabled bool inline_data_enabled = false; uint64_t cached_up_features = 0; public: CompatSet compat; friend class MDSMonitor; friend class Filesystem; friend class FSMap; public: bool get_inline_data_enabled() const { return inline_data_enabled; } void set_inline_data_enabled(bool enabled) { inline_data_enabled = enabled; } utime_t get_session_timeout() const { return utime_t(session_timeout,0); } void set_session_timeout(uint32_t t) { session_timeout = t; } utime_t get_session_autoclose() const { return utime_t(session_autoclose, 0); } void set_session_autoclose(uint32_t t) { session_autoclose = t; } uint64_t get_max_filesize() const { return max_file_size; } void set_max_filesize(uint64_t m) { max_file_size = m; } uint8_t get_min_compat_client() const { return min_compat_client; } void set_min_compat_client(uint8_t version) { min_compat_client = version; } int get_flags() const { return flags; } bool test_flag(int f) const { return flags & f; } void set_flag(int f) { flags |= f; } void clear_flag(int f) { flags &= ~f; } std::string_view get_fs_name() const {return fs_name;} void set_snaps_allowed() { set_flag(CEPH_MDSMAP_ALLOW_SNAPS); ever_allowed_features |= CEPH_MDSMAP_ALLOW_SNAPS; explicitly_allowed_features |= CEPH_MDSMAP_ALLOW_SNAPS; } void clear_snaps_allowed() { clear_flag(CEPH_MDSMAP_ALLOW_SNAPS); } bool allows_snaps() const { return test_flag(CEPH_MDSMAP_ALLOW_SNAPS); } bool was_snaps_ever_allowed() const { return ever_allowed_features & CEPH_MDSMAP_ALLOW_SNAPS; } void set_multimds_snaps_allowed() { set_flag(CEPH_MDSMAP_ALLOW_MULTIMDS_SNAPS); ever_allowed_features |= CEPH_MDSMAP_ALLOW_MULTIMDS_SNAPS; explicitly_allowed_features |= CEPH_MDSMAP_ALLOW_MULTIMDS_SNAPS; } void clear_multimds_snaps_allowed() { clear_flag(CEPH_MDSMAP_ALLOW_MULTIMDS_SNAPS); } bool allows_multimds_snaps() const { return test_flag(CEPH_MDSMAP_ALLOW_MULTIMDS_SNAPS); } epoch_t get_epoch() const { return epoch; } void inc_epoch() { epoch++; } bool get_enabled() const { return enabled; } const utime_t& get_created() const { return created; } void set_created(utime_t ct) { modified = created = ct; } const utime_t& get_modified() const { return modified; } void set_modified(utime_t mt) { modified = mt; } epoch_t get_last_failure() const { return last_failure; } epoch_t get_last_failure_osd_epoch() const { return last_failure_osd_epoch; } mds_rank_t get_max_mds() const { return max_mds; } void set_max_mds(mds_rank_t m) { max_mds = m; } void set_old_max_mds() { old_max_mds = max_mds; } mds_rank_t get_old_max_mds() const { return old_max_mds; } mds_rank_t get_standby_count_wanted(mds_rank_t standby_daemon_count) const { assert(standby_daemon_count >= 0); std::set s; get_standby_replay_mds_set(s); mds_rank_t standbys_avail = (mds_rank_t)s.size()+standby_daemon_count; mds_rank_t wanted = std::max(0, standby_count_wanted); return wanted > standbys_avail ? wanted - standbys_avail : 0; } void set_standby_count_wanted(mds_rank_t n) { standby_count_wanted = n; } bool check_health(mds_rank_t standby_daemon_count); const std::string get_balancer() const { return balancer; } void set_balancer(std::string val) { balancer.assign(val); } mds_rank_t get_tableserver() const { return tableserver; } mds_rank_t get_root() const { return root; } const std::vector &get_data_pools() const { return data_pools; } int64_t get_first_data_pool() const { return *data_pools.begin(); } int64_t get_metadata_pool() const { return metadata_pool; } bool is_data_pool(int64_t poolid) const { auto p = std::find(data_pools.begin(), data_pools.end(), poolid); if (p == data_pools.end()) return false; return true; } bool pool_in_use(int64_t poolid) const { return get_enabled() && (is_data_pool(poolid) || metadata_pool == poolid); } const std::map& get_mds_info() const { return mds_info; } const mds_info_t& get_mds_info_gid(mds_gid_t gid) const { return mds_info.at(gid); } const mds_info_t& get_mds_info(mds_rank_t m) const { assert(up.count(m) && mds_info.count(up.at(m))); return mds_info.at(up.at(m)); } mds_gid_t find_mds_gid_by_name(std::string_view s) const { for (std::map::const_iterator p = mds_info.begin(); p != mds_info.end(); ++p) { if (p->second.name == s) { return p->first; } } return MDS_GID_NONE; } // counts unsigned get_num_in_mds() const { return in.size(); } unsigned get_num_up_mds() const { return up.size(); } mds_rank_t get_last_in_mds() const { auto p = in.rbegin(); return p == in.rend() ? MDS_RANK_NONE : *p; } int get_num_failed_mds() const { return failed.size(); } unsigned get_num_mds(int state) const { unsigned n = 0; for (std::map::const_iterator p = mds_info.begin(); p != mds_info.end(); ++p) if (p->second.state == state) ++n; return n; } // data pools void add_data_pool(int64_t poolid) { data_pools.push_back(poolid); } int remove_data_pool(int64_t poolid) { std::vector::iterator p = std::find(data_pools.begin(), data_pools.end(), poolid); if (p == data_pools.end()) return -ENOENT; data_pools.erase(p); return 0; } // sets void get_mds_set(std::set& s) const { s = in; } void get_up_mds_set(std::set& s) const { for (std::map::const_iterator p = up.begin(); p != up.end(); ++p) s.insert(p->first); } void get_active_mds_set(std::set& s) const { get_mds_set(s, MDSMap::STATE_ACTIVE); } void get_standby_replay_mds_set(std::set& s) const { get_mds_set(s, MDSMap::STATE_STANDBY_REPLAY); } void get_failed_mds_set(std::set& s) const { s = failed; } // features uint64_t get_up_features() { if (!cached_up_features) { bool first = true; for (std::map::const_iterator p = up.begin(); p != up.end(); ++p) { std::map::const_iterator q = mds_info.find(p->second); assert(q != mds_info.end()); if (first) { cached_up_features = q->second.mds_features; first = false; } else { cached_up_features &= q->second.mds_features; } } } return cached_up_features; } /** * Get MDS ranks which are in but not up. */ void get_down_mds_set(std::set *s) const { assert(s != NULL); s->insert(failed.begin(), failed.end()); s->insert(damaged.begin(), damaged.end()); } int get_failed() const { if (!failed.empty()) return *failed.begin(); return -1; } void get_stopped_mds_set(std::set& s) const { s = stopped; } void get_recovery_mds_set(std::set& s) const { s = failed; for (const auto& p : damaged) s.insert(p); for (const auto& p : mds_info) if (p.second.state >= STATE_REPLAY && p.second.state <= STATE_STOPPING) s.insert(p.second.rank); } void get_mds_set_lower_bound(std::set& s, DaemonState first) const { for (std::map::const_iterator p = mds_info.begin(); p != mds_info.end(); ++p) if (p->second.state >= first && p->second.state <= STATE_STOPPING) s.insert(p->second.rank); } void get_mds_set(std::set& s, DaemonState state) const { for (std::map::const_iterator p = mds_info.begin(); p != mds_info.end(); ++p) if (p->second.state == state) s.insert(p->second.rank); } void get_health(list >& summary, list > *detail) const; void get_health_checks(health_check_map_t *checks) const; typedef enum { AVAILABLE = 0, TRANSIENT_UNAVAILABLE = 1, STUCK_UNAVAILABLE = 2 } availability_t; /** * Return indication of whether cluster is available. This is a * heuristic for clients to see if they should bother waiting to talk to * MDSs, or whether they should error out at startup/mount. * * A TRANSIENT_UNAVAILABLE result indicates that the cluster is in a * transition state like replaying, or is potentially about the fail over. * Clients should wait for an updated map before making a final decision * about whether the filesystem is mountable. * * A STUCK_UNAVAILABLE result indicates that we can't see a way that * the cluster is about to recover on its own, so it'll probably require * administrator intervention: clients should probaly not bother trying * to mount. */ availability_t is_cluster_available() const; /** * Return whether this MDSMap is suitable for resizing based on the state * of the ranks. */ bool is_resizeable() const { return !is_degraded() && get_num_mds(CEPH_MDS_STATE_CREATING) == 0 && get_num_mds(CEPH_MDS_STATE_STARTING) == 0 && get_num_mds(CEPH_MDS_STATE_STOPPING) == 0; } // mds states bool is_down(mds_rank_t m) const { return up.count(m) == 0; } bool is_up(mds_rank_t m) const { return up.count(m); } bool is_in(mds_rank_t m) const { return up.count(m) || failed.count(m); } bool is_out(mds_rank_t m) const { return !is_in(m); } bool is_failed(mds_rank_t m) const { return failed.count(m); } bool is_stopped(mds_rank_t m) const { return stopped.count(m); } bool is_dne(mds_rank_t m) const { return in.count(m) == 0; } bool is_dne_gid(mds_gid_t gid) const { return mds_info.count(gid) == 0; } /** * Get MDS rank state if the rank is up, else STATE_NULL */ DaemonState get_state(mds_rank_t m) const { std::map::const_iterator u = up.find(m); if (u == up.end()) return STATE_NULL; return get_state_gid(u->second); } /** * Get MDS daemon status by GID */ DaemonState get_state_gid(mds_gid_t gid) const { std::map::const_iterator i = mds_info.find(gid); if (i == mds_info.end()) return STATE_NULL; return i->second.state; } const mds_info_t& get_info(const mds_rank_t m) const { return mds_info.at(up.at(m)); } const mds_info_t& get_info_gid(const mds_gid_t gid) const { return mds_info.at(gid); } bool is_boot(mds_rank_t m) const { return get_state(m) == STATE_BOOT; } bool is_creating(mds_rank_t m) const { return get_state(m) == STATE_CREATING; } bool is_starting(mds_rank_t m) const { return get_state(m) == STATE_STARTING; } bool is_replay(mds_rank_t m) const { return get_state(m) == STATE_REPLAY; } bool is_resolve(mds_rank_t m) const { return get_state(m) == STATE_RESOLVE; } bool is_reconnect(mds_rank_t m) const { return get_state(m) == STATE_RECONNECT; } bool is_rejoin(mds_rank_t m) const { return get_state(m) == STATE_REJOIN; } bool is_clientreplay(mds_rank_t m) const { return get_state(m) == STATE_CLIENTREPLAY; } bool is_active(mds_rank_t m) const { return get_state(m) == STATE_ACTIVE; } bool is_stopping(mds_rank_t m) const { return get_state(m) == STATE_STOPPING; } bool is_active_or_stopping(mds_rank_t m) const { return is_active(m) || is_stopping(m); } bool is_clientreplay_or_active_or_stopping(mds_rank_t m) const { return is_clientreplay(m) || is_active(m) || is_stopping(m); } bool is_followable(mds_rank_t m) const { return (is_resolve(m) || is_replay(m) || is_rejoin(m) || is_clientreplay(m) || is_active(m) || is_stopping(m)); } bool is_laggy_gid(mds_gid_t gid) const { if (!mds_info.count(gid)) return false; std::map::const_iterator p = mds_info.find(gid); return p->second.laggy(); } // degraded = some recovery in process. fixes active membership and // recovery_set. bool is_degraded() const { if (!failed.empty() || !damaged.empty()) return true; for (std::map::const_iterator p = mds_info.begin(); p != mds_info.end(); ++p) if (p->second.state >= STATE_REPLAY && p->second.state <= STATE_CLIENTREPLAY) return true; return false; } bool is_any_failed() const { return failed.size(); } bool is_resolving() const { return get_num_mds(STATE_RESOLVE) > 0 && get_num_mds(STATE_REPLAY) == 0 && failed.empty() && damaged.empty(); } bool is_rejoining() const { // nodes are rejoining cache state return get_num_mds(STATE_REJOIN) > 0 && get_num_mds(STATE_REPLAY) == 0 && get_num_mds(STATE_RECONNECT) == 0 && get_num_mds(STATE_RESOLVE) == 0 && failed.empty() && damaged.empty(); } bool is_stopped() const { return up.empty(); } /** * Get whether a rank is 'up', i.e. has * an MDS daemon's entity_inst_t associated * with it. */ bool have_inst(mds_rank_t m) const { return up.count(m); } /** * Get the MDS daemon entity_inst_t for a rank * known to be up. */ entity_addrvec_t get_addrs(mds_rank_t m) const { return mds_info.at(up.at(m)).get_addrs(); } mds_rank_t get_rank_gid(mds_gid_t gid) const { if (mds_info.count(gid)) { return mds_info.at(gid).rank; } else { return MDS_RANK_NONE; } } int get_inc_gid(mds_gid_t gid) const { auto mds_info_entry = mds_info.find(gid); if (mds_info_entry != mds_info.end()) return mds_info_entry->second.inc; return -1; } void encode(bufferlist& bl, uint64_t features) const; void decode(bufferlist::const_iterator& p); void decode(const bufferlist& bl) { auto p = bl.cbegin(); decode(p); } void sanitize(const std::function& pool_exists); void print(ostream& out) const; void print_summary(Formatter *f, ostream *out) const; void dump(Formatter *f) const; static void generate_test_instances(list& ls); static bool state_transition_valid(DaemonState prev, DaemonState next); }; WRITE_CLASS_ENCODER_FEATURES(MDSMap::mds_info_t) WRITE_CLASS_ENCODER_FEATURES(MDSMap) inline ostream& operator<<(ostream &out, const MDSMap &m) { m.print_summary(NULL, &out); return out; } #endif