diff options
Diffstat (limited to 'src/messages')
-rw-r--r-- | src/messages/MClientCaps.h | 15 | ||||
-rw-r--r-- | src/messages/MMDSBeacon.h | 40 | ||||
-rw-r--r-- | src/messages/MMDSQuiesceDbAck.h | 17 | ||||
-rw-r--r-- | src/messages/MMDSQuiesceDbListing.h | 17 | ||||
-rw-r--r-- | src/messages/MMDSScrubStats.h | 109 | ||||
-rw-r--r-- | src/messages/MNVMeofGwMap.h | 2 | ||||
-rw-r--r-- | src/messages/MOSDPGPCT.h | 99 | ||||
-rw-r--r-- | src/messages/MOSDPGUpdateLogMissing.h | 28 | ||||
-rw-r--r-- | src/messages/MOSDRepOp.h | 47 |
9 files changed, 325 insertions, 49 deletions
diff --git a/src/messages/MClientCaps.h b/src/messages/MClientCaps.h index 96b2cb7d8b8..b001032225e 100644 --- a/src/messages/MClientCaps.h +++ b/src/messages/MClientCaps.h @@ -117,9 +117,9 @@ private: void set_ctime(const utime_t &t) { ctime = t; } void set_atime(const utime_t &t) { atime = t; } - void set_cap_peer(uint64_t id, ceph_seq_t seq, ceph_seq_t mseq, int mds, int flags) { + void set_cap_peer(uint64_t id, ceph_seq_t issue_seq, ceph_seq_t mseq, int mds, int flags) { peer.cap_id = id; - peer.seq = seq; + peer.issue_seq = issue_seq; peer.mseq = mseq; peer.mds = mds; peer.flags = flags; @@ -137,11 +137,12 @@ protected: inodeno_t ino, inodeno_t realm, uint64_t id, - long seq, + ceph_seq_t seq, int caps, int wanted, int dirty, - int mseq, + ceph_seq_t mseq, + ceph_seq_t issue_seq, epoch_t oeb) : SafeMessage{CEPH_MSG_CLIENT_CAPS, HEAD_VERSION, COMPAT_VERSION}, osd_epoch_barrier(oeb) { @@ -155,11 +156,12 @@ protected: head.wanted = wanted; head.dirty = dirty; head.migrate_seq = mseq; + head.issue_seq = issue_seq; memset(&peer, 0, sizeof(peer)); } MClientCaps(int op, inodeno_t ino, inodeno_t realm, - uint64_t id, int mseq, epoch_t oeb) + uint64_t id, ceph_seq_t mseq, epoch_t oeb) : SafeMessage{CEPH_MSG_CLIENT_CAPS, HEAD_VERSION, COMPAT_VERSION}, osd_epoch_barrier(oeb) { memset(&head, 0, sizeof(head)); @@ -181,7 +183,8 @@ public: out << "client_caps(" << ceph_cap_op_name(head.op) << " ino " << inodeno_t(head.ino) << " " << head.cap_id - << " seq " << head.seq; + << " seq " << head.seq + << " issue_seq " << head.issue_seq; if (get_tid()) out << " tid " << get_tid(); out << " caps=" << ccap_string(head.caps) diff --git a/src/messages/MMDSBeacon.h b/src/messages/MMDSBeacon.h index b87c3153af3..526285aae8c 100644 --- a/src/messages/MMDSBeacon.h +++ b/src/messages/MMDSBeacon.h @@ -48,6 +48,7 @@ enum mds_metric_t { MDS_HEALTH_CLIENTS_LAGGY, MDS_HEALTH_CLIENTS_LAGGY_MANY, MDS_HEALTH_CLIENTS_BROKEN_ROOTSQUASH, + MDS_HEALTH_ESTIMATED_REPLAY_TIME, MDS_HEALTH_DUMMY, // not a real health warning, for testing }; @@ -69,6 +70,7 @@ inline const char *mds_metric_name(mds_metric_t m) case MDS_HEALTH_CLIENTS_LAGGY: return "MDS_CLIENTS_LAGGY"; case MDS_HEALTH_CLIENTS_LAGGY_MANY: return "MDS_CLIENTS_LAGGY_MANY"; case MDS_HEALTH_CLIENTS_BROKEN_ROOTSQUASH: return "MDS_CLIENTS_BROKEN_ROOTSQUASH"; + case MDS_HEALTH_ESTIMATED_REPLAY_TIME: return "MDS_ESTIMATED_REPLAY_TIME"; case MDS_HEALTH_DUMMY: return "MDS_DUMMY"; default: return "???"; @@ -107,6 +109,8 @@ inline const char *mds_metric_summary(mds_metric_t m) return "%num% client(s) laggy due to laggy OSDs"; case MDS_HEALTH_CLIENTS_BROKEN_ROOTSQUASH: return "%num% MDS report clients with broken root_squash implementation"; + case MDS_HEALTH_ESTIMATED_REPLAY_TIME: + return "%num% estimated journal replay time"; default: return "???"; } @@ -155,6 +159,25 @@ struct MDSHealthMetric DECODE_FINISH(bl); } + void dump(ceph::Formatter *f) const { + f->dump_string("type", mds_metric_name(type)); + f->dump_stream("sev") << sev; + f->dump_string("message", message); + f->open_object_section("metadata"); + for (auto& i : metadata) { + f->dump_string(i.first.c_str(), i.second); + } + f->close_section(); + } + + static void generate_test_instances(std::list<MDSHealthMetric*>& ls) { + ls.push_back(new MDSHealthMetric()); + ls.back()->type = MDS_HEALTH_CACHE_OVERSIZED; + ls.push_back(new MDSHealthMetric(MDS_HEALTH_TRIM, HEALTH_WARN, "MDS is behind on trimming")); + ls.back()->metadata["mds"] = "a"; + ls.back()->metadata["num"] = "1"; + } + bool operator==(MDSHealthMetric const &other) const { return (type == other.type && sev == other.sev && message == other.message); @@ -187,6 +210,23 @@ struct MDSHealth DECODE_FINISH(bl); } + void dump(ceph::Formatter *f) const { + f->open_array_section("metrics"); + for (auto& i : metrics) { + f->open_object_section("metric"); + i.dump(f); + f->close_section(); + } + f->close_section(); + } + + static void generate_test_instances(std::list<MDSHealth*>& ls) { + ls.push_back(new MDSHealth); + ls.push_back(new MDSHealth); + ls.back()->metrics.push_back(MDSHealthMetric(MDS_HEALTH_TRIM, HEALTH_WARN, + "MDS is behind on trimming")); + } + bool operator==(MDSHealth const &other) const { return metrics == other.metrics; diff --git a/src/messages/MMDSQuiesceDbAck.h b/src/messages/MMDSQuiesceDbAck.h index 48bc5e37a86..270802268a4 100644 --- a/src/messages/MMDSQuiesceDbAck.h +++ b/src/messages/MMDSQuiesceDbAck.h @@ -21,21 +21,20 @@ class MMDSQuiesceDbAck final : public MMDSOp { protected: MMDSQuiesceDbAck() : MMDSOp{MSG_MDS_QUIESCE_DB_ACK} {} + MMDSQuiesceDbAck(auto&& _ack) + : MMDSOp{MSG_MDS_QUIESCE_DB_ACK} + , ack(std::forward<decltype(_ack)>(_ack)) + {} ~MMDSQuiesceDbAck() final {} public: std::string_view get_type_name() const override { return "mds_quiesce_db_ack"; } void print(std::ostream& o) const override { - + o << get_type_name(); } void encode_payload(uint64_t features) override { - // noop to prevent unnecessary overheads - } - - void encode_payload_from(QuiesceDbPeerAck const& ack) - { ::encode(ack, payload); } @@ -43,10 +42,10 @@ public: // noop to prevent unnecessary overheads } - void decode_payload_into(QuiesceDbPeerAck &ack) const + void decode_payload_into(QuiesceDbPeerAck &_ack) const { auto p = payload.cbegin(); - ::decode(ack, p); + ::decode(_ack, p); } private: @@ -54,4 +53,6 @@ private: friend boost::intrusive_ptr<T> ceph::make_message(Args&&... args); template<class T, typename... Args> friend MURef<T> crimson::make_message(Args&&... args); + + QuiesceDbPeerAck ack; }; diff --git a/src/messages/MMDSQuiesceDbListing.h b/src/messages/MMDSQuiesceDbListing.h index 39d72fb8eb4..b3be5bb8f22 100644 --- a/src/messages/MMDSQuiesceDbListing.h +++ b/src/messages/MMDSQuiesceDbListing.h @@ -21,20 +21,19 @@ class MMDSQuiesceDbListing final : public MMDSOp { protected: MMDSQuiesceDbListing() : MMDSOp{MSG_MDS_QUIESCE_DB_LISTING} {} + MMDSQuiesceDbListing(auto&& _pl) + : MMDSOp{MSG_MDS_QUIESCE_DB_LISTING} + , peer_listing(std::forward<decltype(_pl)>(_pl)) + {} ~MMDSQuiesceDbListing() final {} public: std::string_view get_type_name() const override { return "mds_quiesce_db_listing"; } void print(std::ostream& o) const override { - + o << get_type_name(); } void encode_payload(uint64_t features) override { - // noop to prevent unnecessary overheads - } - - void encode_payload_from(QuiesceDbPeerListing const& peer_listing) - { ::encode(peer_listing, payload); } @@ -42,10 +41,10 @@ public: // noop to prevent unnecessary overheads } - void decode_payload_into(QuiesceDbPeerListing &peer_listing) const + void decode_payload_into(QuiesceDbPeerListing &pl) const { auto p = payload.cbegin(); - ::decode(peer_listing, p); + ::decode(pl, p); } private: @@ -53,4 +52,6 @@ private: friend boost::intrusive_ptr<T> ceph::make_message(Args&&... args); template<class T, typename... Args> friend MURef<T> crimson::make_message(Args&&... args); + + QuiesceDbPeerListing peer_listing; }; diff --git a/src/messages/MMDSScrubStats.h b/src/messages/MMDSScrubStats.h index 2cbb7f2f211..9252f99ac8a 100644 --- a/src/messages/MMDSScrubStats.h +++ b/src/messages/MMDSScrubStats.h @@ -18,7 +18,7 @@ #include "messages/MMDSOp.h" class MMDSScrubStats : public MMDSOp { - static constexpr int HEAD_VERSION = 1; + static constexpr int HEAD_VERSION = 2; static constexpr int COMPAT_VERSION = 1; public: @@ -38,6 +38,15 @@ public: bool is_finished(const std::string& tag) const { return update_scrubbing && !scrubbing_tags.count(tag); } + const std::unordered_map<std::string, std::unordered_map<int, std::vector<_inodeno_t>>>& get_uninline_failed_meta_info() const { + return uninline_failed_meta_info; + } + const std::unordered_map<_inodeno_t, std::string>& get_paths() const { + return paths; + } + const std::unordered_map<std::string, std::vector<uint64_t>>& get_counters() const { + return counters; + } void encode_payload(uint64_t features) override { using ceph::encode; @@ -45,6 +54,7 @@ public: encode(scrubbing_tags, payload); encode(update_scrubbing, payload); encode(aborting, payload); + encode_uninline_failed_info(); } void decode_payload() override { using ceph::decode; @@ -53,6 +63,90 @@ public: decode(scrubbing_tags, p); decode(update_scrubbing, p); decode(aborting, p); + if (header.version >= 2) { + decode_uninline_failed_info(p); + } + } + + void encode_uninline_failed_info() { + using ceph::encode; + int count = (int)uninline_failed_meta_info.size(); + encode(count, payload); + for (const auto& [tag, meta_info_map] : uninline_failed_meta_info) { + encode(tag, payload); + count = (int)meta_info_map.size(); + encode(count, payload); + for (const auto& [error_code, ino_vec] : meta_info_map) { + encode(error_code, payload); + encode(ino_vec, payload); + } + } + count = (int)paths.size(); + encode(count, payload); + for (auto& [ino, path] : paths) { + encode(ino, payload); + encode(path, payload); + } + count = (int)counters.size(); + encode(count, payload); + for (auto& [tag, v] : counters) { + encode(tag, payload); + uint64_t started = v[0]; + uint64_t passed = v[1]; + uint64_t failed = v[2]; + uint64_t skipped = v[3]; + + encode(started, payload); + encode(passed, payload); + encode(failed, payload); + encode(skipped, payload); + } + } + void decode_uninline_failed_info(ceph::bufferlist::const_iterator& p) { + using ceph::decode; + int tag_count = 0; + decode(tag_count, p); + while (tag_count--) { + std::string tag; + decode(tag, p); + int count = 0; + decode(count, p); + std::unordered_map<int, std::vector<_inodeno_t>> uninline_failed_info; + while (count--) { + int error_code; + std::vector<_inodeno_t> ino_vec; + decode(error_code, p); + decode(ino_vec, p); + uninline_failed_info[error_code] = std::move(ino_vec); + } + uninline_failed_meta_info[tag] = std::move(uninline_failed_info); + } + int count = 0; + decode(count, p); + while (count--) { + _inodeno_t ino; + std::string path; + decode(ino, p); + decode(path, p); + paths[ino] = path; + } + count = 0; + decode(count, p); + while (count--) { + std::string tag; + decode(tag, p); + uint64_t started = 0; + uint64_t passed = 0; + uint64_t failed = 0; + uint64_t skipped = 0; + + decode(started, p); + decode(passed, p); + decode(failed, p); + decode(skipped, p); + std::vector<uint64_t> c{started, passed, failed, skipped}; + counters[tag] = c; + } } protected: @@ -65,6 +159,15 @@ protected: MMDSScrubStats(unsigned e, const std::set<std::string>& tags, bool abrt=false) : MMDSOp(MSG_MDS_SCRUB_STATS, HEAD_VERSION, COMPAT_VERSION), epoch(e), scrubbing_tags(tags), update_scrubbing(true), aborting(abrt) {} + MMDSScrubStats(unsigned e, const std::set<std::string>& tags, + std::unordered_map<std::string, std::unordered_map<int, std::vector<_inodeno_t>>>&& ufmi, + std::unordered_map<_inodeno_t, std::string>&& paths_, + std::unordered_map<std::string, std::vector<uint64_t>>&& counters_, + bool abrt = false) : + MMDSOp(MSG_MDS_SCRUB_STATS, HEAD_VERSION, COMPAT_VERSION), + epoch(e), scrubbing_tags(tags), update_scrubbing(true), aborting(abrt), + uninline_failed_meta_info(std::move(ufmi)), paths(std::move(paths_)), + counters(std::move(counters_)) {} ~MMDSScrubStats() override {} private: @@ -72,6 +175,10 @@ private: std::set<std::string> scrubbing_tags; bool update_scrubbing = false; bool aborting = false; + // <tag, <error_code, [ino1, ino2, ...]>> + std::unordered_map<std::string, std::unordered_map<int, std::vector<_inodeno_t>>> uninline_failed_meta_info; + std::unordered_map<_inodeno_t, std::string> paths; + std::unordered_map<std::string, std::vector<uint64_t>> counters; template<class T, typename... Args> friend boost::intrusive_ptr<T> ceph::make_message(Args&&... args); diff --git a/src/messages/MNVMeofGwMap.h b/src/messages/MNVMeofGwMap.h index 3affdd250dc..efa0e91cbe4 100644 --- a/src/messages/MNVMeofGwMap.h +++ b/src/messages/MNVMeofGwMap.h @@ -56,7 +56,7 @@ public: using ceph::encode; encode(VERSION, payload); encode(gwmap_epoch, payload); - encode(map, payload); + encode(map, payload, features); } private: using RefCountedObject::put; diff --git a/src/messages/MOSDPGPCT.h b/src/messages/MOSDPGPCT.h new file mode 100644 index 00000000000..b3f88314ec3 --- /dev/null +++ b/src/messages/MOSDPGPCT.h @@ -0,0 +1,99 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2024 IBM, Red Hat + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#pragma once + + +#include "MOSDFastDispatchOp.h" + +class MOSDPGPCT final : public MOSDFastDispatchOp { +private: + static constexpr int HEAD_VERSION = 1; + static constexpr int COMPAT_VERSION = 1; + +public: + /// epoch at which the message was sent + epoch_t map_epoch = 0; + + /// start epoch of the interval in which the message was sent + epoch_t min_epoch = 0; + + /// target pg + spg_t pgid; + + /** + * pg_committed_to + * + * Propagates PeeringState::pg_committed_to to replicas as with + * MOSDRepOp, ECSubWrite, MOSDPGPCT. + */ + eversion_t pg_committed_to; + + epoch_t get_map_epoch() const override { + return map_epoch; + } + epoch_t get_min_epoch() const override { + return min_epoch; + } + spg_t get_spg() const override { + return pgid; + } + + MOSDPGPCT() + : MOSDFastDispatchOp{MSG_OSD_PG_PCT, HEAD_VERSION, + COMPAT_VERSION} {} + MOSDPGPCT( + spg_t pgid, + epoch_t epoch, + epoch_t min_epoch, + eversion_t pg_committed_to) + : MOSDFastDispatchOp{MSG_OSD_PG_PCT, HEAD_VERSION, + COMPAT_VERSION}, + map_epoch(epoch), + min_epoch(min_epoch), + pgid(pgid), + pg_committed_to(pg_committed_to) + {} + +private: + ~MOSDPGPCT() final {} + +public: + std::string_view get_type_name() const override { return "PGPCT"; } + void print(std::ostream& out) const override { + out << "pg_pct(" << pgid << " epoch " << map_epoch + << "/" << min_epoch + << " pg_committed_to " << pg_committed_to + << ")"; + } + + void encode_payload(uint64_t features) override { + using ceph::encode; + encode(map_epoch, payload); + encode(min_epoch, payload); + encode(pgid, payload); + encode(pg_committed_to, payload); + } + void decode_payload() override { + using ceph::decode; + auto p = payload.cbegin(); + decode(map_epoch, p); + decode(min_epoch, p); + decode(pgid, p); + decode(pg_committed_to, p); + } +private: + template<class T, typename... Args> + friend boost::intrusive_ptr<T> ceph::make_message(Args&&... args); +}; diff --git a/src/messages/MOSDPGUpdateLogMissing.h b/src/messages/MOSDPGUpdateLogMissing.h index 2a0011e8fb7..ebe678c6c31 100644 --- a/src/messages/MOSDPGUpdateLogMissing.h +++ b/src/messages/MOSDPGUpdateLogMissing.h @@ -31,7 +31,23 @@ public: mempool::osd_pglog::list<pg_log_entry_t> entries; // piggybacked osd/pg state eversion_t pg_trim_to; // primary->replica: trim to here - eversion_t pg_roll_forward_to; // primary->replica: trim rollback info to here + + /** + * pg_committed_to + * + * Propagates PeeringState::pg_committed_to to replicas as with + * MOSDRepOp, ECSubWrite + * + * Historical Note: Prior to early 2024, this field was named + * pg_roll_forward_to. pg_committed_to is a safe value to rollforward to as + * it is a conservative bound on versions that can become divergent. Switching + * it to be populated by pg_committed_to rather than mlcod mirrors MOSDRepOp + * and upgrade cases in both directions should be safe as mlcod is <= pct + * and replicas (both ec and replicated) only actually rely on versions <= this + * field being non-divergent. This note may be removed in main after U is + * released. + */ + eversion_t pg_committed_to; epoch_t get_epoch() const { return map_epoch; } spg_t get_pgid() const { return pgid; } @@ -59,7 +75,7 @@ public: epoch_t min_epoch, ceph_tid_t rep_tid, eversion_t pg_trim_to, - eversion_t pg_roll_forward_to) + eversion_t pg_committed_to) : MOSDFastDispatchOp{MSG_OSD_PG_UPDATE_LOG_MISSING, HEAD_VERSION, COMPAT_VERSION}, map_epoch(epoch), @@ -69,7 +85,7 @@ public: rep_tid(rep_tid), entries(entries), pg_trim_to(pg_trim_to), - pg_roll_forward_to(pg_roll_forward_to) + pg_committed_to(pg_committed_to) {} private: @@ -83,7 +99,7 @@ public: << " rep_tid " << rep_tid << " entries " << entries << " trim_to " << pg_trim_to - << " roll_forward_to " << pg_roll_forward_to + << " pg_committed_to " << pg_committed_to << ")"; } @@ -96,7 +112,7 @@ public: encode(entries, payload); encode(min_epoch, payload); encode(pg_trim_to, payload); - encode(pg_roll_forward_to, payload); + encode(pg_committed_to, payload); } void decode_payload() override { using ceph::decode; @@ -113,7 +129,7 @@ public: } if (header.version >= 3) { decode(pg_trim_to, p); - decode(pg_roll_forward_to, p); + decode(pg_committed_to, p); } } private: diff --git a/src/messages/MOSDRepOp.h b/src/messages/MOSDRepOp.h index ecfe3294d1c..5e8b386ba0a 100644 --- a/src/messages/MOSDRepOp.h +++ b/src/messages/MOSDRepOp.h @@ -54,7 +54,30 @@ public: // piggybacked osd/og state eversion_t pg_trim_to; // primary->replica: trim to here - eversion_t min_last_complete_ondisk; // lower bound on committed version + + /** + * pg_committed_to + * + * Used by the primary to propagate pg_committed_to to replicas for use in + * serving replica reads. + * + * Because updates <= pg_committed_to cannot become divergent, replicas + * may safely serve reads on objects which do not have more recent updates. + * + * See PeeringState::pg_committed_to, PeeringState::can_serve_replica_read + * + * Historical note: Prior to early 2024, this field was named + * min_last_complete_ondisk. The replica, however, only actually relied on + * a single property of this field -- that any objects not modified since + * mlcod couldn't have uncommitted state. Weakening the field to the condition + * above is therefore safe -- mlcod is always <= pg_committed_to and + * sending pg_committed_to to a replica expecting mlcod will work correctly + * as it only actually uses mlcod to check replica reads. The primary difference + * between mlcod and pg_committed_to is simply that mlcod doesn't advance past + * objects missing on replicas, but we check for that anyway. This note may be + * removed in main after U is released. + */ + eversion_t pg_committed_to; hobject_t new_temp_oid; ///< new temp object that we must now start tracking hobject_t discard_temp_oid; ///< previously used temp object that we can now stop tracking @@ -110,14 +133,8 @@ public: decode(from, p); decode(updated_hit_set_history, p); - if (header.version >= 3) { - decode(min_last_complete_ondisk, p); - } else { - /* This field used to mean pg_roll_foward_to, but ReplicatedBackend - * simply assumes that we're rolling foward to version. */ - eversion_t pg_roll_forward_to; - decode(pg_roll_forward_to, p); - } + ceph_assert(header.version >= 3); + decode(pg_committed_to, p); final_decode_needed = false; } @@ -141,7 +158,7 @@ public: encode(discard_temp_oid, payload); encode(from, payload); encode(updated_hit_set_history, payload); - encode(min_last_complete_ondisk, payload); + encode(pg_committed_to, payload); } MOSDRepOp() @@ -164,10 +181,6 @@ public: set_tid(rtid); } - void set_rollback_to(const eversion_t &rollback_to) { - header.version = 2; - min_last_complete_ondisk = rollback_to; - } private: ~MOSDRepOp() final {} @@ -180,11 +193,7 @@ public: out << " " << poid << " v " << version; if (updated_hit_set_history) out << ", has_updated_hit_set_history"; - if (header.version < 3) { - out << ", rollback_to(legacy)=" << min_last_complete_ondisk; - } else { - out << ", mlcod=" << min_last_complete_ondisk; - } + out << ", pct=" << pg_committed_to; } out << ")"; } |