Diffstat (limited to 'src/osd/scrubber/pg_scrubber.h')
-rw-r--r-- | src/osd/scrubber/pg_scrubber.h | 196
1 file changed, 116 insertions, 80 deletions
diff --git a/src/osd/scrubber/pg_scrubber.h b/src/osd/scrubber/pg_scrubber.h
index e970bd7219d..0d9e8c1e9f6 100644
--- a/src/osd/scrubber/pg_scrubber.h
+++ b/src/osd/scrubber/pg_scrubber.h
@@ -75,6 +75,8 @@ Main Scrubber interfaces:
 #include <string_view>
 #include <vector>
 
+#include "common/config_proxy.h"
+#include "common/config_cacher.h"
 #include "osd/PG.h"
 #include "osd/scrubber_common.h"
 
@@ -143,29 +145,48 @@ struct scrub_flags_t {
 
   unsigned int priority{0};
 
   /**
-   * set by queue_scrub() if either planned_scrub.auto_repair or
-   * need_auto were set.
-   * Tested at scrub end.
+   * set by set_op_parameters() for deep scrubs, if the hardware
+   * supports auto repairing and osd_scrub_auto_repair is enabled.
    */
   bool auto_repair{false};
 
   /// this flag indicates that we are scrubbing post repair to verify everything
-  /// is fixed
+  /// is fixed (otherwise - PG_STATE_FAILED_REPAIR will be asserted.)
+  /// Update (July 2024): now reflects an 'after-repair' urgency.
   bool check_repair{false};
 
   /// checked at the end of the scrub, to possibly initiate a deep-scrub
   bool deep_scrub_on_error{false};
-
-  /**
-   * scrub must not be aborted.
-   * Set for explicitly requested scrubs, and for scrubs originated by the
-   * pairing process with the 'repair' flag set (in the RequestScrub event).
-   */
-  bool required{false};
 };
 
 ostream& operator<<(ostream& out, const scrub_flags_t& sf);
 
+namespace fmt {
+template <>
+struct formatter<scrub_flags_t> {
+  constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); }
+  template <typename FormatContext>
+  auto format(const scrub_flags_t& sf, FormatContext& ctx) const
+  {
+    std::string txt;
+    bool sep{false};
+    if (sf.auto_repair) {
+      txt = "auto-repair";
+      sep = true;
+    }
+    if (sf.check_repair) {
+      txt += sep ? ",check-repair" : "check-repair";
+      sep = true;
+    }
+    if (sf.deep_scrub_on_error) {
+      txt += sep ? ",deep-scrub-on-error" : "deep-scrub-on-error";
+      sep = true;
+    }
+    return fmt::format_to(ctx.out(), "{}", txt);
+  }
+};
+}  // namespace fmt
+
 
 /**
  * The part of PG-scrubbing code that isn't state-machine wiring.
@@ -188,15 +209,12 @@ class PgScrubber : public ScrubPgIF,
   [[nodiscard]] bool is_reserving() const final;
 
   Scrub::schedule_result_t start_scrub_session(
-      std::unique_ptr<Scrub::ScrubJob> candidate,
+      scrub_level_t s_or_d,
       Scrub::OSDRestrictions osd_restrictions,
-      Scrub::ScrubPGPreconds pg_cond,
-      const requested_scrub_t& requested_flags) final;
+      Scrub::ScrubPGPreconds pg_cond) final;
 
   void initiate_regular_scrub(epoch_t epoch_queued) final;
 
-  void initiate_scrub_after_repair(epoch_t epoch_queued) final;
-
   void send_scrub_resched(epoch_t epoch_queued) final;
 
   void active_pushes_notification(epoch_t epoch_queued) final;
@@ -264,8 +282,15 @@ class PgScrubber : public ScrubPgIF,
   scrub_level_t scrub_requested(
       scrub_level_t scrub_level,
-      scrub_type_t scrub_type,
-      requested_scrub_t& req_flags) final;
+      scrub_type_t scrub_type) final;
+
+  /**
+   * let the scrubber know that a recovery operation has completed.
+   * This might trigger an 'after repair' scrub.
+   */
+  void recovery_completed() final;
+
+  bool is_after_repair_required() const final;
 
   /**
    *  Reserve local scrub resources (managed by the OSD)
   *
@@ -273,7 +298,7 @@
   * Fails if OSD's local-scrubs budget was exhausted
   * \returns were local resources reserved?
   */
-  bool reserve_local() final;
+  bool reserve_local(const Scrub::SchedTarget& trgt);
 
   void handle_query_state(ceph::Formatter* f) final;
 
@@ -286,11 +311,9 @@ class PgScrubber : public ScrubPgIF,
 
   void on_operator_forced_scrub(
       ceph::Formatter* f,
-      scrub_level_t scrub_level,
-      requested_scrub_t& request_flags) final;
+      scrub_level_t scrub_level) final;
 
-  void dump_scrubber(ceph::Formatter* f,
-      const requested_scrub_t& request_flags) const final;
+  void dump_scrubber(ceph::Formatter* f) const final;
 
   // used if we are a replica
 
@@ -339,12 +362,8 @@ class PgScrubber : public ScrubPgIF,
 
   /**
    * finalize the parameters of the initiated scrubbing session:
-   *
-   * The "current scrub" flags (m_flags) are set from the 'planned_scrub'
-   * flag-set; PG_STATE_SCRUBBING, and possibly PG_STATE_DEEP_SCRUB &
-   * PG_STATE_REPAIR are set.
    */
-  void set_op_parameters(const requested_scrub_t& request) final;
+  void set_op_parameters(Scrub::ScrubPGPreconds pg_cond) final;
 
   void cleanup_store(ObjectStore::Transaction* t) final;
 
@@ -398,8 +417,8 @@ class PgScrubber : public ScrubPgIF,
     return m_pg->recovery_state.is_primary();
   }
 
-  /// is this scrub more than just regular periodic scrub?
-  [[nodiscard]] bool is_high_priority() const final;
+  /// is this scrub's urgency high enough, or must it reserve its replicas?
+  [[nodiscard]] bool is_reservation_required() const final;
 
   void set_state_name(const char* name) final
   {
@@ -506,7 +525,12 @@ class PgScrubber : public ScrubPgIF,
   /// control the order of construction/destruction.
   std::optional<Scrub::ScrubJob> m_scrub_job;
 
-  ostream& show(ostream& out) const override;
+
+  /// the scrubber has initiated a recovery, and is waiting for the recovery
+  /// to complete (in order to perform an 'after-repair' scrub)
+  bool m_after_repair_scrub_required{false};
+
+  ostream& show_concise(ostream& out) const override;
 
  public:
   // ------------------ the I/F used by the ScrubBackend (ScrubBeListener)
@@ -565,14 +589,27 @@ class PgScrubber : public ScrubPgIF,
   void run_callbacks();
 
   // 'query' command data for an active scrub
-  void dump_active_scrubber(ceph::Formatter* f, bool is_deep) const;
+  void dump_active_scrubber(ceph::Formatter* f) const;
+
+  /**
+   * Used as a parameter of requeue_penalized() to indicate whether the
+   * both targets of this PG should be delayed (and not just the named one).
+   */
+  enum class delay_both_targets_t { no, yes };
 
   /**
    * move the 'not before' to a later time (with a delay amount that is
   * based on the delay cause). Also saves the cause.
-   * Pushes the updated scrub-job into the OSD's queue.
+   * Pushes the updated scheduling entry into the OSD's queue.
+   * @param s_or_d - the specific target (shallow or deep) to delay;
+   * @param delay_both - should both targets be delayed? note - the
+   *   'other' target will not be delayed if it has higher priority.
    */
-  void requeue_penalized(Scrub::delay_cause_t cause);
+  void requeue_penalized(
+      scrub_level_t s_or_d,
+      delay_both_targets_t delay_both,
+      Scrub::delay_cause_t cause,
+      utime_t scrub_clock_now);
 
   // ----- methods used to verify the relevance of incoming events:
 
@@ -706,12 +743,14 @@ class PgScrubber : public ScrubPgIF,
   bool m_publish_sessions{false};  //< will the counter be part of 'query'
                                    //output?
 
+  /**
+   * the scrub operation flags.
+   * Set at scrub start. Checked in multiple locations - mostly
+   * at finish.
+   * Note: replicas only use the 'priority' field.
+   */
   scrub_flags_t m_flags;
 
-  /// a reference to the details of the next scrub (as requested and managed by
-  /// the PG)
-  requested_scrub_t& m_planned_scrub;
-
   bool m_active{false};
 
   /**
@@ -732,19 +771,24 @@ class PgScrubber : public ScrubPgIF,
    */
   bool m_queued_or_active{false};
 
-  /**
-   * A copy of the specific scheduling target (either shallow_target or
-   * deep_target in the scrub_job) that was selected for this active scrub
-   * session.
-   * \ATTN: in this initial step - a copy of the whole scrub-job is passed
-   * around. Later on this would be just a part of a Scrub::SchedTarget
-   */
-  std::unique_ptr<Scrub::ScrubJob> m_active_target;
+  /// A copy of the specific scheduling target (either shallow_target or
+  /// deep_target in the scrub_job) that was selected for this active scrub
+  std::optional<Scrub::SchedTarget> m_active_target;
 
   eversion_t m_subset_last_update{};
 
   std::unique_ptr<Scrub::Store> m_store;
 
+  /**
+   * the ScrubStore sub-object caches and manages the database of known
+   * scrub errors. reinit_scrub_store() clears the database and re-initializes
+   * the ScrubStore object.
+   *
+   * in the next iteration - reinit_..() potentially deletes only the
+   * shallow errors part of the database.
+   */
+  void reinit_scrub_store();
+
   int num_digest_updates_pending{0};
 
   hobject_t m_start, m_end;  ///< note: half-closed: [start,end)
@@ -800,7 +844,7 @@ class PgScrubber : public ScrubPgIF,
   /**
    * initiate a deep-scrub after the current scrub ended with errors.
   */
-  void request_rescrubbing(requested_scrub_t& req_flags);
+  void request_rescrubbing();
 
   /**
   * combine cluster & pool configuration options into a single struct
@@ -809,40 +853,11 @@ class PgScrubber : public ScrubPgIF,
   Scrub::sched_conf_t populate_config_params() const;
 
   /**
-   * determine the time when the next scrub should be scheduled
-   *
-   * based on the planned scrub's flags, time of last scrub, and
-   * the pool's scrub configuration. This is only an initial "proposal",
-   * and will be further adjusted based on the scheduling parameters.
-   */
-  Scrub::sched_params_t determine_initial_schedule(
-      const Scrub::sched_conf_t& app_conf,
-      utime_t scrub_clock_now) const;
-
-  /// should we perform deep scrub?
-  bool is_time_for_deep(
-      Scrub::ScrubPGPreconds pg_cond,
-      const requested_scrub_t& planned) const;
-
-  /**
-   * Validate the various 'next scrub' flags against configuration
-   * and scrub-related timestamps.
-   *
-   * @returns an updated copy of the m_planned_flags (or nothing if no scrubbing)
+   * recompute the two ScrubJob targets, taking into account not
+   * only the up-to-date 'last' stamps, but also the 'urgency'
+   * attributes of both targets.
    */
-  std::optional<requested_scrub_t> validate_scrub_mode(
-      Scrub::OSDRestrictions osd_restrictions,
-      Scrub::ScrubPGPreconds pg_cond) const;
-
-  std::optional<requested_scrub_t> validate_periodic_mode(
-      Scrub::ScrubPGPreconds pg_cond,
-      bool time_for_deep,
-      const requested_scrub_t& planned) const;
-
-  std::optional<requested_scrub_t> validate_initiated_scrub(
-      Scrub::ScrubPGPreconds pg_cond,
-      bool time_for_deep,
-      const requested_scrub_t& planned) const;
+  void update_targets(utime_t scrub_clock_now);
 
   /*
    * Select a range of objects to scrub.
@@ -882,6 +897,24 @@ class PgScrubber : public ScrubPgIF,
   // scrub state.
   ceph::coarse_real_clock::time_point m_last_stat_upd{};
 
+  // ------------------ cached (frequently used) configuration values
+
+  /// initial (& max) number of objects to scrub in one pass - deep scrub
+  md_config_cacher_t<int64_t> osd_scrub_chunk_max;
+  /// initial (& max) number of objects to scrub in one pass - shallow
+  md_config_cacher_t<int64_t> osd_shallow_scrub_chunk_max;
+
+  /// chunk size won't be reduced (when preempted) below this
+  /// value (deep scrub)
+  md_config_cacher_t<int64_t> osd_scrub_chunk_min;
+  /// chunk size won't be reduced below this value (shallow scrub)
+  md_config_cacher_t<int64_t> osd_shallow_scrub_chunk_min;
+
+  /// stats update (publish_stats_to_osd()) interval while scrubbing
+  md_config_cacher_t<int64_t> osd_stats_update_period_scrubbing;
+  /// stats update interval while not scrubbing
+  md_config_cacher_t<int64_t> osd_stats_update_period_not_scrubbing;
+
   // ------------ members used if we are a replica
 
   epoch_t m_replica_min_epoch;  ///< the min epoch needed to handle this message
@@ -978,6 +1011,9 @@ class PgScrubber : public ScrubPgIF,
   mutable ceph::mutex m_preemption_lock = ceph::make_mutex("preemption_lock");
   bool m_preemptable{false};
   bool m_preempted{false};
+
+  /// the number of preemptions allowed before we start blocking
+  md_config_cacher_t<uint64_t> osd_scrub_max_preemptions;
   int m_left;
   size_t m_size_divisor{1};
   bool are_preemptions_left() const { return m_left > 0; }
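
Note: the hunk at +145 above adds a fmt::formatter specialization for scrub_flags_t. The following is a standalone sketch, not Ceph code: it trims scrub_flags_t down to its three booleans and assumes only that libfmt is available, just to show what the "{}" rendering produced by that specialization looks like.

// Standalone sketch (not part of the Ceph tree): a trimmed-down scrub_flags_t
// plus the formatter from the patch, printing the comma-separated flag names.
// Build assumption: g++ -std=c++17 demo.cc -lfmt
#include <string>
#include <fmt/format.h>

struct scrub_flags_t {
  bool auto_repair{false};
  bool check_repair{false};
  bool deep_scrub_on_error{false};
};

namespace fmt {
template <>
struct formatter<scrub_flags_t> {
  constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); }
  template <typename FormatContext>
  auto format(const scrub_flags_t& sf, FormatContext& ctx) const
  {
    std::string txt;
    bool sep{false};
    if (sf.auto_repair) {
      txt = "auto-repair";
      sep = true;
    }
    if (sf.check_repair) {
      txt += sep ? ",check-repair" : "check-repair";
      sep = true;
    }
    if (sf.deep_scrub_on_error) {
      txt += sep ? ",deep-scrub-on-error" : "deep-scrub-on-error";
      sep = true;
    }
    return fmt::format_to(ctx.out(), "{}", txt);
  }
};
}  // namespace fmt

int main()
{
  scrub_flags_t sf;
  sf.auto_repair = true;
  sf.deep_scrub_on_error = true;
  // prints "auto-repair,deep-scrub-on-error"
  fmt::print("{}\n", sf);
}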
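
Note: the patch also starts caching frequently-read option values (osd_scrub_chunk_max and friends) in md_config_cacher_t<> members instead of looking them up on every use. The sketch below is not Ceph's md_config_cacher_t; it only illustrates the underlying idea - an observer keeps an atomic copy of one option so the hot path does a single relaxed load. All names here (toy_config, cached_option) are invented for the illustration.

// Illustration only - not Ceph's implementation. Register as an observer for
// a single option, keep the latest value in an atomic, and let hot paths read
// the cached copy instead of querying the configuration store.
#include <atomic>
#include <cstdint>
#include <functional>
#include <string>
#include <unordered_map>

// a stand-in for the cluster configuration store (hypothetical)
class toy_config {
 public:
  using observer_t = std::function<void(int64_t)>;
  void set(const std::string& k, int64_t v) {
    values[k] = v;
    if (auto it = observers.find(k); it != observers.end()) it->second(v);
  }
  int64_t get(const std::string& k) const { return values.at(k); }
  void watch(const std::string& k, observer_t cb) { observers[k] = std::move(cb); }
 private:
  std::unordered_map<std::string, int64_t> values;
  std::unordered_map<std::string, observer_t> observers;
};

// the cached-option wrapper: reads are a single relaxed atomic load
class cached_option {
 public:
  cached_option(toy_config& conf, const std::string& name)
      : value{conf.get(name)} {
    conf.watch(name, [this](int64_t v) { value.store(v, std::memory_order_relaxed); });
  }
  operator int64_t() const { return value.load(std::memory_order_relaxed); }
 private:
  std::atomic<int64_t> value;
};

int main() {
  toy_config conf;
  conf.set("osd_scrub_chunk_max", 25);
  cached_option osd_scrub_chunk_max{conf, "osd_scrub_chunk_max"};
  int64_t chunk = osd_scrub_chunk_max;  // cheap read on the scrub path
  conf.set("osd_scrub_chunk_max", 15);  // observer refreshes the cache
  chunk = osd_scrub_chunk_max;          // now sees 15
  return chunk == 15 ? 0 : 1;
}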
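
Note: requeue_penalized() now takes the two-value enum class delay_both_targets_t rather than a bare bool. A hypothetical call-site comparison (signature shape only, not the real scrubber code) shows the readability gain:

// Hypothetical sketch (not Ceph code): a two-value enum class keeps the
// caller's intent visible and cannot be swapped with other boolean-ish
// arguments by accident.
#include <cstdio>

enum class delay_both_targets_t { no, yes };
enum class scrub_level_t { shallow, deep };

// stand-in for the scrubber method (shape of the signature only)
void requeue_penalized(scrub_level_t s_or_d, delay_both_targets_t delay_both) {
  std::printf("delaying %s target(s), starting from the %s target\n",
              delay_both == delay_both_targets_t::yes ? "both" : "one",
              s_or_d == scrub_level_t::deep ? "deep" : "shallow");
}

int main() {
  // reads as a sentence at the call site; a bare 'true' would not
  requeue_penalized(scrub_level_t::deep, delay_both_targets_t::yes);
}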