Diffstat (limited to 'src/crimson/osd/pg_recovery.cc')
-rw-r--r-- | src/crimson/osd/pg_recovery.cc | 98
1 file changed, 76 insertions, 22 deletions
diff --git a/src/crimson/osd/pg_recovery.cc b/src/crimson/osd/pg_recovery.cc
index 09c93ed341e..5eef584c776 100644
--- a/src/crimson/osd/pg_recovery.cc
+++ b/src/crimson/osd/pg_recovery.cc
@@ -24,29 +24,33 @@ namespace {
 
 using std::map;
 using std::set;
+using PglogBasedRecovery = crimson::osd::PglogBasedRecovery;
 
 void PGRecovery::start_pglogbased_recovery()
 {
-  using PglogBasedRecovery = crimson::osd::PglogBasedRecovery;
-  (void) pg->get_shard_services().start_operation<PglogBasedRecovery>(
+  auto [op, fut] = pg->get_shard_services().start_operation<PglogBasedRecovery>(
     static_cast<crimson::osd::PG*>(pg),
     pg->get_shard_services(),
     pg->get_osdmap_epoch(),
     float(0.001));
+  pg->set_pglog_based_recovery_op(op.get());
 }
 
 PGRecovery::interruptible_future<bool>
 PGRecovery::start_recovery_ops(
   RecoveryBackend::RecoveryBlockingEvent::TriggerI& trigger,
+  PglogBasedRecovery &recover_op,
   size_t max_to_start)
 {
   assert(pg->is_primary());
   assert(pg->is_peered());
-  if (!pg->is_recovering() && !pg->is_backfilling()) {
-    logger().debug("recovery raced and were queued twice, ignoring!");
+  if (pg->has_reset_since(recover_op.get_epoch_started()) ||
+      recover_op.is_cancelled()) {
+    logger().debug("recovery {} cancelled.", recover_op);
     return seastar::make_ready_future<bool>(false);
   }
+  ceph_assert(pg->is_recovering());
 
   // in ceph-osd the do_recovery() path handles both the pg log-based
   // recovery and the backfill, albeit they are separated at the layer
@@ -63,17 +67,18 @@ PGRecovery::start_recovery_ops(
   if (max_to_start > 0) {
     max_to_start -= start_replica_recovery_ops(trigger, max_to_start, &started);
   }
-  using interruptor =
-    crimson::interruptible::interruptor<crimson::osd::IOInterruptCondition>;
   return interruptor::parallel_for_each(started,
                                         [] (auto&& ifut) {
     return std::move(ifut);
-  }).then_interruptible([this] {
+  }).then_interruptible([this, &recover_op] {
     //TODO: maybe we should implement a recovery race interruptor in the future
-    if (!pg->is_recovering() && !pg->is_backfilling()) {
-      logger().debug("recovery raced and were queued twice, ignoring!");
+    if (pg->has_reset_since(recover_op.get_epoch_started()) ||
+        recover_op.is_cancelled()) {
+      logger().debug("recovery {} cancelled.", recover_op);
       return seastar::make_ready_future<bool>(false);
     }
+    ceph_assert(pg->is_recovering());
+    ceph_assert(!pg->is_backfilling());
 
     bool done = !pg->get_peering_state().needs_recovery();
     if (done) {
@@ -101,6 +106,7 @@ PGRecovery::start_recovery_ops(
           pg->get_osdmap_epoch(),
           PeeringState::RequestBackfill{});
       }
+      pg->reset_pglog_based_recovery_op();
     }
     return seastar::make_ready_future<bool>(!done);
   });
@@ -520,10 +526,12 @@ void PGRecovery::request_primary_scan(
 
 void PGRecovery::enqueue_push(
   const hobject_t& obj,
-  const eversion_t& v)
+  const eversion_t& v,
+  const std::vector<pg_shard_t> &peers)
 {
-  logger().info("{}: obj={} v={}",
-                __func__, obj, v);
+  logger().info("{}: obj={} v={} peers={}", __func__, obj, v, peers);
+  auto &peering_state = pg->get_peering_state();
+  peering_state.prepare_backfill_for_missing(obj, v, peers);
   auto [recovering, added] = pg->get_recovery_backend()->add_recovering(obj);
   if (!added)
     return;
@@ -599,8 +607,26 @@ void PGRecovery::update_peers_last_backfill(
 
 bool PGRecovery::budget_available() const
 {
-  // TODO: the limits!
-  return true;
+  crimson::osd::scheduler::params_t params =
+    {1, 0, crimson::osd::scheduler::scheduler_class_t::background_best_effort};
+  auto &ss = pg->get_shard_services();
+  auto futopt = ss.try_acquire_throttle_now(std::move(params));
+  if (!futopt) {
+    return true;
+  }
+  std::ignore = interruptor::make_interruptible(std::move(*futopt)
+  ).then_interruptible([this] {
+    assert(!backfill_state->is_triggered());
+    using BackfillState = crimson::osd::BackfillState;
+    backfill_state->process_event(
+      BackfillState::ThrottleAcquired{}.intrusive_from_this());
+  });
+  return false;
+}
+
+void PGRecovery::on_pg_clean()
+{
+  backfill_state.reset();
 }
 
 void PGRecovery::backfilled()
@@ -615,24 +641,38 @@ void PGRecovery::backfilled()
     PeeringState::Backfilled{});
 }
 
+void PGRecovery::backfill_cancelled()
+{
+  // We are not creating a new BackfillRecovery request here, as we
+  // need to cancel the backfill synchronously (before this method returns).
+  using BackfillState = crimson::osd::BackfillState;
+  backfill_state->process_event(
+    BackfillState::CancelBackfill{}.intrusive_from_this());
+}
+
 void PGRecovery::dispatch_backfill_event(
   boost::intrusive_ptr<const boost::statechart::event_base> evt)
 {
   logger().debug("{}", __func__);
+  assert(backfill_state);
   backfill_state->process_event(evt);
+  // TODO: Do we need to worry about cases in which the pg has
+  // been through both backfill cancellations and backfill
+  // restarts between the sendings and replies of
+  // ReplicaScan/ObjectPush requests? Seems classic OSDs
+  // doesn't handle these cases.
+}
+
+void PGRecovery::on_activate_complete()
+{
+  logger().debug("{} backfill_state={}",
+                 __func__, fmt::ptr(backfill_state.get()));
+  backfill_state.reset();
 }
 
 void PGRecovery::on_backfill_reserved()
 {
   logger().debug("{}", __func__);
-  // PIMP and depedency injection for the sake unittestability.
-  // I'm not afraid about the performance here.
-  using BackfillState = crimson::osd::BackfillState;
-  backfill_state = std::make_unique<BackfillState>(
-    *this,
-    std::make_unique<crimson::osd::PeeringFacade>(pg->get_peering_state()),
-    std::make_unique<crimson::osd::PGFacade>(
-      *static_cast<crimson::osd::PG*>(pg)));
   // yes, it's **not** backfilling yet. The PG_STATE_BACKFILLING
   // will be set after on_backfill_reserved() returns.
   // Backfill needs to take this into consideration when scheduling
@@ -640,5 +680,19 @@ void PGRecovery::on_backfill_reserved()
   // instances. Otherwise the execution might begin without having
   // the state updated.
   ceph_assert(!pg->get_peering_state().is_backfilling());
+  // let's be lazy with creating the backfill stuff
+  using BackfillState = crimson::osd::BackfillState;
+  if (!backfill_state) {
+    // PIMP and depedency injection for the sake of unittestability.
+    // I'm not afraid about the performance here.
+    backfill_state = std::make_unique<BackfillState>(
+      *this,
+      std::make_unique<crimson::osd::PeeringFacade>(pg->get_peering_state()),
+      std::make_unique<crimson::osd::PGFacade>(
+        *static_cast<crimson::osd::PG*>(pg)));
+  }
+  // it may be we either start a completely new backfill (first
+  // event since last on_activate_complete()) or to resume already
+  // (but stopped one).
   start_backfill_recovery(BackfillState::Triggered{});
 }
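The guard added to start_recovery_ops() replaces the old "is_recovering/is_backfilling" heuristic with an explicit check against the in-flight PglogBasedRecovery operation: recovery bails out if the PG has reset since the op started or if the op was cancelled. Below is a minimal, hedged sketch of that control flow in plain C++; PG, PglogBasedRecoveryOp, epoch_t and has_reset_since() here are simplified stand-ins for illustration, not the real crimson::osd classes.

#include <cstdint>
#include <iostream>

using epoch_t = std::uint32_t;

struct PG {
  epoch_t last_reset_epoch = 0;
  // True when the PG went through peering again after the given epoch,
  // which invalidates any recovery operation started before the reset.
  bool has_reset_since(epoch_t epoch) const { return last_reset_epoch > epoch; }
};

struct PglogBasedRecoveryOp {
  epoch_t epoch_started;
  bool cancelled = false;
  epoch_t get_epoch_started() const { return epoch_started; }
  bool is_cancelled() const { return cancelled; }
};

// Mirrors the new guard: the op itself is consulted instead of PG state
// flags, so a stale or cancelled recovery returns early without starting work.
bool start_recovery_ops(const PG& pg, const PglogBasedRecoveryOp& op) {
  if (pg.has_reset_since(op.get_epoch_started()) || op.is_cancelled()) {
    std::cout << "recovery cancelled\n";
    return false;  // nothing started
  }
  // ... start primary/replica recovery ops here ...
  return true;
}

int main() {
  PG pg;
  PglogBasedRecoveryOp op{/*epoch_started=*/10};
  std::cout << start_recovery_ops(pg, op) << '\n';  // 1: op still valid

  pg.last_reset_epoch = 12;                         // PG reset after the op began
  std::cout << start_recovery_ops(pg, op) << '\n';  // 0: guard trips
}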
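The rewritten budget_available() gates further backfill work on the OSD scheduler throttle: if a slot is free right now it reports budget immediately, otherwise it parks a continuation that feeds ThrottleAcquired into the backfill state machine and reports no budget. The following is a rough sketch of that same pattern using standard C++ in place of seastar futures and the crimson scheduler; Throttle, BackfillMachine and try_acquire_now() are illustrative names only, not the actual ShardServices API.

#include <deque>
#include <functional>
#include <iostream>
#include <optional>

class Throttle {
  unsigned available;
  std::deque<std::function<void()>> waiters;  // continuations to run on release
public:
  explicit Throttle(unsigned slots) : available(slots) {}

  // Returns std::nullopt when a slot is free right now (no waiting needed);
  // otherwise returns a registration hook the caller can use to be notified
  // once a slot frees up -- a rough analogue of try_acquire_throttle_now().
  std::optional<std::function<void(std::function<void()>)>> try_acquire_now() {
    if (available > 0) {
      --available;
      return std::nullopt;
    }
    return [this](std::function<void()> cont) {
      waiters.push_back(std::move(cont));
    };
  }

  void release() {
    if (!waiters.empty()) {
      auto cont = std::move(waiters.front());
      waiters.pop_front();
      cont();  // hand the freed slot straight to the next waiter
    } else {
      ++available;
    }
  }
};

struct BackfillMachine {
  void process_throttle_acquired() {
    std::cout << "ThrottleAcquired: backfill resumes\n";
  }
};

// Mirrors the control flow of the patched budget_available(): grant the budget
// synchronously if the throttle has room, otherwise park a continuation that
// wakes the backfill state machine later and report "no budget" for now.
bool budget_available(Throttle& throttle, BackfillMachine& backfill) {
  auto waiter = throttle.try_acquire_now();
  if (!waiter) {
    return true;   // acquired immediately, keep going
  }
  (*waiter)([&backfill] { backfill.process_throttle_acquired(); });
  return false;    // backfill pauses until the throttle fires
}

int main() {
  Throttle throttle{1};
  BackfillMachine backfill;
  std::cout << budget_available(throttle, backfill) << '\n';  // 1: slot free
  std::cout << budget_available(throttle, backfill) << '\n';  // 0: waiter parked
  throttle.release();  // wakes the parked waiter -> "ThrottleAcquired"
}

The design point the sketch tries to capture is that the caller never blocks: the budget check either succeeds synchronously or defers the resumption to the throttle, which in the patch is expressed with an interruptible future chained onto the acquired throttle slot.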