src/osd/scrubber/scrub_reservations.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195

// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
#pragma once

#include <cassert>
#include <chrono>
#include <optional>
#include <string_view>
#include <vector>

#include "messages/MOSDScrubReserve.h"
#include "osd/scrubber_common.h"

#include "osd_scrub_sched.h"
#include "scrub_machine_lstnr.h"

namespace Scrub {

/// shorthand for the nonce type defined by MOSDScrubReserve. The nonce is
/// attached to reservation requests, and is used to match the replicas'
/// responses to the request that triggered them.
using reservation_nonce_t = MOSDScrubReserve::reservation_nonce_t;

/**
 * Reserving/freeing scrub resources at the replicas.
 *
 * When constructed - sends reservation requests to the acting_set OSDs, one
 * by one.
 * Once a replica's OSD replies with a 'grant'ed reservation, we send a
 * reservation request to the next replica.
 * A rejection triggers a "couldn't acquire the replicas' scrub resources"
 * event. All granted reservations are released.
 *
 * Reserved replicas should be released at the end of the scrub session. The
 * one exception is if the scrub terminates upon an interval change. In that
 * scenario - the replicas discard their reservations of their own accord
 * when noticing the change in interval, and there is no need (and no
 * guaranteed way) to send them the release message.
 *
 * Timeouts:
 *
 *  Slow-Secondary Warning:
 *  Warn if a replica takes more than <conf> milliseconds to reply to a
 *  reservation request. Only one warning is issued per session.
 *
 *  Reservation Timeout:
 *  We limit the total time we wait for the replicas to respond to the
 *  reservation request. If the reservation back-and-forth does not complete
 *  within <conf> milliseconds, we give up and release all the reservations
 *  that have been acquired until that moment.
 *  (Why? because we have encountered instances where a reservation request was
 *  lost - either due to a bug or due to a network issue.)
 *
 * Keeping primary & replica in sync:
 *
 * Reservation requests may be canceled by the primary independently of the
 * replica's response. Depending on timing, a cancellation by the primary might
 * or might not be processed by a replica prior to sending a response (either
 * rejection or success).  Thus, we associate each reservation request with a
 * nonce incremented with each reservation during an interval and drop any
 * responses that do not match our current nonce.
 * This check occurs after rejecting any messages from prior intervals, so
 * reusing nonces between intervals is not a problem.  Note that epoch would
 * not suffice as it is possible for this sequence to occur several times
 * without a new map epoch.
 * Note - 'release' messages, which are not replied to by the replica,
 * do not need or use that field.
 */
class ReplicaReservations {
  /// the scrubber on behalf of which the replicas are being reserved
  ScrubMachineListener& m_scrubber;

  /// NOTE(review): presumably the PG being scrubbed, as obtained from
  /// m_scrubber - confirm against the constructor's implementation
  PG* m_pg;

  /// shorthand for m_scrubber.get_spgid().pgid
  const pg_t m_pgid;

  /// for dout, and for queueing messages to the FSM
  OSDService* m_osds;

  /// the acting set (not including myself), sorted by pg_shard_t
  std::vector<pg_shard_t> m_sorted_secondaries;

  /// the next replica to which we will send a reservation request
  /// (presumably an iterator into m_sorted_secondaries - TODO confirm)
  std::vector<pg_shard_t>::const_iterator m_next_to_request;

  /// for logs, and for detecting slow peers
  ScrubTimePoint m_last_request_sent_at;

  /**
   * A ref to PrimaryActive::last_request_sent_nonce.
   * Identifies a specific request sent, to verify against grant/deny
   * responses.
   * See PrimaryActive::last_request_sent_nonce for details.
   */
  reservation_nonce_t& m_last_request_sent_nonce;

  /// access to the performance counters container relevant to this scrub's
  /// parameters
  PerfCounters& m_perf_set;

  /// used only for the 'duration of the reservation process' perf counter.
  /// discarded once the success or the failure is recorded
  std::optional<ScrubTimePoint> m_process_started_at;

 public:
  /**
   * \param scrubber the scrubber for which the replicas are reserved
   * \param nonce the 'last request sent' nonce. Taken by reference
   *   (presumably bound to m_last_request_sent_nonce - TODO confirm), thus
   *   the referenced object must outlive this ReplicaReservations object.
   * \param pc the performance counters container to update
   *   (presumably bound to m_perf_set - TODO confirm)
   */
  ReplicaReservations(
      ScrubMachineListener& scrubber,
      reservation_nonce_t& nonce,
      PerfCounters& pc);

  /// NOTE(review): presumably releases whatever replica reservations are
  /// still held (see release_all()) - confirm against the implementation
  ~ReplicaReservations();

  /**
   * The OK received from the replica (after verifying that it is indeed
   * the replica we are expecting a reply from) is noted, and triggers
   * one of two: either sending a reservation request to the next replica,
   * or notifying the scrubber that we have reserved them all.
   *
   * \returns true if there are no more replicas to send reservation requests
   * (i.e., the scrubber should proceed to the next phase), false otherwise.
   */
  bool handle_reserve_grant(const MOSDScrubReserve& msg, pg_shard_t from);

  /**
   * React to an incoming reservation rejection.
   *
   * Verify that the sender of the received rejection is the replica we
   * were expecting a reply from, and that the message isn't stale (see
   * m_last_request_sent_nonce for details).
   * If a valid rejection: log it, and mark the fact that the specific peer
   * need not be released.
   *
   * Note - the actual handling of scrub session termination and of
   * releasing the reserved replicas is done by the caller (the FSM).
   *
   * \returns true if the rejection is valid, false otherwise.
   */
  bool handle_reserve_rejection(const MOSDScrubReserve& msg, pg_shard_t from);

  /**
   * Notifies implementation that it is no longer responsible for releasing
   * tracked remote reservations.
   *
   * The intended usage is upon interval change.  In general, replicas are
   * responsible for releasing their own resources upon interval change without
   * coordination from the primary.
   *
   * Sends no messages.
   */
  void discard_remote_reservations();

  /// the only replica we are expecting a reply from
  /// (presumably std::nullopt if there is no outstanding request - TODO
  /// confirm)
  std::optional<pg_shard_t> get_last_sent() const;

  /**
   * if the start time is still set, i.e. we have not yet marked
   * this as a success or a failure - log its duration as that of a failure.
   *
   * \param failure_cause_counter NOTE(review): presumably the index of the
   *   perf counter matching the specific cause of the failure - confirm
   *   against the callers.
   */
  void log_failure_and_duration(int failure_cause_counter);

  // note: 'public', as accessed via the 'standard' dout_prefix() macro
  std::ostream& gen_prefix(std::ostream& out, std::string fn) const;

  /// The number of requests that have been sent (and not rejected) so far.
  size_t active_requests_cnt() const;

 private:
  /// send 'release' messages to all replicas we have managed to reserve
  void release_all();

  /**
   * Send a reservation request to the next replica.
   * - if there are no more replicas to send requests to, return true
   */
  bool send_next_reservation_or_complete();

  /**
   * is this a reply to our last request?
   * Checks the response's nonce against m_last_request_sent_nonce. See
   * m_last_request_sent_nonce for details.
   */
  bool is_reservation_response_relevant(reservation_nonce_t msg_nonce) const;

  /**
   * is this reply coming from the expected replica?
   * Now that we check the nonce before checking the sender - this
   * check should never fail.
   */
  bool is_msg_source_correct(pg_shard_t from) const;

  // ---   perf counters helpers

  /**
   * log the duration of the reservation process as that of a success.
   */
  void log_success_and_duration();
};

} // namespace Scrub