summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorKefu Chai <kchai@redhat.com>2019-06-20 11:30:17 +0200
committerGitHub <noreply@github.com>2019-06-20 11:30:17 +0200
commitf8102f5c433d47e093823905b28d32939503d56a (patch)
tree3178f8b7ca804d560cddbf7f4fe90c72a86bb179
parentMerge pull request #28564 from ceph/store_statfs-log (diff)
parentmgr/progress: Add recovery event when OSD marked in (diff)
downloadceph-f8102f5c433d47e093823905b28d32939503d56a.tar.xz
ceph-f8102f5c433d47e093823905b28d32939503d56a.zip
Merge pull request #28498 from kamoltat/wip-mgr-progress-add-event-osd-marked-in
mgr/progress: Add recovery event when OSD marked in Reviewed-by: Josh Durgin <jdurgin@redhat.com>
-rw-r--r--src/pybind/mgr/progress/module.py100
1 files changed, 52 insertions, 48 deletions
diff --git a/src/pybind/mgr/progress/module.py b/src/pybind/mgr/progress/module.py
index e992e0f26eb..dea603702bd 100644
--- a/src/pybind/mgr/progress/module.py
+++ b/src/pybind/mgr/progress/module.py
@@ -147,8 +147,9 @@ class PgRecoveryEvent(Event):
def pg_update(self, pg_dump, log):
# FIXME: O(pg_num) in python
# FIXME: far more fields getting pythonized than we really care about
+ # Sanity check to see if there are any missing PGs and to assign
+ # empty array and dictionary if there hasn't been any recovery
pg_to_state = dict([(p['pgid'], p) for p in pg_dump['pg_stats']])
-
if self._original_bytes_recovered is None:
self._original_bytes_recovered = {}
missing_pgs = []
@@ -190,10 +191,7 @@ class PgRecoveryEvent(Event):
states = state.split("+")
- unmoved = bool(set(self._evacuate_osds) & (
- set(info['up']) | set(info['acting'])))
-
- if "active" in states and "clean" in states and not unmoved:
+ if "active" in states and "clean" in states:
complete.add(pg)
else:
if info['stat_sum']['num_bytes'] == 0:
@@ -207,7 +205,6 @@ class PgRecoveryEvent(Event):
ratio = float(recovered -
self._original_bytes_recovered[pg]) / \
total_bytes
-
# Since the recovered bytes (over time) could perhaps
# exceed the contents of the PG (moment in time), we
# must clamp this
@@ -217,7 +214,6 @@ class PgRecoveryEvent(Event):
# Dataless PGs (e.g. containing only OMAPs) count
# as half done.
ratio = 0.5
-
complete_accumulate += ratio
self._pgs = list(set(self._pgs) ^ complete)
@@ -305,40 +301,50 @@ class Module(MgrModule):
self.get_module_option(opt['name']))
self.log.debug(' %s = %s', opt['name'], getattr(self, opt['name']))
- def _osd_out(self, old_map, old_dump, new_map, osd_id):
+ def _osd_in_out(self, old_map, old_dump, new_map, osd_id, marked):
+ # A function that will create or complete an event when an
+ # OSD is marked in or out according to the affected PGs
affected_pgs = []
unmoved_pgs = []
for pool in old_dump['pools']:
pool_id = pool['pool']
for ps in range(0, pool['pg_num']):
- up_acting = old_map.pg_to_up_acting_osds(pool['pool'], ps)
- # Was this OSD affected by the OSD going out?
- old_osds = set(up_acting['up']) | set(up_acting['acting'])
- was_on_out_osd = osd_id in old_osds
- if not was_on_out_osd:
- continue
+ # Was this OSD affected by the OSD coming in/out?
+ # Compare old and new osds using
+ # data from the json dump
+ old_up_acting = old_map.pg_to_up_acting_osds(pool['pool'], ps)
+ old_osds = set(old_up_acting['acting'])
+ new_up_acting = new_map.pg_to_up_acting_osds(pool['pool'], ps)
+ new_osds = set(new_up_acting['acting'])
+
+ # Check the osd_id being in the acting set for both old
+ # and new maps to cover both out and in cases
+ was_on_out_or_in_osd = osd_id in old_osds or osd_id in new_osds
+ if not was_on_out_or_in_osd:
+ continue
+
self.log.debug("pool_id, ps = {0}, {1}".format(
pool_id, ps
))
self.log.debug(
- "up_acting: {0}".format(json.dumps(up_acting, indent=2)))
-
- new_up_acting = new_map.pg_to_up_acting_osds(pool['pool'], ps)
- new_osds = set(new_up_acting['up']) | set(new_up_acting['acting'])
+ "old_up_acting: {0}".format(json.dumps(old_up_acting, indent=2)))
# Has this OSD been assigned a new location?
# (it might not be if there is no suitable place to move
- # after an OSD failure)
- is_relocated = len(new_osds - old_osds) > 0
+ # after an OSD is marked in/out)
+ if marked == "in":
+ is_relocated = len(old_osds - new_osds) > 0
+ else:
+ is_relocated = len(new_osds - old_osds) > 0
self.log.debug(
"new_up_acting: {0}".format(json.dumps(new_up_acting,
indent=2)))
-
- if was_on_out_osd and is_relocated:
+
+ if was_on_out_or_in_osd and is_relocated:
# This PG is now in motion, track its progress
affected_pgs.append(PgId(pool_id, ps))
elif not is_relocated:
@@ -351,30 +357,28 @@ class Module(MgrModule):
self.log.warn("{0} PGs were on osd.{1}, but didn't get new locations".format(
len(unmoved_pgs), osd_id))
- self.log.warn("{0} PGs affected by osd.{1} going out".format(
- len(affected_pgs), osd_id))
-
- if len(affected_pgs) == 0:
- # Don't emit events if there were no PGs
- return
-
- # TODO: reconcile with existing events referring to this OSD going out
- ev = PgRecoveryEvent(
- "Rebalancing after osd.{0} marked out".format(osd_id),
- refs=[("osd", osd_id)],
- which_pgs=affected_pgs,
- evacuate_osds=[osd_id]
- )
- ev.pg_update(self.get("pg_dump"), self.log)
- self._events[ev.id] = ev
-
- def _osd_in(self, osd_id):
- for ev_id, ev in self._events.items():
- if isinstance(ev, PgRecoveryEvent) and osd_id in ev.evacuating_osds:
- self.log.info("osd.{0} came back in, cancelling event".format(
- osd_id
- ))
- self._complete(ev)
+ self.log.warn("{0} PGs affected by osd.{1} being marked {2}".format(
+ len(affected_pgs), osd_id, marked))
+
+ if len(affected_pgs) > 0:
+ ev = PgRecoveryEvent(
+ "Rebalancing after osd.{0} marked {1}".format(osd_id, marked),
+ refs=[("osd", osd_id)],
+ which_pgs=affected_pgs,
+ evacuate_osds=[osd_id]
+ )
+ ev.pg_update(self.get("pg_dump"), self.log)
+ self._events[ev.id] = ev
+
+ # In the case of the osd coming back in, we might need to cancel
+ # previous recovery event for that osd
+ if marked == "in":
+ for ev_id, ev in self._events.items():
+ if isinstance(ev, PgRecoveryEvent) and osd_id in ev.evacuating_osds:
+ self.log.info("osd.{0} came back in, cancelling event".format(
+ osd_id
+ ))
+ self._complete(ev)
def _osdmap_changed(self, old_osdmap, new_osdmap):
old_dump = old_osdmap.dump()
@@ -390,13 +394,13 @@ class Module(MgrModule):
if new_weight == 0.0 and old_weight > new_weight:
self.log.warn("osd.{0} marked out".format(osd_id))
- self._osd_out(old_osdmap, old_dump, new_osdmap, osd_id)
+ self._osd_in_out(old_osdmap, old_dump, new_osdmap, osd_id, "out")
elif new_weight >= 1.0 and old_weight == 0.0:
# Only consider weight>=1.0 as "in" to avoid spawning
# individual recovery events on every adjustment
# in a gradual weight-in
self.log.warn("osd.{0} marked in".format(osd_id))
- self._osd_in(osd_id)
+ self._osd_in_out(old_osdmap, old_dump, new_osdmap, osd_id, "in")
def notify(self, notify_type, notify_data):
self._ready.wait()