diff options
author | Kefu Chai <kchai@redhat.com> | 2019-06-20 11:30:17 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2019-06-20 11:30:17 +0200 |
commit | f8102f5c433d47e093823905b28d32939503d56a (patch) | |
tree | 3178f8b7ca804d560cddbf7f4fe90c72a86bb179 | |
parent | Merge pull request #28564 from ceph/store_statfs-log (diff) | |
parent | mgr/progress: Add recovery event when OSD marked in (diff) | |
download | ceph-f8102f5c433d47e093823905b28d32939503d56a.tar.xz ceph-f8102f5c433d47e093823905b28d32939503d56a.zip |
Merge pull request #28498 from kamoltat/wip-mgr-progress-add-event-osd-marked-in
mgr/progress: Add recovery event when OSD marked in
Reviewed-by: Josh Durgin <jdurgin@redhat.com>
-rw-r--r-- | src/pybind/mgr/progress/module.py | 100 |
1 files changed, 52 insertions, 48 deletions
diff --git a/src/pybind/mgr/progress/module.py b/src/pybind/mgr/progress/module.py index e992e0f26eb..dea603702bd 100644 --- a/src/pybind/mgr/progress/module.py +++ b/src/pybind/mgr/progress/module.py @@ -147,8 +147,9 @@ class PgRecoveryEvent(Event): def pg_update(self, pg_dump, log): # FIXME: O(pg_num) in python # FIXME: far more fields getting pythonized than we really care about + # Sanity check to see if there are any missing PGs and to assign + # empty array and dictionary if there hasn't been any recovery pg_to_state = dict([(p['pgid'], p) for p in pg_dump['pg_stats']]) - if self._original_bytes_recovered is None: self._original_bytes_recovered = {} missing_pgs = [] @@ -190,10 +191,7 @@ class PgRecoveryEvent(Event): states = state.split("+") - unmoved = bool(set(self._evacuate_osds) & ( - set(info['up']) | set(info['acting']))) - - if "active" in states and "clean" in states and not unmoved: + if "active" in states and "clean" in states: complete.add(pg) else: if info['stat_sum']['num_bytes'] == 0: @@ -207,7 +205,6 @@ class PgRecoveryEvent(Event): ratio = float(recovered - self._original_bytes_recovered[pg]) / \ total_bytes - # Since the recovered bytes (over time) could perhaps # exceed the contents of the PG (moment in time), we # must clamp this @@ -217,7 +214,6 @@ class PgRecoveryEvent(Event): # Dataless PGs (e.g. containing only OMAPs) count # as half done. ratio = 0.5 - complete_accumulate += ratio self._pgs = list(set(self._pgs) ^ complete) @@ -305,40 +301,50 @@ class Module(MgrModule): self.get_module_option(opt['name'])) self.log.debug(' %s = %s', opt['name'], getattr(self, opt['name'])) - def _osd_out(self, old_map, old_dump, new_map, osd_id): + def _osd_in_out(self, old_map, old_dump, new_map, osd_id, marked): + # A function that will create or complete an event when an + # OSD is marked in or out according to the affected PGs affected_pgs = [] unmoved_pgs = [] for pool in old_dump['pools']: pool_id = pool['pool'] for ps in range(0, pool['pg_num']): - up_acting = old_map.pg_to_up_acting_osds(pool['pool'], ps) - # Was this OSD affected by the OSD going out? - old_osds = set(up_acting['up']) | set(up_acting['acting']) - was_on_out_osd = osd_id in old_osds - if not was_on_out_osd: - continue + # Was this OSD affected by the OSD coming in/out? + # Compare old and new osds using + # data from the json dump + old_up_acting = old_map.pg_to_up_acting_osds(pool['pool'], ps) + old_osds = set(old_up_acting['acting']) + new_up_acting = new_map.pg_to_up_acting_osds(pool['pool'], ps) + new_osds = set(new_up_acting['acting']) + + # Check the osd_id being in the acting set for both old + # and new maps to cover both out and in cases + was_on_out_or_in_osd = osd_id in old_osds or osd_id in new_osds + if not was_on_out_or_in_osd: + continue + self.log.debug("pool_id, ps = {0}, {1}".format( pool_id, ps )) self.log.debug( - "up_acting: {0}".format(json.dumps(up_acting, indent=2))) - - new_up_acting = new_map.pg_to_up_acting_osds(pool['pool'], ps) - new_osds = set(new_up_acting['up']) | set(new_up_acting['acting']) + "old_up_acting: {0}".format(json.dumps(old_up_acting, indent=2))) # Has this OSD been assigned a new location? # (it might not be if there is no suitable place to move - # after an OSD failure) - is_relocated = len(new_osds - old_osds) > 0 + # after an OSD is marked in/out) + if marked == "in": + is_relocated = len(old_osds - new_osds) > 0 + else: + is_relocated = len(new_osds - old_osds) > 0 self.log.debug( "new_up_acting: {0}".format(json.dumps(new_up_acting, indent=2))) - - if was_on_out_osd and is_relocated: + + if was_on_out_or_in_osd and is_relocated: # This PG is now in motion, track its progress affected_pgs.append(PgId(pool_id, ps)) elif not is_relocated: @@ -351,30 +357,28 @@ class Module(MgrModule): self.log.warn("{0} PGs were on osd.{1}, but didn't get new locations".format( len(unmoved_pgs), osd_id)) - self.log.warn("{0} PGs affected by osd.{1} going out".format( - len(affected_pgs), osd_id)) - - if len(affected_pgs) == 0: - # Don't emit events if there were no PGs - return - - # TODO: reconcile with existing events referring to this OSD going out - ev = PgRecoveryEvent( - "Rebalancing after osd.{0} marked out".format(osd_id), - refs=[("osd", osd_id)], - which_pgs=affected_pgs, - evacuate_osds=[osd_id] - ) - ev.pg_update(self.get("pg_dump"), self.log) - self._events[ev.id] = ev - - def _osd_in(self, osd_id): - for ev_id, ev in self._events.items(): - if isinstance(ev, PgRecoveryEvent) and osd_id in ev.evacuating_osds: - self.log.info("osd.{0} came back in, cancelling event".format( - osd_id - )) - self._complete(ev) + self.log.warn("{0} PGs affected by osd.{1} being marked {2}".format( + len(affected_pgs), osd_id, marked)) + + if len(affected_pgs) > 0: + ev = PgRecoveryEvent( + "Rebalancing after osd.{0} marked {1}".format(osd_id, marked), + refs=[("osd", osd_id)], + which_pgs=affected_pgs, + evacuate_osds=[osd_id] + ) + ev.pg_update(self.get("pg_dump"), self.log) + self._events[ev.id] = ev + + # In the case of the osd coming back in, we might need to cancel + # previous recovery event for that osd + if marked == "in": + for ev_id, ev in self._events.items(): + if isinstance(ev, PgRecoveryEvent) and osd_id in ev.evacuating_osds: + self.log.info("osd.{0} came back in, cancelling event".format( + osd_id + )) + self._complete(ev) def _osdmap_changed(self, old_osdmap, new_osdmap): old_dump = old_osdmap.dump() @@ -390,13 +394,13 @@ class Module(MgrModule): if new_weight == 0.0 and old_weight > new_weight: self.log.warn("osd.{0} marked out".format(osd_id)) - self._osd_out(old_osdmap, old_dump, new_osdmap, osd_id) + self._osd_in_out(old_osdmap, old_dump, new_osdmap, osd_id, "out") elif new_weight >= 1.0 and old_weight == 0.0: # Only consider weight>=1.0 as "in" to avoid spawning # individual recovery events on every adjustment # in a gradual weight-in self.log.warn("osd.{0} marked in".format(osd_id)) - self._osd_in(osd_id) + self._osd_in_out(old_osdmap, old_dump, new_osdmap, osd_id, "in") def notify(self, notify_type, notify_data): self._ready.wait() |