diff options
author | Milind Changire <mchangir@redhat.com> | 2024-01-11 13:51:31 +0100 |
---|---|---|
committer | Milind Changire <mchangir@redhat.com> | 2024-11-04 14:15:10 +0100 |
commit | 4d2af0c57839418b135e2ef45778fde77e12199a (patch) | |
tree | 96fcbc8758e0ad4e93e14f90e6a351674ab87c1f /src | |
parent | Merge pull request #59486 from ceph/ceph-ec-benchmark-increase-default-buffer... (diff) | |
download | ceph-4d2af0c57839418b135e2ef45778fde77e12199a.tar.xz ceph-4d2af0c57839418b135e2ef45778fde77e12199a.zip |
mgr/snap_schedule: handle volume delete
Remove references to timers and db connection that refer to paths in the
deleted volume.
This is a best case effort and not 100% safe. There can be race
conditions where the db connection is closed while the module is
executing a query on the db while the volume itself is deleted.
Hopefully there won't be many tracebacks seen.
Fixes: https://tracker.ceph.com/issues/63999
Signed-off-by: Milind Changire <mchangir@redhat.com>
Diffstat (limited to 'src')
-rw-r--r-- | src/pybind/mgr/snap_schedule/fs/schedule_client.py | 24 | ||||
-rw-r--r-- | src/pybind/mgr/snap_schedule/module.py | 19 |
2 files changed, 42 insertions, 1 deletions
diff --git a/src/pybind/mgr/snap_schedule/fs/schedule_client.py b/src/pybind/mgr/snap_schedule/fs/schedule_client.py index b58f20f1275..12e5e980737 100644 --- a/src/pybind/mgr/snap_schedule/fs/schedule_client.py +++ b/src/pybind/mgr/snap_schedule/fs/schedule_client.py @@ -163,6 +163,7 @@ class SnapSchedClient(CephfsClient): self.sqlite_connections: Dict[str, DBInfo] = {} self.active_timers: Dict[Tuple[str, str], List[Timer]] = {} self.conn_lock: Lock = Lock() # lock to protect add/lookup db connections + self.timers_lock: Lock = Lock() # restart old schedules for fs_name in self.get_all_filesystems(): @@ -273,6 +274,27 @@ class SnapSchedClient(CephfsClient): if self._is_allowed_repeat(r, path)][0:1] return rows + def delete_references_to_unavailable_fs(self, available_fs_names: Set[str]) -> None: + fs_to_remove: Set[str] = set() + self.timers_lock.acquire() + for fs, path in list(self.active_timers.keys()): # each key is a tuple + if fs not in available_fs_names: + fs_to_remove.add(fs) + log.debug(f'Cancelled timers for "{fs}:{path}"') + for t in self.active_timers[(fs, path)]: + t.cancel() + log.debug(f'Removed timer instance for "{fs}"') + del self.active_timers[(fs, path)] + self.timers_lock.release() + + self.conn_lock.acquire() + for fs in fs_to_remove: + log.debug(f'Closed DB connection to "{fs}"') + self.sqlite_connections[fs].db.close() + log.debug(f'Removed DB connection to "{fs}"') + del self.sqlite_connections[fs] + self.conn_lock.release() + def refresh_snap_timers(self, fs: str, path: str, olddb: Optional[sqlite3.Connection] = None) -> None: try: log.debug((f'SnapDB on {fs} changed for {path}, ' @@ -286,6 +308,7 @@ class SnapSchedClient(CephfsClient): with self.get_schedule_db(fs) as conn_mgr: db = conn_mgr.dbinfo.db rows = self.fetch_schedules(db, path) + self.timers_lock.acquire() timers = self.active_timers.get((fs, path), []) for timer in timers: timer.cancel() @@ -299,6 +322,7 @@ class SnapSchedClient(CephfsClient): timers.append(t) log.debug(f'Will snapshot {path} in fs {fs} in {row[1]}s') self.active_timers[(fs, path)] = timers + self.timers_lock.release() except Exception: self._log_exception('refresh_snap_timers') diff --git a/src/pybind/mgr/snap_schedule/module.py b/src/pybind/mgr/snap_schedule/module.py index d8f04a62b94..adf982448b1 100644 --- a/src/pybind/mgr/snap_schedule/module.py +++ b/src/pybind/mgr/snap_schedule/module.py @@ -8,12 +8,14 @@ import json import sqlite3 from typing import Any, Dict, Optional, Tuple, Union from .fs.schedule_client import SnapSchedClient -from mgr_module import MgrModule, CLIReadCommand, CLIWriteCommand, Option +from mgr_module import MgrModule, CLIReadCommand, CLIWriteCommand, Option, NotifyType from mgr_util import CephfsConnectionException from threading import Event class Module(MgrModule): + NOTIFY_TYPES = [NotifyType.fs_map] + MODULE_OPTIONS = [ Option( 'allow_m_granularity', @@ -37,6 +39,21 @@ class Module(MgrModule): self._initialized = Event() self.client = SnapSchedClient(self) + def notify(self, notify_type: NotifyType, notify_id: str) -> None: + if notify_type != NotifyType.fs_map: + return + fs_map = self.get('fs_map') + if not fs_map: + return + + # we don't know for which fs config has been changed + fs_names = set() + for fs in fs_map['filesystems']: + fs_name = fs['mdsmap']['fs_name'] + fs_names.add(fs_name) + + self.client.delete_references_to_unavailable_fs(fs_names) + def _subvolume_exist(self, fs: str, subvol: Union[str, None], group: Union[str, None]) -> bool: rc, subvolumes, err = self.remote('volumes', 'subvolume_ls', fs, group) if rc == 0: |