mgr/snap_schedule: handle volume delete

Remove references to timers and DB connections that refer to paths in the deleted volume. This is a best-effort cleanup and is not 100% safe: there can be race conditions in which the DB connection is closed while the module is still executing a query on it, because the volume was deleted in the meantime. Hopefully not many tracebacks will be seen. Fixes: https://tracker.ceph.com/issues/63999 Signed-off-by: Milind Changire <mchangir@redhat.com>
author: Milind Changire <mchangir@redhat.com> 2024-01-11 13:51:31 +0100
committer: Milind Changire <mchangir@redhat.com> 2024-11-04 14:15:10 +0100
commit: 4d2af0c57839418b135e2ef45778fde77e12199a (patch)
tree: 96fcbc8758e0ad4e93e14f90e6a351674ab87c1f /src
parent: Merge pull request #59486 from ceph/ceph-ec-benchmark-increase-default-buffer... (diff)
download: ceph-4d2af0c57839418b135e2ef45778fde77e12199a.tar.xz
ceph-4d2af0c57839418b135e2ef45778fde77e12199a.zip
2 files changed, 42 insertions, 1 deletions
diff --git a/src/pybind/mgr/snap_schedule/fs/schedule_client.py b/src/pybind/mgr/snap_schedule/fs/schedule_client.py
index b58f20f1275..12e5e980737 100644
--- a/src/pybind/mgr/snap_schedule/fs/schedule_client.py
+++ b/src/pybind/mgr/snap_schedule/fs/schedule_client.py
@@ -163,6 +163,7 @@ class SnapSchedClient(CephfsClient):
         self.sqlite_connections: Dict[str, DBInfo] = {}
         self.active_timers: Dict[Tuple[str, str], List[Timer]] = {}
         self.conn_lock: Lock = Lock()  # lock to protect add/lookup db connections
+        self.timers_lock: Lock = Lock()
 
         # restart old schedules
         for fs_name in self.get_all_filesystems():
@@ -273,6 +274,27 @@ class SnapSchedClient(CephfsClient):
                     if self._is_allowed_repeat(r, path)][0:1]
             return rows
 
+    def delete_references_to_unavailable_fs(self, available_fs_names: Set[str]) -> None:
+        """
+        Drop timers and DB connections of file systems that no longer exist.
+
+        Every active timer whose file system is not listed in
+        ``available_fs_names`` is cancelled and forgotten, then the DB
+        connection cached for each such file system is closed and forgotten.
+
+        :param available_fs_names: names of the file systems that still exist
+        """
+        fs_to_remove: Set[str] = set()
+        # 'with' guarantees the lock is released even if a step below raises;
+        # a lock that stays held would block every later caller
+        with self.timers_lock:
+            # iterate over a snapshot since entries are removed while looping
+            for fs, path in list(self.active_timers):  # each key is a tuple
+                if fs in available_fs_names:
+                    continue
+                fs_to_remove.add(fs)
+                for t in self.active_timers.pop((fs, path)):
+                    t.cancel()
+                # log only after the timers have really been cancelled and removed
+                log.debug(f'Cancelled timers for "{fs}:{path}"')
+                log.debug(f'Removed timer instance for "{fs}"')
+
+        with self.conn_lock:
+            for fs in fs_to_remove:
+                # tolerate a missing entry instead of raising KeyError
+                # while the lock is held
+                dbinfo = self.sqlite_connections.pop(fs, None)
+                if dbinfo is None:
+                    continue
+                dbinfo.db.close()
+                log.debug(f'Closed DB connection to "{fs}"')
+                log.debug(f'Removed DB connection to "{fs}"')
+
     def refresh_snap_timers(self, fs: str, path: str, olddb: Optional[sqlite3.Connection] = None) -> None:
         try:
             log.debug((f'SnapDB on {fs} changed for {path}, '
@@ -286,6 +308,7 @@ class SnapSchedClient(CephfsClient):
                 with self.get_schedule_db(fs) as conn_mgr:
                     db = conn_mgr.dbinfo.db
                     rows = self.fetch_schedules(db, path)
+            self.timers_lock.acquire()
             timers = self.active_timers.get((fs, path), [])
             for timer in timers:
                 timer.cancel()
@@ -299,6 +322,7 @@ class SnapSchedClient(CephfsClient):
                 timers.append(t)
                 log.debug(f'Will snapshot {path} in fs {fs} in {row[1]}s')
             self.active_timers[(fs, path)] = timers
+            self.timers_lock.release()
         except Exception:
             self._log_exception('refresh_snap_timers')
 
diff --git a/src/pybind/mgr/snap_schedule/module.py b/src/pybind/mgr/snap_schedule/module.py
index d8f04a62b94..adf982448b1 100644
--- a/src/pybind/mgr/snap_schedule/module.py
+++ b/src/pybind/mgr/snap_schedule/module.py
@@ -8,12 +8,14 @@ import json
 import sqlite3
 from typing import Any, Dict, Optional, Tuple, Union
 from .fs.schedule_client import SnapSchedClient
-from mgr_module import MgrModule, CLIReadCommand, CLIWriteCommand, Option
+from mgr_module import MgrModule, CLIReadCommand, CLIWriteCommand, Option, NotifyType
 from mgr_util import CephfsConnectionException
 from threading import Event
 
 
 class Module(MgrModule):
+    NOTIFY_TYPES = [NotifyType.fs_map]
+
     MODULE_OPTIONS = [
         Option(
             'allow_m_granularity',
@@ -37,6 +39,21 @@ class Module(MgrModule):
         self._initialized = Event()
         self.client = SnapSchedClient(self)
 
+    def notify(self, notify_type: NotifyType, notify_id: str) -> None:
+        """
+        Handle an fs_map notification; every other notification is ignored.
+
+        The notification does not tell which file system changed, so the
+        names of all file systems present in the current fs_map are gathered
+        and passed to the client, which drops what it holds for the rest.
+        """
+        if notify_type == NotifyType.fs_map:
+            current_map = self.get('fs_map')
+            if current_map:
+                present = {entry['mdsmap']['fs_name']
+                           for entry in current_map['filesystems']}
+                self.client.delete_references_to_unavailable_fs(present)
+
+
     def _subvolume_exist(self, fs: str, subvol: Union[str, None], group: Union[str, None]) -> bool:
         rc, subvolumes, err = self.remote('volumes', 'subvolume_ls', fs, group)
         if rc == 0:
author	Milind Changire <mchangir@redhat.com>	2024-01-11 13:51:31 +0100
committer	Milind Changire <mchangir@redhat.com>	2024-11-04 14:15:10 +0100
commit	4d2af0c57839418b135e2ef45778fde77e12199a (patch)
tree	96fcbc8758e0ad4e93e14f90e6a351674ab87c1f /src
parent	Merge pull request #59486 from ceph/ceph-ec-benchmark-increase-default-buffer... (diff)
download	ceph-4d2af0c57839418b135e2ef45778fde77e12199a.tar.xz ceph-4d2af0c57839418b135e2ef45778fde77e12199a.zip